Skip to main content

oak_protobuf/lexer/
mod.rs

1use crate::{kind::ProtobufSyntaxKind, language::ProtobufLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::LexOutput,
5    source::{Source, TextEdit},
6};
7
/// Shorthand for the shared incremental lexer state, specialised to Protobuf.
type State<'a, S> = LexerState<'a, S, ProtobufLanguage>;
9
/// Hand-written lexer for Protocol Buffer (`.proto`) sources.
///
/// Produces a flat token stream (whitespace, newlines, comments, literals,
/// keywords, identifiers, punctuation) terminated by an EOF token.
#[derive(Clone)]
pub struct ProtobufLexer<'config> {
    // NOTE(review): the language configuration is stored but not yet read by
    // any lexing rule (hence the leading underscore); kept for future use.
    _config: &'config ProtobufLanguage,
}
14
15impl<'config> ProtobufLexer<'config> {
16    pub fn new(config: &'config ProtobufLanguage) -> Self {
17        Self { _config: config }
18    }
19
20    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
21        while state.not_at_end() {
22            let safe_point = state.get_position();
23
24            if self.skip_whitespace(state) {
25                continue;
26            }
27
28            if self.lex_newline(state) {
29                continue;
30            }
31
32            if self.lex_comment(state) {
33                continue;
34            }
35
36            if self.lex_string_literal(state) {
37                continue;
38            }
39
40            if self.lex_number_literal(state) {
41                continue;
42            }
43
44            if self.lex_identifier_or_keyword(state) {
45                continue;
46            }
47
48            if self.lex_operators_and_delimiters(state) {
49                continue;
50            }
51
52            // 如果没有匹配任何规则,跳过当前字符
53            if let Some(ch) = state.peek() {
54                let start_pos = state.get_position();
55                state.advance(ch.len_utf8());
56                state.add_token(ProtobufSyntaxKind::Error, start_pos, state.get_position());
57            }
58            else {
59                // 如果已到达文件末尾,退出循环
60                break;
61            }
62
63            state.advance_if_dead_lock(safe_point);
64        }
65
66        // Add EOF token
67        let pos = state.get_position();
68        state.add_token(ProtobufSyntaxKind::Eof, pos, pos);
69
70        Ok(())
71    }
72
73    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
74        let start_pos = state.get_position();
75
76        while let Some(ch) = state.peek() {
77            if ch == ' ' || ch == '\t' {
78                state.advance(ch.len_utf8());
79            }
80            else {
81                break;
82            }
83        }
84
85        if state.get_position() > start_pos {
86            state.add_token(ProtobufSyntaxKind::Whitespace, start_pos, state.get_position());
87            true
88        }
89        else {
90            false
91        }
92    }
93
94    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
95        let start_pos = state.get_position();
96
97        if let Some('\n') = state.peek() {
98            state.advance(1);
99            state.add_token(ProtobufSyntaxKind::Newline, start_pos, state.get_position());
100            true
101        }
102        else if let Some('\r') = state.peek() {
103            state.advance(1);
104            if let Some('\n') = state.peek() {
105                state.advance(1);
106            }
107            state.add_token(ProtobufSyntaxKind::Newline, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    fn lex_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
116        let start_pos = state.get_position();
117
118        if let Some('/') = state.peek() {
119            state.advance(1);
120            if let Some('/') = state.peek() {
121                state.advance(1);
122                // 单行注释
123                while let Some(ch) = state.peek() {
124                    if ch == '\n' || ch == '\r' {
125                        break;
126                    }
127                    state.advance(ch.len_utf8());
128                }
129                state.add_token(ProtobufSyntaxKind::Comment, start_pos, state.get_position());
130                true
131            }
132            else if let Some('*') = state.peek() {
133                state.advance(1);
134                // 多行注释 /* ... */
135                while let Some(ch) = state.peek() {
136                    if ch == '*' {
137                        state.advance(1);
138                        if let Some('/') = state.peek() {
139                            state.advance(1);
140                            break;
141                        }
142                    }
143                    else {
144                        state.advance(ch.len_utf8());
145                    }
146                }
147                state.add_token(ProtobufSyntaxKind::Comment, start_pos, state.get_position());
148                true
149            }
150            else {
151                // 回退,这不是注释
152                state.set_position(start_pos);
153                false
154            }
155        }
156        else {
157            false
158        }
159    }
160
161    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
162        let start_pos = state.get_position();
163
164        if let Some(quote_char) = state.peek() {
165            if quote_char == '"' || quote_char == '\'' {
166                state.advance(1); // 跳过开始引号
167
168                let mut escaped = false;
169                while let Some(ch) = state.peek() {
170                    if escaped {
171                        escaped = false;
172                        state.advance(ch.len_utf8());
173                    }
174                    else if ch == '\\' {
175                        escaped = true;
176                        state.advance(1);
177                    }
178                    else if ch == quote_char {
179                        state.advance(1); // 跳过结束引号
180                        break;
181                    }
182                    else if ch == '\n' || ch == '\r' {
183                        // 字符串不能跨行
184                        break;
185                    }
186                    else {
187                        state.advance(ch.len_utf8());
188                    }
189                }
190
191                state.add_token(ProtobufSyntaxKind::StringLiteral, start_pos, state.get_position());
192                true
193            }
194            else {
195                false
196            }
197        }
198        else {
199            false
200        }
201    }
202
203    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
204        if let Some(ch) = state.peek() {
205            if ch.is_ascii_digit() || (ch == '-' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
206                let start_pos = state.get_position();
207
208                // 处理负号
209                if ch == '-' {
210                    state.advance(1);
211                }
212
213                // 读取整数部分
214                while let Some(ch) = state.peek() {
215                    if ch.is_ascii_digit() {
216                        state.advance(1);
217                    }
218                    else {
219                        break;
220                    }
221                }
222
223                // 检查小数点
224                if let Some('.') = state.peek() {
225                    if state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
226                        state.advance(1);
227                        // 读取小数部分
228                        while let Some(ch) = state.peek() {
229                            if ch.is_ascii_digit() {
230                                state.advance(1);
231                            }
232                            else {
233                                break;
234                            }
235                        }
236                    }
237                }
238
239                // 检查科学记数法
240                if let Some(ch) = state.peek() {
241                    if ch == 'e' || ch == 'E' {
242                        state.advance(1);
243                        if let Some(ch) = state.peek() {
244                            if ch == '+' || ch == '-' {
245                                state.advance(1);
246                            }
247                        }
248                        while let Some(ch) = state.peek() {
249                            if ch.is_ascii_digit() {
250                                state.advance(1);
251                            }
252                            else {
253                                break;
254                            }
255                        }
256                    }
257                }
258
259                state.add_token(ProtobufSyntaxKind::NumberLiteral, start_pos, state.get_position());
260                true
261            }
262            else {
263                false
264            }
265        }
266        else {
267            false
268        }
269    }
270
271    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
272        if let Some(ch) = state.peek() {
273            if ch.is_ascii_alphabetic() || ch == '_' {
274                let start_pos = state.get_position();
275                let mut text = String::new();
276
277                // 读取标识符
278                while let Some(ch) = state.peek() {
279                    if ch.is_alphanumeric() || ch == '_' {
280                        text.push(ch);
281                        state.advance(ch.len_utf8());
282                    }
283                    else {
284                        break;
285                    }
286                }
287
288                // 检查是否是关键字
289                let kind = match text.as_str() {
290                    "kind" => ProtobufSyntaxKind::Syntax,
291                    "package" => ProtobufSyntaxKind::Package,
292                    "import" => ProtobufSyntaxKind::Import,
293                    "option" => ProtobufSyntaxKind::Option,
294                    "message" => ProtobufSyntaxKind::Message,
295                    "enum" => ProtobufSyntaxKind::Enum,
296                    "service" => ProtobufSyntaxKind::Service,
297                    "rpc" => ProtobufSyntaxKind::Rpc,
298                    "returns" => ProtobufSyntaxKind::Returns,
299                    "stream" => ProtobufSyntaxKind::Stream,
300                    "repeated" => ProtobufSyntaxKind::Repeated,
301                    "optional" => ProtobufSyntaxKind::Optional,
302                    "required" => ProtobufSyntaxKind::Required,
303                    "oneof" => ProtobufSyntaxKind::Oneof,
304                    "map" => ProtobufSyntaxKind::Map,
305                    "reserved" => ProtobufSyntaxKind::Reserved,
306                    "extensions" => ProtobufSyntaxKind::Extensions,
307                    "extend" => ProtobufSyntaxKind::Extend,
308                    "group" => ProtobufSyntaxKind::Group,
309                    "public" => ProtobufSyntaxKind::Public,
310                    "weak" => ProtobufSyntaxKind::Weak,
311                    // 数据类型
312                    "double" => ProtobufSyntaxKind::Double,
313                    "float" => ProtobufSyntaxKind::Float,
314                    "int32" => ProtobufSyntaxKind::Int32,
315                    "int64" => ProtobufSyntaxKind::Int64,
316                    "uint32" => ProtobufSyntaxKind::Uint32,
317                    "uint64" => ProtobufSyntaxKind::Uint64,
318                    "sint32" => ProtobufSyntaxKind::Sint32,
319                    "sint64" => ProtobufSyntaxKind::Sint64,
320                    "fixed32" => ProtobufSyntaxKind::Fixed32,
321                    "fixed64" => ProtobufSyntaxKind::Fixed64,
322                    "sfixed32" => ProtobufSyntaxKind::Sfixed32,
323                    "sfixed64" => ProtobufSyntaxKind::Sfixed64,
324                    "bool" => ProtobufSyntaxKind::Bool,
325                    "string" => ProtobufSyntaxKind::String,
326                    "bytes" => ProtobufSyntaxKind::Bytes,
327                    // 布尔字面量
328                    "true" | "false" => ProtobufSyntaxKind::BooleanLiteral,
329                    _ => ProtobufSyntaxKind::Identifier,
330                };
331
332                state.add_token(kind, start_pos, state.get_position());
333                true
334            }
335            else {
336                false
337            }
338        }
339        else {
340            false
341        }
342    }
343
344    fn lex_operators_and_delimiters<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
345        if let Some(ch) = state.peek() {
346            let start_pos = state.get_position();
347
348            let kind = match ch {
349                '=' => {
350                    state.advance(1);
351                    ProtobufSyntaxKind::Assign
352                }
353                ';' => {
354                    state.advance(1);
355                    ProtobufSyntaxKind::Semicolon
356                }
357                ',' => {
358                    state.advance(1);
359                    ProtobufSyntaxKind::Comma
360                }
361                '.' => {
362                    state.advance(1);
363                    ProtobufSyntaxKind::Dot
364                }
365                '(' => {
366                    state.advance(1);
367                    ProtobufSyntaxKind::LeftParen
368                }
369                ')' => {
370                    state.advance(1);
371                    ProtobufSyntaxKind::RightParen
372                }
373                '[' => {
374                    state.advance(1);
375                    ProtobufSyntaxKind::LeftBracket
376                }
377                ']' => {
378                    state.advance(1);
379                    ProtobufSyntaxKind::RightBracket
380                }
381                '{' => {
382                    state.advance(1);
383                    ProtobufSyntaxKind::LeftBrace
384                }
385                '}' => {
386                    state.advance(1);
387                    ProtobufSyntaxKind::RightBrace
388                }
389                '<' => {
390                    state.advance(1);
391                    ProtobufSyntaxKind::LeftAngle
392                }
393                '>' => {
394                    state.advance(1);
395                    ProtobufSyntaxKind::RightAngle
396                }
397                _ => return false,
398            };
399
400            state.add_token(kind, start_pos, state.get_position());
401            true
402        }
403        else {
404            false
405        }
406    }
407}
408
409impl<'config> Lexer<ProtobufLanguage> for ProtobufLexer<'config> {
410    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<ProtobufLanguage>) -> LexOutput<ProtobufLanguage> {
411        let mut state = State::new(source);
412        let result = self.run(&mut state);
413        state.finish_with_cache(result, cache)
414    }
415}