oak_nginx/lexer/
mod.rs

1use crate::{kind::NginxSyntaxKind, language::NginxLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
3
/// Shorthand for the shared lexer state specialised to the nginx language.
type State<S> = LexerState<S, NginxLanguage>;
5
/// Tokenizer for nginx configuration files.
///
/// Holds only a borrowed language configuration, so cloning is cheap.
#[derive(Clone)]
pub struct NginxLexer<'config> {
    // Language configuration this lexer was created with; currently stored
    // for future use (no lexing rule below reads it).
    config: &'config NginxLanguage,
}
10
11impl<'config> NginxLexer<'config> {
12    pub fn new(config: &'config NginxLanguage) -> Self {
13        Self { config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.peek() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(NginxSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.advance(1);
44            state.add_token(NginxSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.advance(1);
49            if let Some('\n') = state.peek() {
50                state.advance(1);
51            }
52            state.add_token(NginxSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理注释
61    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
62        let start_pos = state.get_position();
63
64        if let Some('#') = state.peek() {
65            state.advance(1);
66
67            // 读取到行
68            while let Some(ch) = state.peek() {
69                if ch == '\n' || ch == '\r' {
70                    break;
71                }
72                state.advance(ch.len_utf8());
73            }
74
75            state.add_token(NginxSyntaxKind::CommentToken, start_pos, state.get_position());
76            true
77        }
78        else {
79            false
80        }
81    }
82
83    /// 处理字符
84    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
85        let start_pos = state.get_position();
86
87        if let Some(quote) = state.peek() {
88            if quote != '"' && quote != '\'' {
89                return false;
90            }
91
92            state.advance(1); // 跳过开始引
93            while let Some(ch) = state.peek() {
94                if ch == quote {
95                    state.advance(1); // 跳过结束引号
96                    break;
97                }
98                else if ch == '\\' {
99                    state.advance(1); // 跳过转义字符
100                    if state.peek().is_some() {
101                        state.advance(state.peek().unwrap().len_utf8());
102                    }
103                }
104                else {
105                    state.advance(ch.len_utf8());
106                }
107            }
108
109            state.add_token(NginxSyntaxKind::String, start_pos, state.get_position());
110            true
111        }
112        else {
113            false
114        }
115    }
116
117    /// 处理数字
118    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
119        let start_pos = state.get_position();
120
121        if let Some(ch) = state.peek() {
122            if !ch.is_ascii_digit() {
123                return false;
124            }
125
126            // 处理整数部分
127            while let Some(ch) = state.peek() {
128                if ch.is_ascii_digit() {
129                    state.advance(ch.len_utf8());
130                }
131                else {
132                    break;
133                }
134            }
135
136            // 处理小数
137            if let Some('.') = state.peek() {
138                if let Some(next_ch) = state.peek_next_n(1) {
139                    if next_ch.is_ascii_digit() {
140                        state.advance(1); // 跳过小数
141                        while let Some(ch) = state.peek() {
142                            if ch.is_ascii_digit() {
143                                state.advance(ch.len_utf8());
144                            }
145                            else {
146                                break;
147                            }
148                        }
149                    }
150                }
151            }
152
153            // 处理单位后缀 (k, m, g, s, ms, etc.)
154            if let Some(ch) = state.peek() {
155                if ch.is_ascii_alphabetic() {
156                    while let Some(ch) = state.peek() {
157                        if ch.is_ascii_alphabetic() {
158                            state.advance(ch.len_utf8());
159                        }
160                        else {
161                            break;
162                        }
163                    }
164                }
165            }
166
167            state.add_token(NginxSyntaxKind::Number, start_pos, state.get_position());
168            true
169        }
170        else {
171            false
172        }
173    }
174
175    /// 处理路径
176    fn lex_path<S: Source>(&self, state: &mut State<S>) -> bool {
177        let start_pos = state.get_position();
178
179        if let Some('/') = state.peek() {
180            state.advance(1);
181
182            while let Some(ch) = state.peek() {
183                if ch.is_ascii_alphanumeric() || ch == '/' || ch == '.' || ch == '-' || ch == '_' || ch == '*' {
184                    state.advance(ch.len_utf8());
185                }
186                else {
187                    break;
188                }
189            }
190
191            state.add_token(NginxSyntaxKind::Path, start_pos, state.get_position());
192            true
193        }
194        else {
195            false
196        }
197    }
198
199    /// 处理 URL
200    fn lex_url<S: Source>(&self, state: &mut State<S>) -> bool {
201        let start_pos = state.get_position();
202
203        // 检查是否以 http:// https:// 开
204        let text = state.get_text_from(state.get_position());
205        if text.starts_with("http://") || text.starts_with("https://") {
206            let scheme_len = if text.starts_with("https://") { 8 } else { 7 };
207            state.advance(scheme_len);
208
209            while let Some(ch) = state.peek() {
210                if ch.is_ascii_alphanumeric()
211                    || ch == '.'
212                    || ch == '/'
213                    || ch == ':'
214                    || ch == '-'
215                    || ch == '_'
216                    || ch == '?'
217                    || ch == '&'
218                    || ch == '='
219                {
220                    state.advance(ch.len_utf8());
221                }
222                else {
223                    break;
224                }
225            }
226
227            state.add_token(NginxSyntaxKind::Url, start_pos, state.get_position());
228            true
229        }
230        else {
231            false
232        }
233    }
234
235    /// 处理标识符和关键
236    fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
237        let start_pos = state.get_position();
238
239        if let Some(ch) = state.peek() {
240            if !ch.is_ascii_alphabetic() && ch != '_' {
241                return false;
242            }
243
244            // 收集标识符字
245            while let Some(ch) = state.peek() {
246                if ch.is_ascii_alphanumeric() || ch == '_' {
247                    state.advance(ch.len_utf8());
248                }
249                else {
250                    break;
251                }
252            }
253
254            // 检查是否是关键
255            let text = state.get_text_in((start_pos..state.get_position()).into());
256            let token_kind = match text {
257                "server" => NginxSyntaxKind::ServerKeyword,
258                "location" => NginxSyntaxKind::LocationKeyword,
259                "upstream" => NginxSyntaxKind::UpstreamKeyword,
260                "http" => NginxSyntaxKind::HttpKeyword,
261                "events" => NginxSyntaxKind::EventsKeyword,
262                "listen" => NginxSyntaxKind::ListenKeyword,
263                "server_name" => NginxSyntaxKind::ServerNameKeyword,
264                "root" => NginxSyntaxKind::RootKeyword,
265                "index" => NginxSyntaxKind::IndexKeyword,
266                "proxy_pass" => NginxSyntaxKind::ProxyPassKeyword,
267                _ => NginxSyntaxKind::Identifier,
268            };
269
270            state.add_token(token_kind, start_pos, state.get_position());
271            true
272        }
273        else {
274            false
275        }
276    }
277
278    /// 处理分隔
279    fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
280        let start_pos = state.get_position();
281
282        if let Some(ch) = state.peek() {
283            let token_kind = match ch {
284                '{' => NginxSyntaxKind::LeftBrace,
285                '}' => NginxSyntaxKind::RightBrace,
286                ';' => NginxSyntaxKind::Semicolon,
287                _ => return false,
288            };
289
290            state.advance(ch.len_utf8());
291            state.add_token(token_kind, start_pos, state.get_position());
292            true
293        }
294        else {
295            false
296        }
297    }
298}
299
300impl<'config> Lexer<NginxLanguage> for NginxLexer<'config> {
301    fn lex_incremental(
302        &self,
303        source: impl Source,
304        changed: usize,
305        cache: IncrementalCache<NginxLanguage>,
306    ) -> LexOutput<NginxLanguage> {
307        let mut state = LexerState::new_with_cache(source, changed, cache);
308
309        while state.not_at_end() {
310            // 尝试各种词法规则
311            if self.skip_whitespace(&mut state) {
312                continue;
313            }
314
315            if self.lex_newline(&mut state) {
316                continue;
317            }
318
319            if self.lex_comment(&mut state) {
320                continue;
321            }
322
323            if self.lex_string(&mut state) {
324                continue;
325            }
326
327            if self.lex_url(&mut state) {
328                continue;
329            }
330
331            if self.lex_path(&mut state) {
332                continue;
333            }
334
335            if self.lex_number(&mut state) {
336                continue;
337            }
338
339            if self.lex_identifier(&mut state) {
340                continue;
341            }
342
343            if self.lex_delimiter(&mut state) {
344                continue;
345            }
346
347            // 如果所有规则都不匹配,跳过当前字符并标记为错误
348            let start_pos = state.get_position();
349            if let Some(ch) = state.peek() {
350                state.advance(ch.len_utf8());
351                state.add_token(NginxSyntaxKind::Error, start_pos, state.get_position());
352            }
353        }
354
355        // 添加 EOF kind
356        let eof_pos = state.get_position();
357        state.add_token(NginxSyntaxKind::Eof, eof_pos, eof_pos);
358
359        state.finish(Ok(()))
360    }
361}