//! Lexer for nginx configuration files (`oak_nginx/lexer/mod.rs`).
1use crate::{kind::NginxSyntaxKind, language::NginxLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
3
4type State<'a, S> = LexerState<'a, S, NginxLanguage>;
5
6#[derive(Clone, Debug)]
7pub struct NginxLexer<'config> {
8    _config: &'config NginxLanguage,
9}
10
11impl<'config> NginxLexer<'config> {
12    pub fn new(config: &'config NginxLanguage) -> Self {
13        Self { _config: config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.peek() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(NginxSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.advance(1);
44            state.add_token(NginxSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.advance(1);
49            if let Some('\n') = state.peek() {
50                state.advance(1);
51            }
52            state.add_token(NginxSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理注释
61    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
62        let start_pos = state.get_position();
63
64        if let Some('#') = state.peek() {
65            state.advance(1);
66
67            // 读取到行
68            while let Some(ch) = state.peek() {
69                if ch == '\n' || ch == '\r' {
70                    break;
71                }
72                state.advance(ch.len_utf8());
73            }
74
75            state.add_token(NginxSyntaxKind::CommentToken, start_pos, state.get_position());
76            true
77        }
78        else {
79            false
80        }
81    }
82
83    /// 处理字符
84    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85        let start_pos = state.get_position();
86
87        if let Some(quote) = state.peek() {
88            if quote != '"' && quote != '\'' {
89                return false;
90            }
91
92            state.advance(1); // 跳过开始引
93            while let Some(ch) = state.peek() {
94                if ch == quote {
95                    state.advance(1); // 跳过结束引号
96                    break;
97                }
98                else if ch == '\\' {
99                    state.advance(1); // 跳过转义字符
100                    if let Some(c) = state.peek() {
101                        state.advance(c.len_utf8());
102                    }
103                }
104                else {
105                    state.advance(ch.len_utf8());
106                }
107            }
108
109            state.add_token(NginxSyntaxKind::String, start_pos, state.get_position());
110            true
111        }
112        else {
113            false
114        }
115    }
116
117    /// 处理数字
118    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
119        let start_pos = state.get_position();
120
121        if let Some(ch) = state.peek() {
122            if !ch.is_ascii_digit() {
123                return false;
124            }
125
126            // 处理整数部分
127            while let Some(ch) = state.peek() {
128                if ch.is_ascii_digit() {
129                    state.advance(ch.len_utf8());
130                }
131                else {
132                    break;
133                }
134            }
135
136            // 处理小数
137            if let Some('.') = state.peek() {
138                if let Some(next_ch) = state.peek_next_n(1) {
139                    if next_ch.is_ascii_digit() {
140                        state.advance(1); // 跳过小数
141                        while let Some(ch) = state.peek() {
142                            if ch.is_ascii_digit() {
143                                state.advance(ch.len_utf8());
144                            }
145                            else {
146                                break;
147                            }
148                        }
149                    }
150                }
151            }
152
153            // 处理单位后缀 (k, m, g, s, ms, etc.)
154            if let Some(ch) = state.peek() {
155                if ch.is_ascii_alphabetic() {
156                    while let Some(ch) = state.peek() {
157                        if ch.is_ascii_alphabetic() {
158                            state.advance(ch.len_utf8());
159                        }
160                        else {
161                            break;
162                        }
163                    }
164                }
165            }
166
167            state.add_token(NginxSyntaxKind::Number, start_pos, state.get_position());
168            true
169        }
170        else {
171            false
172        }
173    }
174
175    /// 处理路径
176    fn lex_path<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
177        let start_pos = state.get_position();
178
179        if let Some('/') = state.peek() {
180            state.advance(1);
181
182            while let Some(ch) = state.peek() {
183                if ch.is_ascii_alphanumeric() || ch == '/' || ch == '.' || ch == '-' || ch == '_' || ch == '*' {
184                    state.advance(ch.len_utf8());
185                }
186                else {
187                    break;
188                }
189            }
190
191            state.add_token(NginxSyntaxKind::Path, start_pos, state.get_position());
192            true
193        }
194        else {
195            false
196        }
197    }
198
199    /// 处理 URL
200    fn lex_url<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
201        let start_pos = state.get_position();
202
203        // 检查是否以 http:// https:// 开
204        if state.starts_with("http://") || state.starts_with("https://") {
205            let scheme_len = if state.starts_with("https://") { 8 } else { 7 };
206            state.advance(scheme_len);
207
208            while let Some(ch) = state.peek() {
209                if ch.is_ascii_alphanumeric() || ch == '.' || ch == '/' || ch == ':' || ch == '-' || ch == '_' || ch == '?' || ch == '&' || ch == '=' {
210                    state.advance(ch.len_utf8());
211                }
212                else {
213                    break;
214                }
215            }
216
217            state.add_token(NginxSyntaxKind::Url, start_pos, state.get_position());
218            true
219        }
220        else {
221            false
222        }
223    }
224
225    /// 处理标识符和关键
226    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
227        let start_pos = state.get_position();
228
229        if let Some(ch) = state.peek() {
230            if !ch.is_ascii_alphabetic() && ch != '_' {
231                return false;
232            }
233
234            // 收集标识符字
235            while let Some(ch) = state.peek() {
236                if ch.is_ascii_alphanumeric() || ch == '_' {
237                    state.advance(ch.len_utf8());
238                }
239                else {
240                    break;
241                }
242            }
243
244            // 检查是否是关键
245            let end_pos = state.get_position();
246            let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
247            let token_kind = match text.as_ref() {
248                "server" => NginxSyntaxKind::ServerKeyword,
249                "location" => NginxSyntaxKind::LocationKeyword,
250                "upstream" => NginxSyntaxKind::UpstreamKeyword,
251                "http" => NginxSyntaxKind::HttpKeyword,
252                "events" => NginxSyntaxKind::EventsKeyword,
253                "listen" => NginxSyntaxKind::ListenKeyword,
254                "server_name" => NginxSyntaxKind::ServerNameKeyword,
255                "root" => NginxSyntaxKind::RootKeyword,
256                "index" => NginxSyntaxKind::IndexKeyword,
257                "proxy_pass" => NginxSyntaxKind::ProxyPassKeyword,
258                _ => NginxSyntaxKind::Identifier,
259            };
260
261            state.add_token(token_kind, start_pos, end_pos);
262            true
263        }
264        else {
265            false
266        }
267    }
268
269    /// 处理分隔
270    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
271        let start_pos = state.get_position();
272
273        if let Some(ch) = state.peek() {
274            let token_kind = match ch {
275                '{' => NginxSyntaxKind::LeftBrace,
276                '}' => NginxSyntaxKind::RightBrace,
277                ';' => NginxSyntaxKind::Semicolon,
278                _ => return false,
279            };
280
281            state.advance(ch.len_utf8());
282            state.add_token(token_kind, start_pos, state.get_position());
283            true
284        }
285        else {
286            false
287        }
288    }
289
290    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
291        while state.not_at_end() {
292            let start_pos = state.get_position();
293
294            // 尝试各种词法规则
295            if self.skip_whitespace(state) {
296                continue;
297            }
298
299            if self.lex_newline(state) {
300                continue;
301            }
302
303            if self.lex_comment(state) {
304                continue;
305            }
306
307            if self.lex_string(state) {
308                continue;
309            }
310
311            if self.lex_url(state) {
312                continue;
313            }
314
315            if self.lex_path(state) {
316                continue;
317            }
318
319            if self.lex_number(state) {
320                continue;
321            }
322
323            if self.lex_identifier(state) {
324                continue;
325            }
326
327            if self.lex_delimiter(state) {
328                continue;
329            }
330
331            // 如果所有规则都不匹配,跳过当前字符并标记为错误
332            state.advance_if_dead_lock(start_pos);
333            if state.get_position() > start_pos {
334                state.add_token(NginxSyntaxKind::Error, start_pos, state.get_position());
335            }
336        }
337        Ok(())
338    }
339}
340
341impl<'config> Lexer<NginxLanguage> for NginxLexer<'config> {
342    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<NginxLanguage>) -> LexOutput<NginxLanguage> {
343        let mut state = LexerState::new(source);
344        let result = self.run(&mut state);
345        if result.is_ok() {
346            state.add_eof();
347        }
348        state.finish_with_cache(result, cache)
349    }
350}