// oak_nginx/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::NginxLanguage, lexer::token_type::NginxTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
6
/// Shorthand for the core lexer state specialized to the nginx language.
type State<'a, S> = LexerState<'a, S, NginxLanguage>;

/// Hand-written lexer for nginx configuration files.
#[derive(Clone, Debug)]
pub struct NginxLexer<'config> {
    // Borrowed language configuration; currently unused by the scanning rules.
    _config: &'config NginxLanguage,
}
13
14impl<'config> NginxLexer<'config> {
15    pub fn new(config: &'config NginxLanguage) -> Self {
16        Self { _config: config }
17    }
18
19    /// 跳过空白字符
20    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
21        let start_pos = state.get_position();
22
23        while let Some(ch) = state.peek() {
24            if ch == ' ' || ch == '\t' {
25                state.advance(ch.len_utf8());
26            }
27            else {
28                break;
29            }
30        }
31
32        if state.get_position() > start_pos {
33            state.add_token(NginxTokenType::Whitespace, start_pos, state.get_position());
34            true
35        }
36        else {
37            false
38        }
39    }
40
41    /// 处理换行
42    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
43        let start_pos = state.get_position();
44
45        if let Some('\n') = state.peek() {
46            state.advance(1);
47            state.add_token(NginxTokenType::Newline, start_pos, state.get_position());
48            true
49        }
50        else if let Some('\r') = state.peek() {
51            state.advance(1);
52            if let Some('\n') = state.peek() {
53                state.advance(1);
54            }
55            state.add_token(NginxTokenType::Newline, start_pos, state.get_position());
56            true
57        }
58        else {
59            false
60        }
61    }
62
63    /// 处理注释
64    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
65        let start_pos = state.get_position();
66
67        if let Some('#') = state.peek() {
68            state.advance(1);
69
70            // 读取到行
71            while let Some(ch) = state.peek() {
72                if ch == '\n' || ch == '\r' {
73                    break;
74                }
75                state.advance(ch.len_utf8());
76            }
77
78            state.add_token(NginxTokenType::CommentToken, start_pos, state.get_position());
79            true
80        }
81        else {
82            false
83        }
84    }
85
86    /// 处理字符
87    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start_pos = state.get_position();
89
90        if let Some(quote) = state.peek() {
91            if quote != '"' && quote != '\'' {
92                return false;
93            }
94
95            state.advance(1); // 跳过开始引
96            while let Some(ch) = state.peek() {
97                if ch == quote {
98                    state.advance(1); // 跳过结束引号
99                    break;
100                }
101                else if ch == '\\' {
102                    state.advance(1); // 跳过转义字符
103                    if let Some(c) = state.peek() {
104                        state.advance(c.len_utf8());
105                    }
106                }
107                else {
108                    state.advance(ch.len_utf8());
109                }
110            }
111
112            state.add_token(NginxTokenType::String, start_pos, state.get_position());
113            true
114        }
115        else {
116            false
117        }
118    }
119
120    /// 处理数字
121    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
122        let start_pos = state.get_position();
123
124        if let Some(ch) = state.peek() {
125            if !ch.is_ascii_digit() {
126                return false;
127            }
128
129            // 处理整数部分
130            while let Some(ch) = state.peek() {
131                if ch.is_ascii_digit() {
132                    state.advance(ch.len_utf8());
133                }
134                else {
135                    break;
136                }
137            }
138
139            // 处理小数
140            if let Some('.') = state.peek() {
141                if let Some(next_ch) = state.peek_next_n(1) {
142                    if next_ch.is_ascii_digit() {
143                        state.advance(1); // 跳过小数
144                        while let Some(ch) = state.peek() {
145                            if ch.is_ascii_digit() {
146                                state.advance(ch.len_utf8());
147                            }
148                            else {
149                                break;
150                            }
151                        }
152                    }
153                }
154            }
155
156            // 处理单位后缀 (k, m, g, s, ms, etc.)
157            if let Some(ch) = state.peek() {
158                if ch.is_ascii_alphabetic() {
159                    while let Some(ch) = state.peek() {
160                        if ch.is_ascii_alphabetic() {
161                            state.advance(ch.len_utf8());
162                        }
163                        else {
164                            break;
165                        }
166                    }
167                }
168            }
169
170            state.add_token(NginxTokenType::Number, start_pos, state.get_position());
171            true
172        }
173        else {
174            false
175        }
176    }
177
178    /// 处理路径
179    fn lex_path<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
180        let start_pos = state.get_position();
181
182        if let Some('/') = state.peek() {
183            state.advance(1);
184
185            while let Some(ch) = state.peek() {
186                if ch.is_ascii_alphanumeric() || ch == '/' || ch == '.' || ch == '-' || ch == '_' || ch == '*' {
187                    state.advance(ch.len_utf8());
188                }
189                else {
190                    break;
191                }
192            }
193
194            state.add_token(NginxTokenType::Path, start_pos, state.get_position());
195            true
196        }
197        else {
198            false
199        }
200    }
201
202    /// 处理 URL
203    fn lex_url<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
204        let start_pos = state.get_position();
205
206        // 检查是否以 http:// https:// 开
207        if state.starts_with("http://") || state.starts_with("https://") {
208            let scheme_len = if state.starts_with("https://") { 8 } else { 7 };
209            state.advance(scheme_len);
210
211            while let Some(ch) = state.peek() {
212                if ch.is_ascii_alphanumeric() || ch == '.' || ch == '/' || ch == ':' || ch == '-' || ch == '_' || ch == '?' || ch == '&' || ch == '=' {
213                    state.advance(ch.len_utf8());
214                }
215                else {
216                    break;
217                }
218            }
219
220            state.add_token(NginxTokenType::Url, start_pos, state.get_position());
221            true
222        }
223        else {
224            false
225        }
226    }
227
228    /// 处理标识符和关键
229    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
230        let start_pos = state.get_position();
231
232        if let Some(ch) = state.peek() {
233            if !ch.is_ascii_alphanumeric() && ch != '_' && ch != '$' {
234                return false;
235            }
236
237            // 收集标识符字
238            while let Some(ch) = state.peek() {
239                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
240                    state.advance(ch.len_utf8());
241                }
242                else {
243                    break;
244                }
245            }
246
247            // 检查是否是关键
248            let end_pos = state.get_position();
249            let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
250            let token_kind = match text.as_ref() {
251                "server" => NginxTokenType::ServerKeyword,
252                "location" => NginxTokenType::LocationKeyword,
253                "upstream" => NginxTokenType::UpstreamKeyword,
254                "http" => NginxTokenType::HttpKeyword,
255                "events" => NginxTokenType::EventsKeyword,
256                "listen" => NginxTokenType::ListenKeyword,
257                "server_name" => NginxTokenType::ServerNameKeyword,
258                "root" => NginxTokenType::RootKeyword,
259                "index" => NginxTokenType::IndexKeyword,
260                "proxy_pass" => NginxTokenType::ProxyPassKeyword,
261                _ => NginxTokenType::Identifier,
262            };
263
264            state.add_token(token_kind, start_pos, end_pos);
265            true
266        }
267        else {
268            false
269        }
270    }
271
272    /// 处理分隔
273    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
274        let start_pos = state.get_position();
275
276        if let Some(ch) = state.peek() {
277            let token_kind = match ch {
278                '{' => NginxTokenType::LeftBrace,
279                '}' => NginxTokenType::RightBrace,
280                ';' => NginxTokenType::Semicolon,
281                _ => return false,
282            };
283
284            state.advance(ch.len_utf8());
285            state.add_token(token_kind, start_pos, state.get_position());
286            true
287        }
288        else {
289            false
290        }
291    }
292
293    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
294        while state.not_at_end() {
295            let start_pos = state.get_position();
296
297            // 尝试各种词法规则
298            if self.skip_whitespace(state) {
299                continue;
300            }
301
302            if self.lex_newline(state) {
303                continue;
304            }
305
306            if self.lex_comment(state) {
307                continue;
308            }
309
310            if self.lex_string(state) {
311                continue;
312            }
313
314            if self.lex_url(state) {
315                continue;
316            }
317
318            if self.lex_path(state) {
319                continue;
320            }
321
322            if self.lex_number(state) {
323                continue;
324            }
325
326            if self.lex_identifier(state) {
327                continue;
328            }
329
330            if self.lex_delimiter(state) {
331                continue;
332            }
333
334            // 如果所有规则都不匹配,跳过当前字符并标记为错误
335            state.advance_if_dead_lock(start_pos);
336            if state.get_position() > start_pos {
337                state.add_token(NginxTokenType::Error, start_pos, state.get_position())
338            }
339        }
340        Ok(())
341    }
342}
343
344impl<'config> Lexer<NginxLanguage> for NginxLexer<'config> {
345    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<NginxLanguage>) -> LexOutput<NginxLanguage> {
346        let mut state = LexerState::new(source);
347        let result = self.run(&mut state);
348        if result.is_ok() {
349            state.add_eof()
350        }
351        state.finish_with_cache(result, cache)
352    }
353}