Skip to main content

oak_nginx/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the Nginx lexer.
3pub mod token_type;
4
5use crate::{language::NginxLanguage, lexer::token_type::NginxTokenType};
6use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
7
/// Crate-internal shorthand for the shared lexer state, specialized to [`NginxLanguage`].
pub(crate) type State<'a, S> = LexerState<'a, S, NginxLanguage>;
9
/// Lexer for Nginx configuration files.
#[derive(Clone, Debug)]
pub struct NginxLexer<'config> {
    // Borrowed language configuration. Stored by `new` but not read by any
    // scanning rule visible in this module — NOTE(review): confirm whether it
    // is consumed elsewhere or can be removed.
    config: &'config NginxLanguage,
}
15
16impl<'config> NginxLexer<'config> {
17    /// Creates a new Nginx lexer with the given configuration.
18    pub fn new(config: &'config NginxLanguage) -> Self {
19        Self { config }
20    }
21
22    /// Skips whitespace characters.
23    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
24        let start_pos = state.get_position();
25
26        while let Some(ch) = state.peek() {
27            if ch == ' ' || ch == '\t' {
28                state.advance(ch.len_utf8());
29            }
30            else {
31                break;
32            }
33        }
34
35        if state.get_position() > start_pos {
36            state.add_token(NginxTokenType::Whitespace, start_pos, state.get_position());
37            true
38        }
39        else {
40            false
41        }
42    }
43
44    /// Handles newline characters.
45    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
46        let start_pos = state.get_position();
47
48        if let Some('\n') = state.peek() {
49            state.advance(1);
50            state.add_token(NginxTokenType::Newline, start_pos, state.get_position());
51            true
52        }
53        else if let Some('\r') = state.peek() {
54            state.advance(1);
55            if let Some('\n') = state.peek() {
56                state.advance(1);
57            }
58            state.add_token(NginxTokenType::Newline, start_pos, state.get_position());
59            true
60        }
61        else {
62            false
63        }
64    }
65
66    /// Handles comments.
67    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
68        let start_pos = state.get_position();
69
70        if let Some('#') = state.peek() {
71            state.advance(1);
72
73            // Read until the end of the line
74            while let Some(ch) = state.peek() {
75                if ch == '\n' || ch == '\r' {
76                    break;
77                }
78                state.advance(ch.len_utf8());
79            }
80
81            state.add_token(NginxTokenType::CommentToken, start_pos, state.get_position());
82            true
83        }
84        else {
85            false
86        }
87    }
88
89    /// Handles strings.
90    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
91        let start_pos = state.get_position();
92
93        if let Some(quote) = state.peek() {
94            if quote != '"' && quote != '\'' {
95                return false;
96            }
97
98            state.advance(1); // Skip start quote
99            while let Some(ch) = state.peek() {
100                if ch == quote {
101                    state.advance(1); // Skip end quote
102                    break;
103                }
104                else if ch == '\\' {
105                    state.advance(1); // Skip escape character
106                    if let Some(c) = state.peek() {
107                        state.advance(c.len_utf8());
108                    }
109                }
110                else {
111                    state.advance(ch.len_utf8());
112                }
113            }
114
115            state.add_token(NginxTokenType::String, start_pos, state.get_position());
116            true
117        }
118        else {
119            false
120        }
121    }
122
123    /// Handles numbers.
124    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
125        let start_pos = state.get_position();
126
127        if let Some(ch) = state.peek() {
128            if !ch.is_ascii_digit() {
129                return false;
130            }
131
132            // Handle integer part
133            while let Some(ch) = state.peek() {
134                if ch.is_ascii_digit() {
135                    state.advance(ch.len_utf8());
136                }
137                else {
138                    break;
139                }
140            }
141
142            // Handle decimal part
143            if let Some('.') = state.peek() {
144                if let Some(next_ch) = state.peek_next_n(1) {
145                    if next_ch.is_ascii_digit() {
146                        state.advance(1); // Skip decimal point
147                        while let Some(ch) = state.peek() {
148                            if ch.is_ascii_digit() {
149                                state.advance(ch.len_utf8());
150                            }
151                            else {
152                                break;
153                            }
154                        }
155                    }
156                }
157            }
158
159            // Handle unit suffixes (k, m, g, s, ms, etc.)
160            if let Some(ch) = state.peek() {
161                if ch.is_ascii_alphabetic() {
162                    while let Some(ch) = state.peek() {
163                        if ch.is_ascii_alphabetic() {
164                            state.advance(ch.len_utf8());
165                        }
166                        else {
167                            break;
168                        }
169                    }
170                }
171            }
172
173            state.add_token(NginxTokenType::Number, start_pos, state.get_position());
174            true
175        }
176        else {
177            false
178        }
179    }
180
181    /// Handles paths.
182    fn lex_path<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
183        let start_pos = state.get_position();
184
185        if let Some('/') = state.peek() {
186            state.advance(1);
187
188            while let Some(ch) = state.peek() {
189                if ch.is_ascii_alphanumeric() || ch == '/' || ch == '.' || ch == '-' || ch == '_' || ch == '*' {
190                    state.advance(ch.len_utf8());
191                }
192                else {
193                    break;
194                }
195            }
196
197            state.add_token(NginxTokenType::Path, start_pos, state.get_position());
198            true
199        }
200        else {
201            false
202        }
203    }
204
205    /// Handles URLs.
206    fn lex_url<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
207        let start_pos = state.get_position();
208
209        // Check if starts with http:// or https://
210        if state.starts_with("http://") || state.starts_with("https://") {
211            let scheme_len = if state.starts_with("https://") { 8 } else { 7 };
212            state.advance(scheme_len);
213
214            while let Some(ch) = state.peek() {
215                if ch.is_ascii_alphanumeric() || ch == '.' || ch == '/' || ch == ':' || ch == '-' || ch == '_' || ch == '?' || ch == '&' || ch == '=' {
216                    state.advance(ch.len_utf8());
217                }
218                else {
219                    break;
220                }
221            }
222
223            state.add_token(NginxTokenType::Url, start_pos, state.get_position());
224            true
225        }
226        else {
227            false
228        }
229    }
230
231    /// Handles identifiers and keywords.
232    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
233        let start_pos = state.get_position();
234
235        if let Some(ch) = state.peek() {
236            if !ch.is_ascii_alphanumeric() && ch != '_' && ch != '$' {
237                return false;
238            }
239
240            // Collect identifier characters
241            while let Some(ch) = state.peek() {
242                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
243                    state.advance(ch.len_utf8());
244                }
245                else {
246                    break;
247                }
248            }
249
250            // Check if it's a keyword
251            let end_pos = state.get_position();
252            let text = state.source().get_text_in(oak_core::Range { start: start_pos, end: end_pos });
253            let token_kind = match text.as_ref() {
254                "server" => NginxTokenType::ServerKeyword,
255                "location" => NginxTokenType::LocationKeyword,
256                "upstream" => NginxTokenType::UpstreamKeyword,
257                "http" => NginxTokenType::HttpKeyword,
258                "events" => NginxTokenType::EventsKeyword,
259                "listen" => NginxTokenType::ListenKeyword,
260                "server_name" => NginxTokenType::ServerNameKeyword,
261                "root" => NginxTokenType::RootKeyword,
262                "index" => NginxTokenType::IndexKeyword,
263                "proxy_pass" => NginxTokenType::ProxyPassKeyword,
264                _ => NginxTokenType::Identifier,
265            };
266
267            state.add_token(token_kind, start_pos, end_pos);
268            true
269        }
270        else {
271            false
272        }
273    }
274
275    /// Handles delimiters.
276    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
277        let start_pos = state.get_position();
278
279        if let Some(ch) = state.peek() {
280            let token_kind = match ch {
281                '{' => NginxTokenType::LeftBrace,
282                '}' => NginxTokenType::RightBrace,
283                ';' => NginxTokenType::Semicolon,
284                _ => return false,
285            };
286
287            state.advance(ch.len_utf8());
288            state.add_token(token_kind, start_pos, state.get_position());
289            true
290        }
291        else {
292            false
293        }
294    }
295
296    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
297        while state.not_at_end() {
298            let start_pos = state.get_position();
299
300            // Try various lexical rules
301            if self.skip_whitespace(state) {
302                continue;
303            }
304
305            if self.lex_newline(state) {
306                continue;
307            }
308
309            if self.lex_comment(state) {
310                continue;
311            }
312
313            if self.lex_string(state) {
314                continue;
315            }
316
317            if self.lex_url(state) {
318                continue;
319            }
320
321            if self.lex_path(state) {
322                continue;
323            }
324
325            if self.lex_number(state) {
326                continue;
327            }
328
329            if self.lex_identifier(state) {
330                continue;
331            }
332
333            if self.lex_delimiter(state) {
334                continue;
335            }
336
337            // If no rules match, skip current character and mark as error
338            state.advance_if_dead_lock(start_pos);
339            if state.get_position() > start_pos {
340                state.add_token(NginxTokenType::Error, start_pos, state.get_position())
341            }
342        }
343        Ok(())
344    }
345}
346
347impl<'config> Lexer<NginxLanguage> for NginxLexer<'config> {
348    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<NginxLanguage>) -> LexOutput<NginxLanguage> {
349        let mut state = LexerState::new(source);
350        let result = self.run(&mut state);
351        if result.is_ok() {
352            state.add_eof()
353        }
354        state.finish_with_cache(result, cache)
355    }
356}