nginx_discovery/parser/lexer.rs

//! Lexer for NGINX configuration files
use crate::ast::Span;
use crate::error::{Error, Result};
use crate::parser::{Token, TokenKind};

/// Lexer for tokenizing NGINX configuration
pub struct Lexer<'a> {
    /// The input source code
    input: &'a str,
    /// Current position in bytes
    pos: usize,
    /// Current line number (1-indexed)
    line: usize,
    /// Current column number (1-indexed)
    col: usize,
}

impl<'a> Lexer<'a> {
    /// Create a new lexer
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            pos: 0,
            line: 1,
            col: 1,
        }
    }

    /// Get the next token
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - An unexpected character is encountered
    /// - A string literal is unterminated
    /// - A variable reference is malformed
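    ///
    /// A minimal error-path sketch (the doctest assumes `Lexer` is publicly
    /// re-exported from `crate::parser`):
    ///
    /// ```
    /// use nginx_discovery::parser::Lexer;
    ///
    /// let mut lexer = Lexer::new("@");
    /// assert!(lexer.next_token().is_err()); // '@' cannot start any token
    /// ```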
    pub fn next_token(&mut self) -> Result<Token> {
        // Skip whitespace
        self.skip_whitespace();

        // Check for EOF
        if self.is_eof() {
            return Ok(self.make_token(TokenKind::Eof));
        }

        let start_pos = self.pos;
        let start_line = self.line;
        let start_col = self.col;

        let ch = self.current_char();

        let kind = match ch {
            // Comments
            '#' => self.lex_comment(),

            // Braces
            '{' => {
                self.advance();
                TokenKind::LeftBrace
            }
            '}' => {
                self.advance();
                TokenKind::RightBrace
            }

            // Semicolon
            ';' => {
                self.advance();
                TokenKind::Semicolon
            }

            // Equals (for options like buffer=32k)
            '=' => {
                self.advance();
                TokenKind::Word("=".to_string()) // Treat = as a word token
            }

            // Strings
            '"' => self.lex_string('"')?,
            '\'' => self.lex_string('\'')?,

            // Variables
            '$' => self.lex_variable()?,

            // Numbers or words
            _ if ch.is_ascii_digit() => self.lex_number(),
            _ if is_word_start(ch) => self.lex_word(),

            _ => {
                return Err(Error::syntax(
                    format!("unexpected character '{ch}'"),
                    self.line,
                    self.col,
                    Some("valid token".to_string()),
                    Some(format!("'{ch}'")),
                ));
            }
        };

        let span = Span::new(start_pos, self.pos, start_line, start_col);
        Ok(Token::new(kind, span))
    }

    /// Tokenize the entire input
    ///
    /// # Errors
    ///
    /// Returns an error if any token cannot be parsed.
    /// See [`next_token`](Self::next_token) for specific error conditions.
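    ///
    /// A minimal usage sketch (the doctest assumes `Lexer` and `TokenKind`
    /// are publicly re-exported from `crate::parser` and that `Token.kind`
    /// is a public field):
    ///
    /// ```
    /// use nginx_discovery::parser::{Lexer, TokenKind};
    ///
    /// let mut lexer = Lexer::new("listen 80;");
    /// let tokens = lexer.tokenize().unwrap();
    /// assert_eq!(tokens.len(), 4); // listen, 80, ;, EOF
    /// assert_eq!(tokens[0].kind, TokenKind::Word("listen".to_string()));
    /// ```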
    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();

        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::Eof;
            tokens.push(token);

            if is_eof {
                break;
            }
        }

        Ok(tokens)
    }

    /// Skip whitespace characters
    fn skip_whitespace(&mut self) {
        // `advance` owns the line/column bookkeeping (including newlines),
        // so all whitespace can be consumed uniformly here.
        while !self.is_eof() && self.current_char().is_whitespace() {
            self.advance();
        }
    }

    /// Lex a comment
    fn lex_comment(&mut self) -> TokenKind {
        self.advance(); // Skip '#'

        let start = self.pos;
        while !self.is_eof() && self.current_char() != '\n' {
            self.advance();
        }

        let comment = self.input[start..self.pos].trim().to_string();
        TokenKind::Comment(comment)
    }

    /// Lex a quoted string
    fn lex_string(&mut self, quote: char) -> Result<TokenKind> {
        self.advance(); // Skip opening quote

        let start = self.pos;
        let mut escaped = false;

        while !self.is_eof() {
            let ch = self.current_char();

            if escaped {
                escaped = false;
                self.advance();
                continue;
            }

            if ch == '\\' {
                escaped = true;
                self.advance();
                continue;
            }

            if ch == quote {
                let value = self.input[start..self.pos].to_string();
                self.advance(); // Skip closing quote
                return Ok(TokenKind::String(value));
            }

            if ch == '\n' {
                return Err(Error::syntax(
                    "unterminated string literal",
                    self.line,
                    self.col,
                    Some("closing quote".to_string()),
                    Some("newline".to_string()),
                ));
            }

            self.advance();
        }

        Err(Error::unexpected_eof("closing quote", self.line))
    }

    /// Lex a variable ($name)
    fn lex_variable(&mut self) -> Result<TokenKind> {
        self.advance(); // Skip '$'

        let start = self.pos;

        // Variable name can be in braces: ${var_name}
        if !self.is_eof() && self.current_char() == '{' {
            self.advance(); // Skip '{'
            let name_start = self.pos;

            while !self.is_eof() && self.current_char() != '}' {
                self.advance();
            }

            if self.is_eof() {
                return Err(Error::unexpected_eof("'}'", self.line));
            }

            let name = self.input[name_start..self.pos].to_string();
            self.advance(); // Skip '}'
            return Ok(TokenKind::Variable(name));
        }

        // Regular variable: $name
        while !self.is_eof() && is_word_char(self.current_char()) {
            self.advance();
        }

        let name = self.input[start..self.pos].to_string();

        if name.is_empty() {
            return Err(Error::syntax(
                "expected variable name after '$'",
                self.line,
                self.col,
                Some("variable name".to_string()),
                None,
            ));
        }

        Ok(TokenKind::Variable(name))
    }

    /// Lex a number
    fn lex_number(&mut self) -> TokenKind {
        let start = self.pos;

        while !self.is_eof() && (self.current_char().is_ascii_digit() || self.current_char() == '.')
        {
            self.advance();
        }

        let number = self.input[start..self.pos].to_string();
        TokenKind::Number(number)
    }

    /// Lex a word (identifier)
    fn lex_word(&mut self) -> TokenKind {
        let start = self.pos;

        while !self.is_eof() && is_word_char(self.current_char()) {
            self.advance();
        }

        let word = self.input[start..self.pos].to_string();
        TokenKind::Word(word)
    }

    /// Make a token at current position
    fn make_token(&self, kind: TokenKind) -> Token {
        Token::new(kind, Span::new(self.pos, self.pos, self.line, self.col))
    }

    /// Get current character
    fn current_char(&self) -> char {
        self.input[self.pos..].chars().next().unwrap_or('\0')
    }

    /// Check if at end of file
    fn is_eof(&self) -> bool {
        self.pos >= self.input.len()
    }

    /// Advance to the next character, updating position and line/column
    fn advance(&mut self) {
        if !self.is_eof() {
            let ch = self.current_char();
            self.pos += ch.len_utf8();
            // Newlines reset the column and bump the line counter; the
            // original only skipped the column increment, so any newline
            // consumed here (e.g. inside ${...}) broke position tracking.
            if ch == '\n' {
                self.line += 1;
                self.col = 1;
            } else {
                self.col += 1;
            }
        }
    }
}

/// Check if character can start a word
fn is_word_start(ch: char) -> bool {
    ch.is_ascii_alphabetic()
        || ch == '_'
        || ch == '/'
        || ch == '.'
        || ch == '*'
        || ch == '^'
        || ch == '~'
        || ch == '\\'
}

/// Check if character can be part of a word
fn is_word_char(ch: char) -> bool {
    ch.is_ascii_alphanumeric()
        || ch == '_'
        || ch == '-'
        || ch == '/'
        || ch == '.'
        || ch == ':'
        || ch == '='
        || ch == '*'
        || ch == '^'
        || ch == '~'
        || ch == '\\'
        || ch == '$' // Allow '$' inside words for regex anchors like '^/api$'
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lex_simple_directive() {
        let mut lexer = Lexer::new("user nginx;");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens.len(), 4); // user, nginx, ;, EOF
        assert_eq!(tokens[0].kind, TokenKind::Word("user".to_string()));
        assert_eq!(tokens[1].kind, TokenKind::Word("nginx".to_string()));
        assert_eq!(tokens[2].kind, TokenKind::Semicolon);
        assert_eq!(tokens[3].kind, TokenKind::Eof);
    }

    #[test]
    fn test_lex_block() {
        let mut lexer = Lexer::new("server { listen 80; }");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens[0].kind, TokenKind::Word("server".to_string()));
        assert_eq!(tokens[1].kind, TokenKind::LeftBrace);
        assert_eq!(tokens[2].kind, TokenKind::Word("listen".to_string()));
        assert_eq!(tokens[3].kind, TokenKind::Number("80".to_string()));
        assert_eq!(tokens[4].kind, TokenKind::Semicolon);
        assert_eq!(tokens[5].kind, TokenKind::RightBrace);
    }

    #[test]
    fn test_lex_string() {
        let mut lexer = Lexer::new(r#"root "/var/www";"#);
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens[0].kind, TokenKind::Word("root".to_string()));
        assert_eq!(tokens[1].kind, TokenKind::String("/var/www".to_string()));
        assert_eq!(tokens[2].kind, TokenKind::Semicolon);
    }
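
    // Additional check (a sketch): escape sequences inside quoted strings
    // are preserved verbatim; `lex_string` slices the raw input and performs
    // no unescaping.
    #[test]
    fn test_lex_escaped_string() {
        let mut lexer = Lexer::new(r#"log_format "a \"b\"";"#);
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens[0].kind, TokenKind::Word("log_format".to_string()));
        assert_eq!(tokens[1].kind, TokenKind::String(r#"a \"b\""#.to_string()));
    }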

    #[test]
    fn test_lex_variable() {
        let mut lexer = Lexer::new("set $host;");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens[0].kind, TokenKind::Word("set".to_string()));
        assert_eq!(tokens[1].kind, TokenKind::Variable("host".to_string()));
        assert_eq!(tokens[2].kind, TokenKind::Semicolon);
    }
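
    // A sketch covering the braced form: ${host} should yield the same
    // Variable token as bare $host.
    #[test]
    fn test_lex_braced_variable() {
        let mut lexer = Lexer::new("set ${host};");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens[0].kind, TokenKind::Word("set".to_string()));
        assert_eq!(tokens[1].kind, TokenKind::Variable("host".to_string()));
        assert_eq!(tokens[2].kind, TokenKind::Semicolon);
    }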

    #[test]
    fn test_lex_comment() {
        let mut lexer = Lexer::new("# This is a comment\nuser nginx;");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(
            tokens[0].kind,
            TokenKind::Comment("This is a comment".to_string())
        );
        assert_eq!(tokens[1].kind, TokenKind::Word("user".to_string()));
    }

    #[test]
    fn test_position_tracking() {
        let mut lexer = Lexer::new("server\n{\n  listen 80;\n}");
        let tokens = lexer.tokenize().unwrap();

        // Check that positions are tracked
        assert_eq!(tokens[0].span.line, 1);
        assert_eq!(tokens[1].span.line, 2);
        assert_eq!(tokens[2].span.line, 3);
    }

    #[test]
    fn test_unterminated_string() {
        let mut lexer = Lexer::new(r#"root "/var/www"#);
        let result = lexer.tokenize();

        assert!(result.is_err());
    }
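
    // A sketch of regex-flavored words: `is_word_start`/`is_word_char`
    // accept '~', '^', '*', and '$', so a location pattern lexes as a
    // single word.
    #[test]
    fn test_lex_regex_word() {
        let mut lexer = Lexer::new("location ~ ^/api/.*$ { }");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens[0].kind, TokenKind::Word("location".to_string()));
        assert_eq!(tokens[1].kind, TokenKind::Word("~".to_string()));
        assert_eq!(tokens[2].kind, TokenKind::Word("^/api/.*$".to_string()));
        assert_eq!(tokens[3].kind, TokenKind::LeftBrace);
    }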
}