// nginx_discovery/parser/lexer.rs
use crate::ast::Span;
use crate::error::{Error, Result};
use crate::parser::{Token, TokenKind};

/// A hand-written lexer over an nginx configuration source string.
///
/// Tracks byte position plus 1-based line/column for span and error reporting.
pub struct Lexer<'a> {
    /// Full input text being lexed; all token values are sliced from it.
    input: &'a str,
    /// Current byte offset into `input` (always on a `char` boundary).
    pos: usize,
    /// Current 1-based line number.
    line: usize,
    /// Current 1-based column number.
    col: usize,
}
17
18impl<'a> Lexer<'a> {
19 #[must_use]
21 pub fn new(input: &'a str) -> Self {
22 Self {
23 input,
24 pos: 0,
25 line: 1,
26 col: 1,
27 }
28 }
29
30 pub fn next_token(&mut self) -> Result<Token> {
39 self.skip_whitespace();
41
42 if self.is_eof() {
44 return Ok(self.make_token(TokenKind::Eof));
45 }
46
47 let start_pos = self.pos;
48 let start_line = self.line;
49 let start_col = self.col;
50
51 let ch = self.current_char();
52
53 let kind = match ch {
54 '#' => self.lex_comment(),
56
57 '{' => {
59 self.advance();
60 TokenKind::LeftBrace
61 }
62 '}' => {
63 self.advance();
64 TokenKind::RightBrace
65 }
66
67 ';' => {
69 self.advance();
70 TokenKind::Semicolon
71 }
72
73 '=' => {
75 self.advance();
76 TokenKind::Word("=".to_string()) }
78
79 '"' => self.lex_string('"')?,
81 '\'' => self.lex_string('\'')?,
82
83 '$' => self.lex_variable()?,
85
86 _ if ch.is_ascii_digit() => self.lex_number(),
88 _ if is_word_start(ch) => self.lex_word(),
89
90 _ => {
91 return Err(Error::syntax(
92 format!("unexpected character '{ch}'"),
93 self.line,
94 self.col,
95 Some("valid token".to_string()),
96 Some(format!("'{ch}'")),
97 ));
98 }
99 };
100
101 let span = Span::new(start_pos, self.pos, start_line, start_col);
102 Ok(Token::new(kind, span))
103 }
104
105 pub fn tokenize(&mut self) -> Result<Vec<Token>> {
112 let mut tokens = Vec::new();
113
114 loop {
115 let token = self.next_token()?;
116 let is_eof = token.kind == TokenKind::Eof;
117 tokens.push(token);
118
119 if is_eof {
120 break;
121 }
122 }
123
124 Ok(tokens)
125 }
126
127 fn skip_whitespace(&mut self) {
129 while !self.is_eof() {
130 let ch = self.current_char();
131 if ch.is_whitespace() {
132 if ch == '\n' {
133 self.line += 1;
134 self.col = 1;
135 self.pos += 1;
136 } else {
137 self.advance();
138 }
139 } else {
140 break;
141 }
142 }
143 }
144
145 fn lex_comment(&mut self) -> TokenKind {
147 self.advance(); let start = self.pos;
150 while !self.is_eof() && self.current_char() != '\n' {
151 self.advance();
152 }
153
154 let comment = self.input[start..self.pos].trim().to_string();
155 TokenKind::Comment(comment)
156 }
157
158 fn lex_string(&mut self, quote: char) -> Result<TokenKind> {
160 self.advance(); let start = self.pos;
163 let mut escaped = false;
164
165 while !self.is_eof() {
166 let ch = self.current_char();
167
168 if escaped {
169 escaped = false;
170 self.advance();
171 continue;
172 }
173
174 if ch == '\\' {
175 escaped = true;
176 self.advance();
177 continue;
178 }
179
180 if ch == quote {
181 let value = self.input[start..self.pos].to_string();
182 self.advance(); return Ok(TokenKind::String(value));
184 }
185
186 if ch == '\n' {
187 return Err(Error::syntax(
188 "unterminated string literal",
189 self.line,
190 self.col,
191 Some("closing quote".to_string()),
192 Some("newline".to_string()),
193 ));
194 }
195
196 self.advance();
197 }
198
199 Err(Error::unexpected_eof("closing quote", self.line))
200 }
201
202 fn lex_variable(&mut self) -> Result<TokenKind> {
204 self.advance(); let start = self.pos;
207
208 if !self.is_eof() && self.current_char() == '{' {
210 self.advance(); let name_start = self.pos;
212
213 while !self.is_eof() && self.current_char() != '}' {
214 self.advance();
215 }
216
217 if self.is_eof() {
218 return Err(Error::unexpected_eof("'}'", self.line));
219 }
220
221 let name = self.input[name_start..self.pos].to_string();
222 self.advance(); return Ok(TokenKind::Variable(name));
224 }
225
226 while !self.is_eof() && is_word_char(self.current_char()) {
228 self.advance();
229 }
230
231 let name = self.input[start..self.pos].to_string();
232
233 if name.is_empty() {
234 return Err(Error::syntax(
235 "expected variable name after '$'",
236 self.line,
237 self.col,
238 Some("variable name".to_string()),
239 None,
240 ));
241 }
242
243 Ok(TokenKind::Variable(name))
244 }
245
246 fn lex_number(&mut self) -> TokenKind {
248 let start = self.pos;
249
250 while !self.is_eof() && (self.current_char().is_ascii_digit() || self.current_char() == '.')
251 {
252 self.advance();
253 }
254
255 let number = self.input[start..self.pos].to_string();
256 TokenKind::Number(number)
257 }
258
259 fn lex_word(&mut self) -> TokenKind {
261 let start = self.pos;
262
263 while !self.is_eof() && is_word_char(self.current_char()) {
264 self.advance();
265 }
266
267 let word = self.input[start..self.pos].to_string();
268 TokenKind::Word(word)
269 }
270
271 fn make_token(&self, kind: TokenKind) -> Token {
273 Token::new(kind, Span::new(self.pos, self.pos, self.line, self.col))
274 }
275
276 fn current_char(&self) -> char {
278 self.input[self.pos..].chars().next().unwrap_or('\0')
279 }
280
281 fn is_eof(&self) -> bool {
283 self.pos >= self.input.len()
284 }
285
286 fn advance(&mut self) {
288 if !self.is_eof() {
289 let ch = self.current_char();
290 self.pos += ch.len_utf8();
291 if ch != '\n' {
292 self.col += 1;
293 }
294 }
295 }
296}
297
/// Returns `true` if `ch` may begin an unquoted nginx word: ASCII letters,
/// `_`, and the path/regex leaders `/`, `.`, `*`, `^`, `~`, `\`.
fn is_word_start(ch: char) -> bool {
    ch.is_ascii_alphabetic() || matches!(ch, '_' | '/' | '.' | '*' | '^' | '~' | '\\')
}
309
/// Returns `true` if `ch` may appear inside an unquoted nginx word.
/// Broader than `is_word_start`: also admits digits, `-`, `:`, `=`, and `$`
/// so hostnames, ports, key=value flags, and inline variables stay one token.
fn is_word_char(ch: char) -> bool {
    ch.is_ascii_alphanumeric()
        || matches!(
            ch,
            '_' | '-' | '/' | '.' | ':' | '=' | '*' | '^' | '~' | '\\' | '$'
        )
}
325
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `src`, panicking on any lexing error.
    fn lex(src: &str) -> Vec<Token> {
        Lexer::new(src).tokenize().unwrap()
    }

    #[test]
    fn test_lex_simple_directive() {
        let toks = lex("user nginx;");

        assert_eq!(toks.len(), 4);
        assert_eq!(toks[0].kind, TokenKind::Word("user".to_string()));
        assert_eq!(toks[1].kind, TokenKind::Word("nginx".to_string()));
        assert_eq!(toks[2].kind, TokenKind::Semicolon);
        assert_eq!(toks[3].kind, TokenKind::Eof);
    }

    #[test]
    fn test_lex_block() {
        let toks = lex("server { listen 80; }");

        let expected = [
            TokenKind::Word("server".to_string()),
            TokenKind::LeftBrace,
            TokenKind::Word("listen".to_string()),
            TokenKind::Number("80".to_string()),
            TokenKind::Semicolon,
            TokenKind::RightBrace,
        ];
        for (tok, want) in toks.iter().zip(expected.iter()) {
            assert_eq!(&tok.kind, want);
        }
    }

    #[test]
    fn test_lex_string() {
        let toks = lex(r#"root "/var/www";"#);

        assert_eq!(toks[0].kind, TokenKind::Word("root".to_string()));
        assert_eq!(toks[1].kind, TokenKind::String("/var/www".to_string()));
        assert_eq!(toks[2].kind, TokenKind::Semicolon);
    }

    #[test]
    fn test_lex_variable() {
        let toks = lex("set $host;");

        assert_eq!(toks[0].kind, TokenKind::Word("set".to_string()));
        assert_eq!(toks[1].kind, TokenKind::Variable("host".to_string()));
        assert_eq!(toks[2].kind, TokenKind::Semicolon);
    }

    #[test]
    fn test_lex_comment() {
        let toks = lex("# This is a comment\nuser nginx;");

        assert_eq!(
            toks[0].kind,
            TokenKind::Comment("This is a comment".to_string())
        );
        assert_eq!(toks[1].kind, TokenKind::Word("user".to_string()));
    }

    #[test]
    fn test_position_tracking() {
        let toks = lex("server\n{\n listen 80;\n}");

        // One token per line for the first three lines.
        for (i, want_line) in [1, 2, 3].into_iter().enumerate() {
            assert_eq!(toks[i].span.line, want_line);
        }
    }

    #[test]
    fn test_unterminated_string() {
        let result = Lexer::new(r#"root "/var/www"#).tokenize();

        assert!(result.is_err());
    }
}