use crate::ast::{Position, Span};
use crate::error::{LexerError, ParseResult};

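/// The kind of a lexed token in an nginx-style configuration.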
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    Ident(String),
    Argument(String),
    DoubleQuotedString(String),
    SingleQuotedString(String),
    Variable(String),
    Semicolon,
    OpenBrace,
    CloseBrace,
    Comment(String),
    Newline,
    Eof,
}

impl TokenKind {
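    /// A human-readable name for this token kind, used in diagnostics.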
    pub fn display_name(&self) -> &str {
        match self {
            TokenKind::Ident(_) => "identifier",
            TokenKind::Argument(_) => "argument",
            TokenKind::DoubleQuotedString(_) => "string",
            TokenKind::SingleQuotedString(_) => "string",
            TokenKind::Variable(_) => "variable",
            TokenKind::Semicolon => "';'",
            TokenKind::OpenBrace => "'{'",
            TokenKind::CloseBrace => "'}'",
            TokenKind::Comment(_) => "comment",
            TokenKind::Newline => "newline",
            TokenKind::Eof => "end of file",
        }
    }
}

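/// A token plus the metadata needed to reproduce it as written: its source
/// span, its raw text, and the whitespace that preceded it.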
#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
    pub raw: String,
    pub leading_whitespace: String,
}

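/// A hand-written lexer over a UTF-8 source string. Columns are counted in
/// characters, offsets in bytes.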
pub struct Lexer<'a> {
    source: &'a str,
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    line: usize,
    column: usize,
    offset: usize,
}

impl<'a> Lexer<'a> {
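    /// Creates a lexer positioned at the start of `source`.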
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            chars: source.char_indices().peekable(),
            line: 1,
            column: 1,
            offset: 0,
        }
    }

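    /// The current position as a 1-based line/column pair and a byte offset.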
    fn position(&self) -> Position {
        Position::new(self.line, self.column, self.offset)
    }

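    /// Consumes one character, keeping line, column, and byte offset in sync.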
    fn advance(&mut self) -> Option<(usize, char)> {
        if let Some((idx, ch)) = self.chars.next() {
            self.offset = idx + ch.len_utf8();
            if ch == '\n' {
                self.line += 1;
                self.column = 1;
            } else {
                self.column += 1;
            }
            Some((idx, ch))
        } else {
            None
        }
    }

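    /// Peeks at the next character without consuming it.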
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().map(|(_, ch)| *ch)
    }

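    /// Consumes spaces and tabs (but not newlines) and returns them verbatim.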
    fn skip_whitespace_same_line(&mut self) -> String {
        let mut whitespace = String::new();
        while let Some(ch) = self.peek() {
            if ch == ' ' || ch == '\t' {
                whitespace.push(ch);
                self.advance();
            } else {
                break;
            }
        }
        whitespace
    }

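    /// Lexes the next token, returning an `Eof` token at end of input.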
    pub fn next_token(&mut self) -> ParseResult<Token> {
        let leading_whitespace = self.skip_whitespace_same_line();

        let start_pos = self.position();
        let start_offset = self.offset;

        let Some((_, ch)) = self.advance() else {
            return Ok(Token {
                kind: TokenKind::Eof,
                span: Span::new(start_pos, start_pos),
                raw: String::new(),
                leading_whitespace,
            });
        };

        let kind = match ch {
            '\n' => TokenKind::Newline,
            ';' => TokenKind::Semicolon,
            '{' => TokenKind::OpenBrace,
            '}' => TokenKind::CloseBrace,
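            // `#` starts a comment only at the beginning of a line or after
            // whitespace; elsewhere it is lexed as part of an argument,
            // e.g. inside a regex pattern.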
            '#' if !leading_whitespace.is_empty() || start_pos.column == 1 => {
                let mut text = String::from('#');
                while let Some(c) = self.peek() {
                    if c == '\n' {
                        break;
                    }
                    text.push(c);
                    self.advance();
                }
                TokenKind::Comment(text)
            }
            '#' => {
                let value = self.read_argument(ch);
                TokenKind::Argument(value)
            }
            '"' => self.read_double_quoted_string(start_pos)?,
            '\'' => self.read_single_quoted_string(start_pos)?,
            '$' => {
                let name = self.read_variable_name();
                TokenKind::Variable(name)
            }
            _ if is_ident_start(ch) => {
                let value = self.read_identifier(ch);
                TokenKind::Ident(value)
            }
            _ if is_argument_char(ch) => {
                let value = self.read_argument(ch);
                TokenKind::Argument(value)
            }
            _ => {
                return Err(LexerError::UnexpectedChar {
                    ch,
                    position: start_pos,
                }
                .into());
            }
        };

        let end_pos = self.position();
        let raw = self.source[start_offset..self.offset].to_string();

        Ok(Token {
            kind,
            span: Span::new(start_pos, end_pos),
            raw,
            leading_whitespace,
        })
    }

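    /// Reads a double-quoted string; the opening `"` has already been
    /// consumed. Recognized escapes are translated, unknown ones are kept
    /// verbatim.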
    fn read_double_quoted_string(&mut self, start_pos: Position) -> ParseResult<TokenKind> {
        let mut value = String::new();

        loop {
            match self.advance() {
                Some((_, '"')) => break,
                Some((_, '\\')) => {
                    match self.advance() {
                        Some((_, 'n')) => value.push('\n'),
                        Some((_, 't')) => value.push('\t'),
                        Some((_, 'r')) => value.push('\r'),
                        Some((_, '\\')) => value.push('\\'),
                        Some((_, '"')) => value.push('"'),
                        Some((_, '$')) => value.push('$'),
                        Some((_, c)) => {
                            value.push('\\');
                            value.push(c);
                        }
                        None => {
                            return Err(LexerError::UnterminatedString {
                                position: start_pos,
                            }
                            .into());
                        }
                    }
                }
                Some((_, ch)) => value.push(ch),
                None => {
                    return Err(LexerError::UnterminatedString {
                        position: start_pos,
                    }
                    .into());
                }
            }
        }

        Ok(TokenKind::DoubleQuotedString(value))
    }

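    /// Reads a single-quoted string; the opening `'` has already been
    /// consumed. Only `\\` and `\'` are escapes; any other backslash is kept
    /// verbatim.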
    fn read_single_quoted_string(&mut self, start_pos: Position) -> ParseResult<TokenKind> {
        let mut value = String::new();

        loop {
            match self.advance() {
                Some((_, '\'')) => break,
                Some((_, '\\')) => {
                    match self.advance() {
                        Some((_, '\\')) => value.push('\\'),
                        Some((_, '\'')) => value.push('\''),
                        Some((_, c)) => {
                            value.push('\\');
                            value.push(c);
                        }
                        None => {
                            return Err(LexerError::UnterminatedString {
                                position: start_pos,
                            }
                            .into());
                        }
                    }
                }
                Some((_, ch)) => value.push(ch),
                None => {
                    return Err(LexerError::UnterminatedString {
                        position: start_pos,
                    }
                    .into());
                }
            }
        }

        Ok(TokenKind::SingleQuotedString(value))
    }

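    /// Reads a variable name after the `$`, supporting both the `$name` and
    /// `${name}` forms.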
    fn read_variable_name(&mut self) -> String {
        let mut name = String::new();

        if self.peek() == Some('{') {
            self.advance(); // consume '{'
            while let Some(ch) = self.peek() {
                if ch == '}' {
                    self.advance();
                    break;
                }
                name.push(ch);
                self.advance();
            }
        } else {
            while let Some(ch) = self.peek() {
                if ch.is_alphanumeric() || ch == '_' {
                    name.push(ch);
                    self.advance();
                } else {
                    break;
                }
            }
        }

        name
    }

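    /// Reads an identifier starting with `first`, then continues it with
    /// argument characters so values like `foo#bar` stay a single token.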
    fn read_identifier(&mut self, first: char) -> String {
        let mut value = String::from(first);

        while let Some(ch) = self.peek() {
            if is_ident_continue(ch) {
                value.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        self.read_argument_continuation(&mut value);

        value
    }

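    /// Reads a bare argument starting with `first`.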
    fn read_argument(&mut self, first: char) -> String {
        let mut value = String::from(first);
        self.read_argument_continuation(&mut value);
        value
    }

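    /// Extends `value` with argument characters. Escaped braces (`\{`, `\}`),
    /// regex quantifiers such as `{1,3}`, and `$` end anchors are kept in the
    /// token instead of terminating it.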
    fn read_argument_continuation(&mut self, value: &mut String) {
        while let Some(ch) = self.peek() {
            if is_argument_char(ch) || is_ident_continue(ch) {
                if ch == '\\'
                    && let Some(escaped) = self.peek_escaped_brace()
                {
                    value.push('\\');
                    self.advance(); // consume the backslash
                    value.push(escaped);
                    self.advance(); // consume the brace
                    continue;
                }
                value.push(ch);
                self.advance();
            } else if ch == '{' {
                if let Some(quantifier) = self.peek_regex_quantifier() {
                    // The quantifier is ASCII, so its byte length equals its
                    // length in characters.
                    for _ in 0..quantifier.len() {
                        self.advance();
                    }
                    value.push_str(&quantifier);
                } else {
                    break;
                }
            } else if ch == '$' {
                if self.is_regex_end_anchor() {
                    value.push(ch);
                    self.advance();
                } else {
                    break;
                }
            } else {
                break;
            }
        }
    }

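    /// Whether the `$` at the current offset looks like a regex end anchor
    /// rather than the start of a variable reference.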
    fn is_regex_end_anchor(&self) -> bool {
        let remaining = &self.source[self.offset..];
        let mut chars = remaining.chars();

        if chars.next() != Some('$') {
            return false;
        }

        match chars.next() {
            // `$` at end of input or before whitespace anchors the pattern.
            None => true,
            Some(c) if c.is_whitespace() => true,
            // `${`, `$name`, `$_`: a variable reference, not an anchor.
            Some('{') => false,
            Some(c) if c.is_alphanumeric() => false,
            Some('_') => false,
            _ => true,
        }
    }

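    /// If the source at the current offset is `\{` or `\}`, returns the brace
    /// without consuming anything.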
    fn peek_escaped_brace(&self) -> Option<char> {
        let remaining = &self.source[self.offset..];
        let mut chars = remaining.chars();

        if chars.next() != Some('\\') {
            return None;
        }

        match chars.next() {
            Some('{') => Some('{'),
            Some('}') => Some('}'),
            _ => None,
        }
    }

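    /// If the source at the current offset is a regex quantifier such as
    /// `{8}`, `{1,3}`, or `{8,}`, returns its text without consuming anything.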
    fn peek_regex_quantifier(&self) -> Option<String> {
        let remaining = &self.source[self.offset..];

        if !remaining.starts_with('{') {
            return None;
        }

        let mut chars = remaining.chars().peekable();
        chars.next(); // consume '{'

        let mut quantifier = String::from("{");

        // A quantifier must begin with at least one digit.
        match chars.peek() {
            Some(ch) if ch.is_ascii_digit() => {
                quantifier.push(*ch);
                chars.next();
            }
            _ => return None,
        }

        while let Some(&ch) = chars.peek() {
            if ch.is_ascii_digit() {
                quantifier.push(ch);
                chars.next();
            } else {
                break;
            }
        }

        match chars.peek() {
            // `{n}`: exact count.
            Some('}') => {
                quantifier.push('}');
                Some(quantifier)
            }
            // `{n,}` or `{n,m}`: open-ended or bounded range.
            Some(',') => {
                quantifier.push(',');
                chars.next();

                while let Some(&ch) = chars.peek() {
                    if ch.is_ascii_digit() {
                        quantifier.push(ch);
                        chars.next();
                    } else {
                        break;
                    }
                }

                if chars.peek() == Some(&'}') {
                    quantifier.push('}');
                    Some(quantifier)
                } else {
                    None
                }
            }
            _ => None,
        }
    }

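    /// Tokenizes the entire input, ending with an `Eof` token.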
    pub fn tokenize(&mut self) -> ParseResult<Vec<Token>> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = matches!(token.kind, TokenKind::Eof);
            tokens.push(token);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }
}

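/// Whether `ch` can start an identifier.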
fn is_ident_start(ch: char) -> bool {
    ch.is_alphabetic() || ch == '_'
}

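/// Whether `ch` can continue an identifier.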
fn is_ident_continue(ch: char) -> bool {
    ch.is_alphanumeric() || ch == '_' || ch == '-'
}

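/// Whether `ch` can appear in a bare (unquoted) argument.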
fn is_argument_char(ch: char) -> bool {
    !ch.is_whitespace() && !matches!(ch, ';' | '{' | '}' | '"' | '\'' | '$')
}

#[cfg(test)]
mod tests {
    use super::*;

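    /// Lexes `source` and returns just the token kinds.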
    fn tokenize(source: &str) -> Vec<TokenKind> {
        let mut lexer = Lexer::new(source);
        let tokens = lexer.tokenize().unwrap();
        tokens.into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn test_simple_directive() {
        let tokens = tokenize("listen 80;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("listen".to_string()),
                TokenKind::Argument("80".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_block() {
        let tokens = tokenize("http { }");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("http".to_string()),
                TokenKind::OpenBrace,
                TokenKind::CloseBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_double_quoted_string() {
        let tokens = tokenize(r#"return 200 "hello world";"#);
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("return".to_string()),
                TokenKind::Argument("200".to_string()),
                TokenKind::DoubleQuotedString("hello world".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_single_quoted_string() {
        let tokens = tokenize("return 200 'hello world';");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("return".to_string()),
                TokenKind::Argument("200".to_string()),
                TokenKind::SingleQuotedString("hello world".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_escape_sequences() {
        let tokens = tokenize(r#"return 200 "hello\nworld";"#);
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("return".to_string()),
                TokenKind::Argument("200".to_string()),
                TokenKind::DoubleQuotedString("hello\nworld".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_variable() {
        let tokens = tokenize("set $var value;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("set".to_string()),
                TokenKind::Variable("var".to_string()),
                TokenKind::Ident("value".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_variable_braces() {
        let tokens = tokenize("return 200 ${request_uri};");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("return".to_string()),
                TokenKind::Argument("200".to_string()),
                TokenKind::Variable("request_uri".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_comment() {
        let tokens = tokenize("# this is a comment\nlisten 80;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Comment("# this is a comment".to_string()),
                TokenKind::Newline,
                TokenKind::Ident("listen".to_string()),
                TokenKind::Argument("80".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_path_argument() {
        let tokens = tokenize("root /var/www/html;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("root".to_string()),
                TokenKind::Argument("/var/www/html".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_extension_directive() {
        let tokens = tokenize(r#"more_set_headers "Server: Custom";"#);
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("more_set_headers".to_string()),
                TokenKind::DoubleQuotedString("Server: Custom".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_glob_pattern() {
        let tokens = tokenize("include /etc/nginx/conf.d/*.conf;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("include".to_string()),
                TokenKind::Argument("/etc/nginx/conf.d/*.conf".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_utf8_comment() {
        let tokens = tokenize("# これは日本語コメント\nlisten 80;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Comment("# これは日本語コメント".to_string()),
                TokenKind::Newline,
                TokenKind::Ident("listen".to_string()),
                TokenKind::Argument("80".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_utf8_string() {
        let tokens = tokenize(r#"return 200 "こんにちは";"#);
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("return".to_string()),
                TokenKind::Argument("200".to_string()),
                TokenKind::DoubleQuotedString("こんにちは".to_string()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_position_tracking() {
        let mut lexer = Lexer::new("http {\n    listen 80;\n}");
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens[0].span.start.line, 1);
        assert_eq!(tokens[0].span.start.column, 1);

        assert_eq!(tokens[1].span.start.line, 1);
        assert_eq!(tokens[1].span.start.column, 6);

        assert_eq!(tokens[2].span.start.line, 1);

        assert_eq!(tokens[3].span.start.line, 2);
        assert_eq!(tokens[3].span.start.column, 5);
    }

    #[test]
    fn test_regex_quantifier() {
        let tokens = tokenize(r"location ~ ^/[a-z]{8}$ {");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("location".to_string()),
                TokenKind::Argument("~".to_string()),
                TokenKind::Argument("^/[a-z]{8}$".to_string()),
                TokenKind::OpenBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_regex_quantifier_range() {
        let tokens = tokenize(r"location ~ ^/[0-9]{1,3}$ {");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("location".to_string()),
                TokenKind::Argument("~".to_string()),
                TokenKind::Argument("^/[0-9]{1,3}$".to_string()),
                TokenKind::OpenBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_regex_quantifier_open_ended() {
        let tokens = tokenize(r"location ~ ^/[a-z]{8,}$ {");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("location".to_string()),
                TokenKind::Argument("~".to_string()),
                TokenKind::Argument("^/[a-z]{8,}$".to_string()),
                TokenKind::OpenBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_escaped_braces_in_regex() {
        let tokens = tokenize(r"location ~ ^/nested/\{[a-z]+\}$ {");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("location".to_string()),
                TokenKind::Argument("~".to_string()),
                TokenKind::Argument(r"^/nested/\{[a-z]+\}$".to_string()),
                TokenKind::OpenBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_multiple_escaped_braces() {
        let tokens = tokenize(r"location ~ ^/data/\{id\}/\{name\}$ {");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("location".to_string()),
                TokenKind::Argument("~".to_string()),
                TokenKind::Argument(r"^/data/\{id\}/\{name\}$".to_string()),
                TokenKind::OpenBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_hash_in_argument() {
        let tokens = tokenize("location ~* foo#bar {");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("location".to_string()),
                TokenKind::Argument("~*".to_string()),
                TokenKind::Ident("foo#bar".to_string()),
                TokenKind::OpenBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_hash_in_regex_pattern() {
        let tokens = tokenize(r"location ~* (?:#.*#|\.bak)$ {");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("location".to_string()),
                TokenKind::Argument("~*".to_string()),
                TokenKind::Argument(r"(?:#.*#|\.bak)$".to_string()),
                TokenKind::OpenBrace,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_hash_comment_after_whitespace() {
        let tokens = tokenize("listen 80; # this is a comment");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Ident("listen".to_string()),
                TokenKind::Argument("80".to_string()),
                TokenKind::Semicolon,
                TokenKind::Comment("# this is a comment".to_string()),
                TokenKind::Eof,
            ]
        );
    }

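    /// Lexes `source` and returns the full tokens, spans included.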
    fn tokenize_full(source: &str) -> Vec<Token> {
        let mut lexer = Lexer::new(source);
        lexer.tokenize().unwrap()
    }

    #[test]
    fn test_utf8_comment_column_is_char_based() {
        let input = "# 開発環境\nlisten 80;";
        let tokens = tokenize_full(input);
        assert_eq!(tokens[0].span.start.column, 1);
        let listen_token = tokens
            .iter()
            .find(|t| t.kind == TokenKind::Ident("listen".to_string()))
            .unwrap();
        assert_eq!(listen_token.span.start.line, 2);
        assert_eq!(listen_token.span.start.column, 1);
    }

    #[test]
    fn test_utf8_argument_column_tracking() {
        let input = "listen 80;";
        let tokens = tokenize_full(input);
        let arg_token = tokens
            .iter()
            .find(|t| t.kind == TokenKind::Argument("80".to_string()))
            .unwrap();
        assert_eq!(arg_token.span.start.column, 8);
    }
}