1use std::fmt;
6
7use crate::token::{Span, Token, TokenKind};
8
/// The category of error produced while tokenizing input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A double-quoted string was opened but never closed.
    UnterminatedString,
    /// A backtick string was opened but never closed.
    UnterminatedBacktick,
    /// A heredoc body never reached its closing marker line.
    UnterminatedHeredoc { marker: String },
    /// A heredoc introducer (`<<`) was not followed by a marker.
    EmptyHeredocMarker,
    /// A byte that cannot start any token was encountered.
    UnexpectedCharacter(char),
}
23
24impl fmt::Display for LexErrorKind {
25 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
26 match self {
27 Self::UnterminatedString => {
28 write!(f, "unterminated quoted string")
29 }
30 Self::UnterminatedBacktick => {
31 write!(f, "unterminated backtick string")
32 }
33 Self::UnterminatedHeredoc { marker } => {
34 write!(
35 f,
36 "unterminated heredoc, \
37 expected closing marker: {marker}"
38 )
39 }
40 Self::EmptyHeredocMarker => {
41 write!(f, "empty heredoc marker")
42 }
43 Self::UnexpectedCharacter(ch) => {
44 write!(f, "unexpected character: {ch}")
45 }
46 }
47 }
48}
49
/// A lexing failure: the error category paired with the source position
/// (1-based line/column) at which the offending construct began.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[error("{kind} at line {}, column {}", span.line, span.column)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Where the offending token started.
    pub span: Span,
}
57
/// Tokenizes `input` into a flat list of tokens.
///
/// # Errors
///
/// Returns a [`LexError`] for an unterminated quoted/backtick string or
/// heredoc, an empty heredoc marker, or a character that cannot start a
/// token.
pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
    Lexer::new(input).tokenize()
}
67
/// Byte-oriented lexer state over the raw input.
struct Lexer<'a> {
    /// Raw input bytes; text is decoded with lossy UTF-8 where needed.
    input: &'a [u8],
    /// Current byte offset into `input`.
    pos: usize,
    /// 1-based line number of the byte at `pos`.
    line: usize,
    /// 1-based column (counted in bytes) of the byte at `pos`.
    col: usize,
}
74
75impl<'a> Lexer<'a> {
76 fn new(input: &'a str) -> Self {
77 let bytes = input.as_bytes();
78 let start = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
79 3
80 } else {
81 0
82 };
83 Self {
84 input: bytes,
85 pos: start,
86 line: 1,
87 col: 1,
88 }
89 }
90
91 fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
92 let mut tokens = Vec::new();
93
94 while self.pos < self.input.len() {
95 let ch = self.input[self.pos];
96
97 match ch {
98 b'\n' => {
99 tokens.push(self.make_token(TokenKind::Newline, "\n".to_string()));
100 self.advance();
101 }
102 b'\r' => {
103 self.advance();
104 if self.peek() == Some(b'\n') {
105 self.advance();
106 }
107 tokens.push(Self::make_token_at(
108 TokenKind::Newline,
109 "\n".to_string(),
110 self.line - 1,
111 self.col,
112 ));
113 }
114 b' ' | b'\t' => {
115 self.advance();
116 }
117 b'#' => {
118 tokens.push(self.read_comment());
119 }
120 b'{' => {
121 if self.try_read_env_var(&mut tokens) {
122 } else {
124 tokens.push(self.make_token(TokenKind::OpenBrace, "{".to_string()));
125 self.advance();
126 }
127 }
128 b'}' => {
129 tokens.push(self.make_token(TokenKind::CloseBrace, "}".to_string()));
130 self.advance();
131 }
132 b'"' => {
133 tokens.push(self.read_quoted_string()?);
134 }
135 b'`' => {
136 tokens.push(self.read_backtick_string()?);
137 }
138 b'\\' if self.peek_at(1) == Some(b'\n') => {
139 self.advance(); self.advance(); }
143 b'\\' if self.peek_at(1) == Some(b'\r') => {
144 self.advance();
145 self.advance();
146 if self.peek() == Some(b'\n') {
147 self.advance();
148 }
149 }
150 _ => {
151 tokens.push(self.read_word()?);
152 }
153 }
154 }
155
156 Ok(tokens)
157 }
158
    /// The lexer's current position as a `Span`.
    const fn span(&self) -> Span {
        Span {
            line: self.line,
            column: self.col,
        }
    }
165
    /// Builds a token whose span is the lexer's current position.
    const fn make_token(&self, kind: TokenKind, text: String) -> Token {
        Token {
            kind,
            text,
            span: self.span(),
        }
    }

    /// Builds a token at an explicit line/column — used when the input
    /// for the token has already been consumed by the time it is built.
    const fn make_token_at(kind: TokenKind, text: String, line: usize, col: usize) -> Token {
        Token {
            kind,
            text,
            span: Span { line, column: col },
        }
    }
181
    /// The byte at the current position, if any.
    fn peek(&self) -> Option<u8> {
        self.input.get(self.pos).copied()
    }

    /// The byte `offset` bytes ahead of the current position, if any.
    fn peek_at(&self, offset: usize) -> Option<u8> {
        self.input.get(self.pos + offset).copied()
    }
189
190 fn advance(&mut self) {
191 if self.pos < self.input.len() {
192 if self.input[self.pos] == b'\n' {
193 self.line += 1;
194 self.col = 1;
195 } else {
196 self.col += 1;
197 }
198 self.pos += 1;
199 }
200 }
201
202 fn read_comment(&mut self) -> Token {
203 let start_line = self.line;
204 let start_col = self.col;
205 let start = self.pos;
206
207 while self.pos < self.input.len() && self.input[self.pos] != b'\n' {
208 self.pos += 1;
209 self.col += 1;
210 }
211
212 let text = String::from_utf8_lossy(&self.input[start..self.pos]).into_owned();
213
214 Token {
215 kind: TokenKind::Comment,
216 text,
217 span: Span {
218 line: start_line,
219 column: start_col,
220 },
221 }
222 }
223
224 fn read_quoted_string(&mut self) -> Result<Token, LexError> {
225 let start_line = self.line;
226 let start_col = self.col;
227 self.advance(); let mut value = String::new();
230 loop {
231 match self.peek() {
232 None => {
233 return Err(LexError {
234 kind: LexErrorKind::UnterminatedString,
235 span: Span {
236 line: start_line,
237 column: start_col,
238 },
239 });
240 }
241 Some(b'\\') => {
242 self.advance();
243 match self.peek() {
244 Some(b'n') => {
245 value.push('\n');
246 self.advance();
247 }
248 Some(b't') => {
249 value.push('\t');
250 self.advance();
251 }
252 Some(b'r') => {
253 value.push('\r');
254 self.advance();
255 }
256 Some(b'"') => {
257 value.push('"');
258 self.advance();
259 }
260 Some(b'\\') => {
261 value.push('\\');
262 self.advance();
263 }
264 Some(c) => {
265 value.push('\\');
266 value.push(char::from(c));
267 self.advance();
268 }
269 None => {
270 value.push('\\');
271 }
272 }
273 }
274 Some(b'"') => {
275 self.advance();
276 break;
277 }
278 Some(c) => {
279 if c == b'\n' {
280 self.advance();
282 value.push('\n');
283 } else {
284 value.push(char::from(c));
285 self.advance();
286 }
287 }
288 }
289 }
290
291 Ok(Token {
292 kind: TokenKind::QuotedString,
293 text: value,
294 span: Span {
295 line: start_line,
296 column: start_col,
297 },
298 })
299 }
300
301 fn read_backtick_string(&mut self) -> Result<Token, LexError> {
302 let start_line = self.line;
303 let start_col = self.col;
304 self.advance(); let mut value = String::new();
307 loop {
308 match self.peek() {
309 None => {
310 return Err(LexError {
311 kind: LexErrorKind::UnterminatedBacktick,
312 span: Span {
313 line: start_line,
314 column: start_col,
315 },
316 });
317 }
318 Some(b'`') => {
319 self.advance();
320 break;
321 }
322 Some(c) => {
323 if c == b'\n' {
324 self.advance();
325 value.push('\n');
326 } else {
327 value.push(char::from(c));
328 self.advance();
329 }
330 }
331 }
332 }
333
334 Ok(Token {
335 kind: TokenKind::BacktickString,
336 text: value,
337 span: Span {
338 line: start_line,
339 column: start_col,
340 },
341 })
342 }
343
344 fn try_read_env_var(&mut self, tokens: &mut Vec<Token>) -> bool {
345 if self.peek_at(1) != Some(b'$') {
347 return false;
348 }
349
350 let start_line = self.line;
351 let start_col = self.col;
352 let save_pos = self.pos;
353 let save_line = self.line;
354 let save_col = self.col;
355
356 self.advance(); self.advance(); let name_start = self.pos;
360 while self.pos < self.input.len()
361 && self.input[self.pos] != b'}'
362 && self.input[self.pos] != b':'
363 && self.input[self.pos] != b'\n'
364 {
365 self.pos += 1;
366 self.col += 1;
367 }
368 let name = String::from_utf8_lossy(&self.input[name_start..self.pos]).into_owned();
369
370 let default = if self.peek() == Some(b':') {
371 self.pos += 1;
372 self.col += 1;
373 let def_start = self.pos;
374 while self.pos < self.input.len()
375 && self.input[self.pos] != b'}'
376 && self.input[self.pos] != b'\n'
377 {
378 self.pos += 1;
379 self.col += 1;
380 }
381 Some(String::from_utf8_lossy(&self.input[def_start..self.pos]).into_owned())
382 } else {
383 None
384 };
385
386 if self.peek() != Some(b'}') {
387 self.pos = save_pos;
389 self.line = save_line;
390 self.col = save_col;
391 return false;
392 }
393
394 self.pos += 1;
395 self.col += 1;
396
397 let text = String::from_utf8_lossy(&self.input[save_pos..self.pos]).into_owned();
398
399 tokens.push(Token {
400 kind: TokenKind::EnvVar { name, default },
401 text,
402 span: Span {
403 line: start_line,
404 column: start_col,
405 },
406 });
407
408 true
409 }
410
411 fn read_word(&mut self) -> Result<Token, LexError> {
412 let start_line = self.line;
413 let start_col = self.col;
414 let start = self.pos;
415
416 if self.input[self.pos] == b'<' && self.peek_at(1) == Some(b'<') {
418 return self.read_heredoc(start_line, start_col);
419 }
420
421 while self.pos < self.input.len() {
422 let ch = self.input[self.pos];
423 match ch {
424 b' ' | b'\t' | b'\n' | b'\r' => break,
425 b'{' | b'}' => {
426 if ch == b'{' && self.peek_at(1) == Some(b'$') {
428 break;
429 }
430 if self.pos == start {
433 break;
434 }
435 self.pos += 1;
438 self.col += 1;
439 }
440 b'\\' => {
441 self.pos += 1;
443 self.col += 1;
444 if self.pos < self.input.len() {
445 self.pos += 1;
446 self.col += 1;
447 }
448 }
449 _ => {
450 self.pos += 1;
451 self.col += 1;
452 }
453 }
454 }
455
456 let text = String::from_utf8_lossy(&self.input[start..self.pos]).into_owned();
457
458 if text.is_empty() {
459 return Err(LexError {
460 kind: LexErrorKind::UnexpectedCharacter(char::from(self.input[start])),
461 span: Span {
462 line: start_line,
463 column: start_col,
464 },
465 });
466 }
467
468 Ok(Token {
469 kind: TokenKind::Word,
470 text,
471 span: Span {
472 line: start_line,
473 column: start_col,
474 },
475 })
476 }
477
478 fn read_heredoc(&mut self, start_line: usize, start_col: usize) -> Result<Token, LexError> {
479 self.advance(); self.advance(); let marker_start = self.pos;
484 while self.pos < self.input.len()
485 && self.input[self.pos] != b'\n'
486 && self.input[self.pos] != b'\r'
487 && self.input[self.pos] != b' '
488 && self.input[self.pos] != b'\t'
489 {
490 self.pos += 1;
491 self.col += 1;
492 }
493
494 let marker = String::from_utf8_lossy(&self.input[marker_start..self.pos]).into_owned();
495
496 if marker.is_empty() {
497 return Err(LexError {
498 kind: LexErrorKind::EmptyHeredocMarker,
499 span: Span {
500 line: start_line,
501 column: start_col,
502 },
503 });
504 }
505
506 if self.peek() == Some(b'\r') {
508 self.advance();
509 }
510 if self.peek() == Some(b'\n') {
511 self.advance();
512 }
513
514 let content_start = self.pos;
516
517 while self.pos < self.input.len() {
518 let line_start = self.pos;
519 while self.pos < self.input.len() && self.input[self.pos] != b'\n' {
521 self.pos += 1;
522 self.col += 1;
523 }
524
525 let line = String::from_utf8_lossy(&self.input[line_start..self.pos]);
526 let trimmed = line.trim();
527
528 if trimmed == marker {
529 let content =
530 String::from_utf8_lossy(&self.input[content_start..line_start]).into_owned();
531 let content = content
533 .strip_suffix('\n')
534 .or_else(|| content.strip_suffix("\r\n"))
535 .unwrap_or(&content)
536 .to_string();
537
538 if self.peek() == Some(b'\n') {
539 self.advance();
540 }
541
542 return Ok(Token {
543 kind: TokenKind::Heredoc { marker },
544 text: content,
545 span: Span {
546 line: start_line,
547 column: start_col,
548 },
549 });
550 }
551
552 if self.peek() == Some(b'\n') {
553 self.advance();
554 }
555 }
556
557 Err(LexError {
558 kind: LexErrorKind::UnterminatedHeredoc { marker },
559 span: Span {
560 line: start_line,
561 column: start_col,
562 },
563 })
564 }
565}
566
#[cfg(test)]
mod tests {
    use super::*;

    // Plain words separated by spaces.
    #[test]
    fn simple_words() {
        let tokens = tokenize("reverse_proxy app:3000").expect("should tokenize");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "reverse_proxy");
        assert_eq!(tokens[1].text, "app:3000");
    }

    // Braces become their own tokens; newlines are preserved.
    #[test]
    fn braces_and_newlines() {
        let tokens = tokenize("example.com {\n log\n}\n").expect("should tokenize");
        let kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert!(matches!(kinds[0], TokenKind::Word));
        assert!(matches!(kinds[1], TokenKind::OpenBrace));
        assert!(matches!(kinds[2], TokenKind::Newline));
        assert!(matches!(kinds[3], TokenKind::Word));
        assert!(matches!(kinds[4], TokenKind::Newline));
        assert!(matches!(kinds[5], TokenKind::CloseBrace));
    }

    // Quoted strings keep their content without the surrounding quotes.
    #[test]
    fn quoted_string() {
        let tokens = tokenize(r#"header "X-Frame-Options" "DENY""#).expect("should tokenize");
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[1].kind, TokenKind::QuotedString));
        assert_eq!(tokens[1].text, "X-Frame-Options");
        assert_eq!(tokens[2].text, "DENY");
    }

    // \" escapes produce literal quotes inside the string.
    #[test]
    fn quoted_string_with_escapes() {
        let tokens = tokenize(r#""hello \"world\"""#).expect("should tokenize");
        assert_eq!(tokens[0].text, r#"hello "world""#);
    }

    // Backtick strings are taken verbatim.
    #[test]
    fn backtick_string() {
        let tokens = tokenize("`raw string`").expect("should tokenize");
        assert!(matches!(tokens[0].kind, TokenKind::BacktickString));
        assert_eq!(tokens[0].text, "raw string");
    }

    // Comments run from '#' to end of line and include the '#'.
    #[test]
    fn comment() {
        let tokens = tokenize("log # access log\nfile_server").expect("should tokenize");
        assert_eq!(tokens[1].kind, TokenKind::Comment);
        assert_eq!(tokens[1].text, "# access log");
    }

    // "{$NAME}" lexes as an env-var placeholder without a default.
    #[test]
    fn env_var() {
        let tokens = tokenize("{$API_KEY}").expect("should tokenize");
        assert!(matches!(
            &tokens[0].kind,
            TokenKind::EnvVar { name, default: None }
            if name == "API_KEY"
        ));
    }

    // "{$NAME:default}" carries the default value after the colon.
    #[test]
    fn env_var_with_default() {
        let tokens = tokenize("{$PORT:8080}").expect("should tokenize");
        assert!(matches!(
            &tokens[0].kind,
            TokenKind::EnvVar {
                name,
                default: Some(def)
            }
            if name == "PORT" && def == "8080"
        ));
    }

    // Heredoc body excludes the marker lines and the final newline.
    #[test]
    fn heredoc() {
        let input = "respond <<EOF\nHello World\nEOF\n";
        let tokens = tokenize(input).expect("should tokenize");
        assert_eq!(tokens[0].text, "respond");
        assert!(matches!(
            &tokens[1].kind,
            TokenKind::Heredoc { marker }
            if marker == "EOF"
        ));
        assert_eq!(tokens[1].text, "Hello World");
    }

    // An unclosed quote reports UnterminatedString.
    #[test]
    fn unterminated_quote() {
        let result = tokenize("\"unclosed");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind, LexErrorKind::UnterminatedString);
    }

    // Backslash-newline joins two physical lines without a Newline token.
    #[test]
    fn line_continuation() {
        let tokens = tokenize("reverse_proxy \\\napp:3000").expect("should tokenize");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "reverse_proxy");
        assert_eq!(tokens[1].text, "app:3000");
    }

    // A leading UTF-8 BOM is skipped before lexing starts.
    #[test]
    fn bom_stripping() {
        let input = "\u{FEFF}example.com";
        let tokens = tokenize(input).expect("should tokenize");
        assert_eq!(tokens[0].text, "example.com");
    }

    // Backslash-escaped braces stay inside the word, backslashes kept.
    #[test]
    fn escaped_braces() {
        let tokens = tokenize(r"respond \{hello\}").expect("should tokenize");
        assert_eq!(tokens[0].text, "respond");
        assert_eq!(tokens[1].text, r"\{hello\}");
    }

    // Spans are 1-based and track both line and column.
    #[test]
    fn span_tracking() {
        let tokens = tokenize("a\nb c").expect("should tokenize");
        assert_eq!(tokens[0].span.line, 1);
        assert_eq!(tokens[0].span.column, 1);
        assert_eq!(tokens[2].span.line, 2);
        assert_eq!(tokens[2].span.column, 1);
        assert_eq!(tokens[3].span.line, 2);
        assert_eq!(tokens[3].span.column, 3);
    }
}