// caddyfile_rs/lexer.rs
1//! Tokenizer that converts raw Caddyfile source text into a stream of tokens.
2//!
3//! Handles strings, braces, comments, and whitespace-delimited words.
4
5use std::fmt;
6
7use crate::token::{Span, Token, TokenKind};
8
/// Classifies a lexer error.
///
/// Carried inside [`LexError`] together with the [`Span`] at which the
/// problem was detected. Rendered for humans via its `Display` impl.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// Unterminated double-quoted string.
    UnterminatedString,
    /// Unterminated backtick string.
    UnterminatedBacktick,
    /// Unterminated heredoc (closing marker never found).
    UnterminatedHeredoc { marker: String },
    /// Heredoc marker is empty (`<<` followed by whitespace).
    EmptyHeredocMarker,
    /// Byte that cannot start any token.
    UnexpectedCharacter(char),
}
23
24impl fmt::Display for LexErrorKind {
25    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
26        match self {
27            Self::UnterminatedString => {
28                write!(f, "unterminated quoted string")
29            }
30            Self::UnterminatedBacktick => {
31                write!(f, "unterminated backtick string")
32            }
33            Self::UnterminatedHeredoc { marker } => {
34                write!(
35                    f,
36                    "unterminated heredoc, \
37                     expected closing marker: {marker}"
38                )
39            }
40            Self::EmptyHeredocMarker => {
41                write!(f, "empty heredoc marker")
42            }
43            Self::UnexpectedCharacter(ch) => {
44                write!(f, "unexpected character: {ch}")
45            }
46        }
47    }
48}
49
/// Error produced during lexing.
///
/// Displays as `<kind> at line <line>, column <column>` (both 1-based).
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[error("{kind} at line {}, column {}", span.line, span.column)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Where in the source the error was detected.
    pub span: Span,
}
57
58/// Tokenize a Caddyfile source string into a sequence of tokens.
59///
60/// # Errors
61///
62/// Returns `LexError` on unterminated strings, invalid heredocs,
63/// or other lexical errors.
64pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
65    Lexer::new(input).tokenize()
66}
67
/// Internal cursor-based scanner over the raw input bytes.
struct Lexer<'a> {
    // Raw source bytes of the input string.
    input: &'a [u8],
    // Byte offset of the next unread byte.
    pos: usize,
    // 1-based line number of the cursor.
    line: usize,
    // 1-based column of the cursor (counted in bytes).
    col: usize,
}
74
75impl<'a> Lexer<'a> {
76    fn new(input: &'a str) -> Self {
77        let bytes = input.as_bytes();
78        let start = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
79            3
80        } else {
81            0
82        };
83        Self {
84            input: bytes,
85            pos: start,
86            line: 1,
87            col: 1,
88        }
89    }
90
91    fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
92        let mut tokens = Vec::new();
93
94        while self.pos < self.input.len() {
95            let ch = self.input[self.pos];
96
97            match ch {
98                b'\n' => {
99                    tokens.push(self.make_token(TokenKind::Newline, "\n".to_string()));
100                    self.advance();
101                }
102                b'\r' => {
103                    self.advance();
104                    if self.peek() == Some(b'\n') {
105                        self.advance();
106                    }
107                    tokens.push(Self::make_token_at(
108                        TokenKind::Newline,
109                        "\n".to_string(),
110                        self.line - 1,
111                        self.col,
112                    ));
113                }
114                b' ' | b'\t' => {
115                    self.advance();
116                }
117                b'#' => {
118                    tokens.push(self.read_comment());
119                }
120                b'{' => {
121                    if self.try_read_env_var(&mut tokens) {
122                        // consumed as env var
123                    } else {
124                        tokens.push(self.make_token(TokenKind::OpenBrace, "{".to_string()));
125                        self.advance();
126                    }
127                }
128                b'}' => {
129                    tokens.push(self.make_token(TokenKind::CloseBrace, "}".to_string()));
130                    self.advance();
131                }
132                b'"' => {
133                    tokens.push(self.read_quoted_string()?);
134                }
135                b'`' => {
136                    tokens.push(self.read_backtick_string()?);
137                }
138                b'\\' if self.peek_at(1) == Some(b'\n') => {
139                    // line continuation
140                    self.advance(); // skip backslash
141                    self.advance(); // skip newline
142                }
143                b'\\' if self.peek_at(1) == Some(b'\r') => {
144                    self.advance();
145                    self.advance();
146                    if self.peek() == Some(b'\n') {
147                        self.advance();
148                    }
149                }
150                _ => {
151                    tokens.push(self.read_word()?);
152                }
153            }
154        }
155
156        Ok(tokens)
157    }
158
159    const fn span(&self) -> Span {
160        Span {
161            line: self.line,
162            column: self.col,
163        }
164    }
165
166    const fn make_token(&self, kind: TokenKind, text: String) -> Token {
167        Token {
168            kind,
169            text,
170            span: self.span(),
171        }
172    }
173
174    const fn make_token_at(kind: TokenKind, text: String, line: usize, col: usize) -> Token {
175        Token {
176            kind,
177            text,
178            span: Span { line, column: col },
179        }
180    }
181
182    fn peek(&self) -> Option<u8> {
183        self.input.get(self.pos).copied()
184    }
185
186    fn peek_at(&self, offset: usize) -> Option<u8> {
187        self.input.get(self.pos + offset).copied()
188    }
189
190    fn advance(&mut self) {
191        if self.pos < self.input.len() {
192            if self.input[self.pos] == b'\n' {
193                self.line += 1;
194                self.col = 1;
195            } else {
196                self.col += 1;
197            }
198            self.pos += 1;
199        }
200    }
201
202    fn read_comment(&mut self) -> Token {
203        let start_line = self.line;
204        let start_col = self.col;
205        let start = self.pos;
206
207        while self.pos < self.input.len() && self.input[self.pos] != b'\n' {
208            self.pos += 1;
209            self.col += 1;
210        }
211
212        let text = String::from_utf8_lossy(&self.input[start..self.pos]).into_owned();
213
214        Token {
215            kind: TokenKind::Comment,
216            text,
217            span: Span {
218                line: start_line,
219                column: start_col,
220            },
221        }
222    }
223
224    fn read_quoted_string(&mut self) -> Result<Token, LexError> {
225        let start_line = self.line;
226        let start_col = self.col;
227        self.advance(); // skip opening quote
228
229        let mut value = String::new();
230        loop {
231            match self.peek() {
232                None => {
233                    return Err(LexError {
234                        kind: LexErrorKind::UnterminatedString,
235                        span: Span {
236                            line: start_line,
237                            column: start_col,
238                        },
239                    });
240                }
241                Some(b'\\') => {
242                    self.advance();
243                    match self.peek() {
244                        Some(b'n') => {
245                            value.push('\n');
246                            self.advance();
247                        }
248                        Some(b't') => {
249                            value.push('\t');
250                            self.advance();
251                        }
252                        Some(b'r') => {
253                            value.push('\r');
254                            self.advance();
255                        }
256                        Some(b'"') => {
257                            value.push('"');
258                            self.advance();
259                        }
260                        Some(b'\\') => {
261                            value.push('\\');
262                            self.advance();
263                        }
264                        Some(c) => {
265                            value.push('\\');
266                            value.push(char::from(c));
267                            self.advance();
268                        }
269                        None => {
270                            value.push('\\');
271                        }
272                    }
273                }
274                Some(b'"') => {
275                    self.advance();
276                    break;
277                }
278                Some(c) => {
279                    if c == b'\n' {
280                        // track newlines inside strings
281                        self.advance();
282                        value.push('\n');
283                    } else {
284                        value.push(char::from(c));
285                        self.advance();
286                    }
287                }
288            }
289        }
290
291        Ok(Token {
292            kind: TokenKind::QuotedString,
293            text: value,
294            span: Span {
295                line: start_line,
296                column: start_col,
297            },
298        })
299    }
300
301    fn read_backtick_string(&mut self) -> Result<Token, LexError> {
302        let start_line = self.line;
303        let start_col = self.col;
304        self.advance(); // skip opening backtick
305
306        let mut value = String::new();
307        loop {
308            match self.peek() {
309                None => {
310                    return Err(LexError {
311                        kind: LexErrorKind::UnterminatedBacktick,
312                        span: Span {
313                            line: start_line,
314                            column: start_col,
315                        },
316                    });
317                }
318                Some(b'`') => {
319                    self.advance();
320                    break;
321                }
322                Some(c) => {
323                    if c == b'\n' {
324                        self.advance();
325                        value.push('\n');
326                    } else {
327                        value.push(char::from(c));
328                        self.advance();
329                    }
330                }
331            }
332        }
333
334        Ok(Token {
335            kind: TokenKind::BacktickString,
336            text: value,
337            span: Span {
338                line: start_line,
339                column: start_col,
340            },
341        })
342    }
343
344    fn try_read_env_var(&mut self, tokens: &mut Vec<Token>) -> bool {
345        // Check for {$ pattern
346        if self.peek_at(1) != Some(b'$') {
347            return false;
348        }
349
350        let start_line = self.line;
351        let start_col = self.col;
352        let save_pos = self.pos;
353        let save_line = self.line;
354        let save_col = self.col;
355
356        self.advance(); // skip {
357        self.advance(); // skip $
358
359        let name_start = self.pos;
360        while self.pos < self.input.len()
361            && self.input[self.pos] != b'}'
362            && self.input[self.pos] != b':'
363            && self.input[self.pos] != b'\n'
364        {
365            self.pos += 1;
366            self.col += 1;
367        }
368        let name = String::from_utf8_lossy(&self.input[name_start..self.pos]).into_owned();
369
370        let default = if self.peek() == Some(b':') {
371            self.pos += 1;
372            self.col += 1;
373            let def_start = self.pos;
374            while self.pos < self.input.len()
375                && self.input[self.pos] != b'}'
376                && self.input[self.pos] != b'\n'
377            {
378                self.pos += 1;
379                self.col += 1;
380            }
381            Some(String::from_utf8_lossy(&self.input[def_start..self.pos]).into_owned())
382        } else {
383            None
384        };
385
386        if self.peek() != Some(b'}') {
387            // Not a valid env var, restore position
388            self.pos = save_pos;
389            self.line = save_line;
390            self.col = save_col;
391            return false;
392        }
393
394        self.pos += 1;
395        self.col += 1;
396
397        let text = String::from_utf8_lossy(&self.input[save_pos..self.pos]).into_owned();
398
399        tokens.push(Token {
400            kind: TokenKind::EnvVar { name, default },
401            text,
402            span: Span {
403                line: start_line,
404                column: start_col,
405            },
406        });
407
408        true
409    }
410
411    fn read_word(&mut self) -> Result<Token, LexError> {
412        let start_line = self.line;
413        let start_col = self.col;
414        let start = self.pos;
415
416        // Check for heredoc marker
417        if self.input[self.pos] == b'<' && self.peek_at(1) == Some(b'<') {
418            return self.read_heredoc(start_line, start_col);
419        }
420
421        while self.pos < self.input.len() {
422            let ch = self.input[self.pos];
423            match ch {
424                b' ' | b'\t' | b'\n' | b'\r' => break,
425                b'{' | b'}' => {
426                    // check for {$ env var or placeholder
427                    if ch == b'{' && self.peek_at(1) == Some(b'$') {
428                        break;
429                    }
430                    // standalone brace at start means it's
431                    // a brace token, not part of a word
432                    if self.pos == start {
433                        break;
434                    }
435                    // otherwise it could be a placeholder like
436                    // {path} inside a word - consume it
437                    self.pos += 1;
438                    self.col += 1;
439                }
440                b'\\' => {
441                    // escaped character
442                    self.pos += 1;
443                    self.col += 1;
444                    if self.pos < self.input.len() {
445                        self.pos += 1;
446                        self.col += 1;
447                    }
448                }
449                _ => {
450                    self.pos += 1;
451                    self.col += 1;
452                }
453            }
454        }
455
456        let text = String::from_utf8_lossy(&self.input[start..self.pos]).into_owned();
457
458        if text.is_empty() {
459            return Err(LexError {
460                kind: LexErrorKind::UnexpectedCharacter(char::from(self.input[start])),
461                span: Span {
462                    line: start_line,
463                    column: start_col,
464                },
465            });
466        }
467
468        Ok(Token {
469            kind: TokenKind::Word,
470            text,
471            span: Span {
472                line: start_line,
473                column: start_col,
474            },
475        })
476    }
477
478    fn read_heredoc(&mut self, start_line: usize, start_col: usize) -> Result<Token, LexError> {
479        self.advance(); // skip first <
480        self.advance(); // skip second <
481
482        // Read marker
483        let marker_start = self.pos;
484        while self.pos < self.input.len()
485            && self.input[self.pos] != b'\n'
486            && self.input[self.pos] != b'\r'
487            && self.input[self.pos] != b' '
488            && self.input[self.pos] != b'\t'
489        {
490            self.pos += 1;
491            self.col += 1;
492        }
493
494        let marker = String::from_utf8_lossy(&self.input[marker_start..self.pos]).into_owned();
495
496        if marker.is_empty() {
497            return Err(LexError {
498                kind: LexErrorKind::EmptyHeredocMarker,
499                span: Span {
500                    line: start_line,
501                    column: start_col,
502                },
503            });
504        }
505
506        // Skip to next line
507        if self.peek() == Some(b'\r') {
508            self.advance();
509        }
510        if self.peek() == Some(b'\n') {
511            self.advance();
512        }
513
514        // Read content until marker on its own line
515        let content_start = self.pos;
516
517        while self.pos < self.input.len() {
518            let line_start = self.pos;
519            // read one line
520            while self.pos < self.input.len() && self.input[self.pos] != b'\n' {
521                self.pos += 1;
522                self.col += 1;
523            }
524
525            let line = String::from_utf8_lossy(&self.input[line_start..self.pos]);
526            let trimmed = line.trim();
527
528            if trimmed == marker {
529                let content =
530                    String::from_utf8_lossy(&self.input[content_start..line_start]).into_owned();
531                // Remove trailing newline from content
532                let content = content
533                    .strip_suffix('\n')
534                    .or_else(|| content.strip_suffix("\r\n"))
535                    .unwrap_or(&content)
536                    .to_string();
537
538                if self.peek() == Some(b'\n') {
539                    self.advance();
540                }
541
542                return Ok(Token {
543                    kind: TokenKind::Heredoc { marker },
544                    text: content,
545                    span: Span {
546                        line: start_line,
547                        column: start_col,
548                    },
549                });
550            }
551
552            if self.peek() == Some(b'\n') {
553                self.advance();
554            }
555        }
556
557        Err(LexError {
558            kind: LexErrorKind::UnterminatedHeredoc { marker },
559            span: Span {
560                line: start_line,
561                column: start_col,
562            },
563        })
564    }
565}
566
#[cfg(test)]
mod tests {
    use super::*;

    // Whitespace-separated bare words become Word tokens.
    #[test]
    fn simple_words() {
        let tokens = tokenize("reverse_proxy app:3000").expect("should tokenize");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "reverse_proxy");
        assert_eq!(tokens[1].text, "app:3000");
    }

    // Braces and newlines each produce their own structural token.
    #[test]
    fn braces_and_newlines() {
        let tokens = tokenize("example.com {\n    log\n}\n").expect("should tokenize");
        let kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert!(matches!(kinds[0], TokenKind::Word));
        assert!(matches!(kinds[1], TokenKind::OpenBrace));
        assert!(matches!(kinds[2], TokenKind::Newline));
        assert!(matches!(kinds[3], TokenKind::Word));
        assert!(matches!(kinds[4], TokenKind::Newline));
        assert!(matches!(kinds[5], TokenKind::CloseBrace));
    }

    // Quoted strings carry their unquoted content as token text.
    #[test]
    fn quoted_string() {
        let tokens = tokenize(r#"header "X-Frame-Options" "DENY""#).expect("should tokenize");
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[1].kind, TokenKind::QuotedString));
        assert_eq!(tokens[1].text, "X-Frame-Options");
        assert_eq!(tokens[2].text, "DENY");
    }

    // Backslash escapes inside quoted strings are decoded.
    #[test]
    fn quoted_string_with_escapes() {
        let tokens = tokenize(r#""hello \"world\"""#).expect("should tokenize");
        assert_eq!(tokens[0].text, r#"hello "world""#);
    }

    // Backtick strings are taken verbatim (no escape processing).
    #[test]
    fn backtick_string() {
        let tokens = tokenize("`raw string`").expect("should tokenize");
        assert!(matches!(tokens[0].kind, TokenKind::BacktickString));
        assert_eq!(tokens[0].text, "raw string");
    }

    // A '#' comment runs to end of line and keeps the '#' in its text.
    #[test]
    fn comment() {
        let tokens = tokenize("log # access log\nfile_server").expect("should tokenize");
        assert_eq!(tokens[1].kind, TokenKind::Comment);
        assert_eq!(tokens[1].text, "# access log");
    }

    // {$NAME} without a default yields EnvVar { default: None }.
    #[test]
    fn env_var() {
        let tokens = tokenize("{$API_KEY}").expect("should tokenize");
        assert!(matches!(
            &tokens[0].kind,
            TokenKind::EnvVar { name, default: None }
            if name == "API_KEY"
        ));
    }

    // {$NAME:value} splits name and default at the first ':'.
    #[test]
    fn env_var_with_default() {
        let tokens = tokenize("{$PORT:8080}").expect("should tokenize");
        assert!(matches!(
            &tokens[0].kind,
            TokenKind::EnvVar {
                name,
                default: Some(def)
            }
            if name == "PORT" && def == "8080"
        ));
    }

    // Heredoc content excludes the marker lines and the trailing newline.
    #[test]
    fn heredoc() {
        let input = "respond <<EOF\nHello World\nEOF\n";
        let tokens = tokenize(input).expect("should tokenize");
        assert_eq!(tokens[0].text, "respond");
        assert!(matches!(
            &tokens[1].kind,
            TokenKind::Heredoc { marker }
            if marker == "EOF"
        ));
        assert_eq!(tokens[1].text, "Hello World");
    }

    // An unclosed quote surfaces as UnterminatedString.
    #[test]
    fn unterminated_quote() {
        let result = tokenize("\"unclosed");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind, LexErrorKind::UnterminatedString);
    }

    // Backslash-newline joins lines without emitting a Newline token.
    #[test]
    fn line_continuation() {
        let tokens = tokenize("reverse_proxy \\\napp:3000").expect("should tokenize");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "reverse_proxy");
        assert_eq!(tokens[1].text, "app:3000");
    }

    // A leading UTF-8 BOM is skipped before the first token.
    #[test]
    fn bom_stripping() {
        let input = "\u{FEFF}example.com";
        let tokens = tokenize(input).expect("should tokenize");
        assert_eq!(tokens[0].text, "example.com");
    }

    // Escaped braces stay inside the word (backslashes retained).
    #[test]
    fn escaped_braces() {
        let tokens = tokenize(r"respond \{hello\}").expect("should tokenize");
        assert_eq!(tokens[0].text, "respond");
        assert_eq!(tokens[1].text, r"\{hello\}");
    }

    // Spans are 1-based and track both line and column.
    #[test]
    fn span_tracking() {
        let tokens = tokenize("a\nb c").expect("should tokenize");
        assert_eq!(tokens[0].span.line, 1);
        assert_eq!(tokens[0].span.column, 1);
        // newline token
        assert_eq!(tokens[2].span.line, 2);
        assert_eq!(tokens[2].span.column, 1);
        assert_eq!(tokens[3].span.line, 2);
        assert_eq!(tokens[3].span.column, 3);
    }
}