styx_parse/
lexer.rs

//! Lexer for the Styx configuration language.
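//!
//! A minimal usage sketch (illustrative only; it assumes `Lexer` and
//! `TokenKind` are re-exported from the crate root, matching the imports
//! below):
//!
//! ```ignore
//! use styx_parse::{Lexer, TokenKind};
//!
//! let mut lexer = Lexer::new("host localhost");
//! assert_eq!(lexer.next_token().kind, TokenKind::BareScalar);
//! ```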

#[allow(unused_imports)]
use crate::trace;
use crate::{Span, Token, TokenKind};

/// A lexer that produces tokens from Styx source text.
#[derive(Clone)]
pub struct Lexer<'src> {
    /// The source text being lexed.
    source: &'src str,
    /// The remaining source text (suffix of `source`).
    remaining: &'src str,
    /// Current byte position in `source`.
    pos: u32,

    /// State for heredoc parsing.
    heredoc_state: Option<HeredocState>,
}

/// State for tracking heredoc parsing.
#[derive(Debug, Clone)]
struct HeredocState {
    /// The delimiter to match (e.g., "EOF" for `<<EOF`).
    delimiter: String,
}

impl<'src> Lexer<'src> {
    /// Create a new lexer for the given source text.
    pub fn new(source: &'src str) -> Self {
        Self {
            source,
            remaining: source,
            pos: 0,
            heredoc_state: None,
        }
    }

    /// Get the current byte position.
    #[inline]
    pub fn position(&self) -> u32 {
        self.pos
    }

    /// Check if we're at the end of input.
    #[inline]
    pub fn is_eof(&self) -> bool {
        self.remaining.is_empty()
    }

    /// Peek at the next character without consuming it.
    #[inline]
    fn peek(&self) -> Option<char> {
        self.remaining.chars().next()
    }

    /// Peek at the nth character (0-indexed) without consuming.
    #[inline]
    fn peek_nth(&self, n: usize) -> Option<char> {
        self.remaining.chars().nth(n)
    }

    /// Advance by one character and return it.
    #[inline]
    fn advance(&mut self) -> Option<char> {
        let c = self.peek()?;
        self.pos += c.len_utf8() as u32;
        self.remaining = &self.remaining[c.len_utf8()..];
        Some(c)
    }

    /// Advance by n bytes.
    #[inline]
    fn advance_by(&mut self, n: usize) {
        self.pos += n as u32;
        self.remaining = &self.remaining[n..];
    }

    /// Check if the remaining text starts with the given prefix.
    #[inline]
    fn starts_with(&self, prefix: &str) -> bool {
        self.remaining.starts_with(prefix)
    }

    /// Create a token from the given start position to the current position.
    fn token(&self, kind: TokenKind, start: u32) -> Token<'src> {
        let span = Span::new(start, self.pos);
        let text = &self.source[start as usize..self.pos as usize];
        trace!("Token {:?} at {:?}: {:?}", kind, span, text);
        Token::new(kind, span, text)
    }

    /// Get the next token.
    pub fn next_token(&mut self) -> Token<'src> {
        // Handle heredoc content if we're inside one
        if let Some(state) = &self.heredoc_state {
            let delimiter = state.delimiter.clone();
            return self.lex_heredoc_content(&delimiter);
        }

        // Check for EOF
        if self.is_eof() {
            return self.token(TokenKind::Eof, self.pos);
        }

        let start = self.pos;
        let c = self.peek().unwrap();

        match c {
            // Structural tokens
            '{' => {
                self.advance();
                self.token(TokenKind::LBrace, start)
            }
            '}' => {
                self.advance();
                self.token(TokenKind::RBrace, start)
            }
            '(' => {
                self.advance();
                self.token(TokenKind::LParen, start)
            }
            ')' => {
                self.advance();
                self.token(TokenKind::RParen, start)
            }
            ',' => {
                self.advance();
                self.token(TokenKind::Comma, start)
            }
            '>' => {
                self.advance();
                self.token(TokenKind::Gt, start)
            }
            '@' => {
                self.advance();
                self.token(TokenKind::At, start)
            }

            // Quoted scalar
            '"' => self.lex_quoted_scalar(),

            // Comment or doc comment
            '/' if self.starts_with("///") => self.lex_doc_comment(),
            '/' if self.starts_with("//") => self.lex_line_comment(),
            // Single / is a bare scalar (e.g., /usr/bin/foo)
            '/' => self.lex_bare_scalar(),

            // Heredoc - only if << is followed by an uppercase letter
            '<' if self.starts_with("<<")
                && matches!(self.peek_nth(2), Some(c) if c.is_ascii_uppercase()) =>
            {
                self.lex_heredoc_start()
            }
            // << not followed by an uppercase letter is an error
            // parser[impl scalar.heredoc.invalid]
            '<' if self.starts_with("<<") => {
                self.advance(); // <
                self.advance(); // <
                self.token(TokenKind::Error, start)
            }

            // Raw string
            'r' if matches!(self.peek_nth(1), Some('#' | '"')) => self.lex_raw_string(),

            // Whitespace
            ' ' | '\t' => self.lex_whitespace(),

            // Newline
            '\n' => {
                self.advance();
                self.token(TokenKind::Newline, start)
            }
            '\r' if self.peek_nth(1) == Some('\n') => {
                self.advance();
                self.advance();
                self.token(TokenKind::Newline, start)
            }

            // Bare scalar (default for anything else that's not a special char)
            _ if is_bare_scalar_start(c) => self.lex_bare_scalar(),

            // Error: unrecognized character
            _ => {
                self.advance();
                self.token(TokenKind::Error, start)
            }
        }
    }

    /// Lex horizontal whitespace (spaces and tabs).
    fn lex_whitespace(&mut self) -> Token<'src> {
        let start = self.pos;
        while let Some(c) = self.peek() {
            if c == ' ' || c == '\t' {
                self.advance();
            } else {
                break;
            }
        }
        self.token(TokenKind::Whitespace, start)
    }

    /// Lex a bare (unquoted) scalar.
    fn lex_bare_scalar(&mut self) -> Token<'src> {
        let start = self.pos;
        while let Some(c) = self.peek() {
            if is_bare_scalar_char(c) {
                self.advance();
            } else {
                break;
            }
        }
        self.token(TokenKind::BareScalar, start)
    }

    /// Lex a quoted scalar: `"..."`.
    fn lex_quoted_scalar(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume opening quote
        self.advance();

        loop {
            match self.peek() {
                None => {
                    // Unterminated string - return error
                    return self.token(TokenKind::Error, start);
                }
                Some('"') => {
                    self.advance();
                    break;
                }
                Some('\\') => {
                    // Escape sequence - consume backslash and next char
                    self.advance();
                    if self.peek().is_some() {
                        self.advance();
                    }
                }
                Some(_) => {
                    self.advance();
                }
            }
        }

        self.token(TokenKind::QuotedScalar, start)
    }

    // parser[impl comment.line]
    /// Lex a line comment: `// ...`.
    fn lex_line_comment(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume `//`
        self.advance();
        self.advance();

        // Consume until end of line
        while let Some(c) = self.peek() {
            if c == '\n' || c == '\r' {
                break;
            }
            self.advance();
        }

        self.token(TokenKind::LineComment, start)
    }

    /// Lex a doc comment: `/// ...`.
    fn lex_doc_comment(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume `///`
        self.advance();
        self.advance();
        self.advance();

        // Consume until end of line
        while let Some(c) = self.peek() {
            if c == '\n' || c == '\r' {
                break;
            }
            self.advance();
        }

        self.token(TokenKind::DocComment, start)
    }

    /// Lex a heredoc start: `<<DELIM`.
    ///
    /// Per parser[scalar.heredoc.syntax]: the delimiter MUST match `[A-Z][A-Z0-9_]*`
    /// and not exceed 16 characters.
    // parser[impl scalar.heredoc.syntax]
    fn lex_heredoc_start(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume `<<`
        self.advance();
        self.advance();

        let delim_start = self.pos as usize;

        // First char MUST be an uppercase letter
        match self.peek() {
            Some(c) if c.is_ascii_uppercase() => {
                self.advance();
            }
            _ => {
                // Invalid delimiter - first char is not an uppercase letter.
                // Consume any remaining delimiter-like chars for error recovery
                while let Some(c) = self.peek() {
                    if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                return self.token(TokenKind::Error, start);
            }
        }

        // Rest: uppercase, digit, or underscore
        while let Some(c) = self.peek() {
            if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let delimiter = &self.source[delim_start..self.pos as usize];

        // Check length <= 16
        if delimiter.len() > 16 {
            return self.token(TokenKind::Error, start);
        }

        // Consume optional language hint: ,lang where lang matches [a-z][a-z0-9_.-]*
        // parser[impl scalar.heredoc.lang]
        // The language hint is metadata and does not affect the scalar content.
        if self.peek() == Some(',') {
            self.advance(); // consume ','
            // First char must be a lowercase letter
            if let Some(c) = self.peek()
                && c.is_ascii_lowercase()
            {
                self.advance();
                // Rest: lowercase, digit, underscore, dot, hyphen
                while let Some(c) = self.peek() {
                    if c.is_ascii_lowercase()
                        || c.is_ascii_digit()
                        || c == '_'
                        || c == '.'
                        || c == '-'
                    {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
        }

        // Consume newline after delimiter (and optional lang hint)
        if self.peek() == Some('\r') {
            self.advance();
        }
        if self.peek() == Some('\n') {
            self.advance();
        }

        // Set state for heredoc content
        self.heredoc_state = Some(HeredocState {
            delimiter: delimiter.to_string(),
        });

        self.token(TokenKind::HeredocStart, start)
    }

    /// Check if the remaining input starts with the heredoc delimiter (possibly indented).
    /// Returns Some(indent_len) if found, where indent_len is the number of leading spaces/tabs.
    /// The delimiter must be followed by a newline or EOF to be valid.
    fn find_heredoc_delimiter(&self, delimiter: &str) -> Option<usize> {
        // Count leading whitespace
        let indent_len = self
            .remaining
            .chars()
            .take_while(|c| *c == ' ' || *c == '\t')
            .count();

        // Check if delimiter follows the whitespace
        let after_indent = &self.remaining[indent_len..];
        if let Some(after_delim) = after_indent.strip_prefix(delimiter)
            && (after_delim.is_empty()
                || after_delim.starts_with('\n')
                || after_delim.starts_with("\r\n"))
        {
            return Some(indent_len);
        }
        None
    }

    /// Lex heredoc content until we find the closing delimiter.
    /// Per parser[scalar.heredoc.syntax]: the closing delimiter line MAY be indented;
    /// that indentation is consumed into the `HeredocEnd` token so later stages can
    /// strip it from the content lines.
    fn lex_heredoc_content(&mut self, delimiter: &str) -> Token<'src> {
        let start = self.pos;

        // Check if we're at the delimiter (possibly indented) - end of heredoc
        if let Some(indent_len) = self.find_heredoc_delimiter(delimiter) {
            // This is the end delimiter - consume indent + delimiter
            self.advance_by(indent_len + delimiter.len());
            self.heredoc_state = None;
            return self.token(TokenKind::HeredocEnd, start);
        }

        // Consume content until we find the delimiter at start of a line (possibly indented)
        let mut found_end = false;
        while !self.is_eof() {
            // Consume the current line
            while let Some(c) = self.peek() {
                if c == '\n' {
                    self.advance();
                    break;
                } else if c == '\r' && self.peek_nth(1) == Some('\n') {
                    self.advance();
                    self.advance();
                    break;
                }
                self.advance();
            }

            // Check if next line starts with delimiter (possibly indented)
            if self.find_heredoc_delimiter(delimiter).is_some() {
                found_end = true;
                break;
            }

            if self.is_eof() {
                break;
            }
        }

        if start == self.pos
            && found_end
            && let Some(indent_len) = self.find_heredoc_delimiter(delimiter)
        {
            // No content, return the end delimiter
            self.advance_by(indent_len + delimiter.len());
            self.heredoc_state = None;
            return self.token(TokenKind::HeredocEnd, start);
        }

        // CRITICAL: If we hit EOF without finding the closing delimiter,
        // we must clear the heredoc state to avoid an infinite loop.
        // The next call would otherwise re-enter lex_heredoc_content forever.
        if self.is_eof() && !found_end {
            self.heredoc_state = None;
            return self.token(TokenKind::Error, start);
        }

        self.token(TokenKind::HeredocContent, start)
    }

    // parser[impl scalar.raw.syntax]
    /// Lex a raw string: `r#*"..."#*`.
    /// Returns the entire raw string including delimiters.
    fn lex_raw_string(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume `r`
        self.advance();

        // Count and consume `#` marks
        let mut hash_count: u8 = 0;
        while self.peek() == Some('#') {
            hash_count = hash_count.saturating_add(1);
            self.advance();
        }

        // Consume opening `"`
        if self.peek() == Some('"') {
            self.advance();
        } else {
            // Invalid raw string - no opening quote
            return self.token(TokenKind::Error, start);
        }

        // Consume content until we find the closing `"#*`
        loop {
            match self.peek() {
                None => {
                    // Unterminated raw string - return error
                    return self.token(TokenKind::Error, start);
                }
                Some('"') => {
                    // Check for closing sequence
                    let mut matched_hashes = 0u8;
                    let mut lookahead = 1;
                    while matched_hashes < hash_count {
                        if self.peek_nth(lookahead) == Some('#') {
                            matched_hashes += 1;
                            lookahead += 1;
                        } else {
                            break;
                        }
                    }

                    if matched_hashes == hash_count {
                        // Found the closing delimiter - consume it
                        self.advance(); // consume `"`
                        for _ in 0..hash_count {
                            self.advance(); // consume `#`s
                        }
                        // Return token with full text including delimiters
                        return self.token(TokenKind::RawScalar, start);
                    } else {
                        // Not a closing delimiter, consume the `"` as content
                        self.advance();
                    }
                }
                Some(_) => {
                    self.advance();
                }
            }
        }
    }
}

impl<'src> Iterator for Lexer<'src> {
    type Item = Token<'src>;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();
        if token.kind == TokenKind::Eof {
            None
        } else {
            Some(token)
        }
    }
}

// parser[impl scalar.bare.chars]
/// Check if a character can start a bare scalar.
fn is_bare_scalar_start(c: char) -> bool {
    // Cannot be special chars, whitespace, or `/` (to avoid confusion with comments)
    // `=` and `@` are allowed after first char but not at start
    !matches!(c, '{' | '}' | '(' | ')' | ',' | '"' | '=' | '@' | '>' | '/') && !c.is_whitespace()
}

// parser[impl scalar.bare.chars]
/// Check if a character can continue a bare scalar.
fn is_bare_scalar_char(c: char) -> bool {
    // Cannot be special chars or whitespace
    // `/`, `@`, and `=` are allowed after the first char
    // `>` is never allowed (attribute separator)
    !matches!(c, '{' | '}' | '(' | ')' | ',' | '"' | '>') && !c.is_whitespace()
}

#[cfg(test)]
mod tests {
    use super::*;

    fn lex(source: &str) -> Vec<(TokenKind, &str)> {
        Lexer::new(source).map(|t| (t.kind, t.text)).collect()
    }
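
    // Added coverage sketch (not in the original suite): `advance` moves `pos`
    // by each character's UTF-8 byte length, so a multi-byte character advances
    // the position by more than one.
    #[test]
    fn test_position_tracks_bytes() {
        let mut lexer = Lexer::new("é");
        assert_eq!(lexer.position(), 0);
        let _ = lexer.next_token(); // 'é' lexes as a bare scalar
        assert_eq!(lexer.position(), 2); // 'é' is 2 bytes in UTF-8
        assert!(lexer.is_eof());
    }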

    #[test]
    fn test_structural_tokens() {
        assert_eq!(lex("{"), vec![(TokenKind::LBrace, "{")]);
        assert_eq!(lex("}"), vec![(TokenKind::RBrace, "}")]);
        assert_eq!(lex("("), vec![(TokenKind::LParen, "(")]);
        assert_eq!(lex(")"), vec![(TokenKind::RParen, ")")]);
        assert_eq!(lex(","), vec![(TokenKind::Comma, ",")]);
        assert_eq!(lex(">"), vec![(TokenKind::Gt, ">")]);
        assert_eq!(lex("@"), vec![(TokenKind::At, "@")]);
    }

    #[test]
    fn test_bare_scalar() {
        assert_eq!(lex("hello"), vec![(TokenKind::BareScalar, "hello")]);
        assert_eq!(lex("42"), vec![(TokenKind::BareScalar, "42")]);
        assert_eq!(lex("true"), vec![(TokenKind::BareScalar, "true")]);
        assert_eq!(
            lex("https://example.com/path"),
            vec![(TokenKind::BareScalar, "https://example.com/path")]
        );
    }

    #[test]
    fn test_quoted_scalar() {
        assert_eq!(
            lex(r#""hello world""#),
            vec![(TokenKind::QuotedScalar, r#""hello world""#)]
        );
        assert_eq!(
            lex(r#""with \"escapes\"""#),
            vec![(TokenKind::QuotedScalar, r#""with \"escapes\"""#)]
        );
    }

    #[test]
    fn test_raw_scalar() {
        // Raw scalar tokens include the full text with delimiters (for a lossless CST)
        assert_eq!(
            lex(r#"r"hello""#),
            vec![(TokenKind::RawScalar, r#"r"hello""#)]
        );
        assert_eq!(
            lex(r##"r#"hello"#"##),
            vec![(TokenKind::RawScalar, r##"r#"hello"#"##)]
        );
    }
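
    // Added raw-string coverage sketch, derived from lex_raw_string above: a `"`
    // inside r#"..."# is content unless followed by a matching number of `#`
    // marks, and a missing closing delimiter yields an Error token.
    #[test]
    fn test_raw_scalar_edge_cases() {
        assert_eq!(
            lex(r##"r#"say "hi""#"##),
            vec![(TokenKind::RawScalar, r##"r#"say "hi""#"##)]
        );
        assert!(lex(r#"r"oops"#).iter().any(|t| t.0 == TokenKind::Error));
    }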

    #[test]
    fn test_comments() {
        assert_eq!(
            lex("// comment"),
            vec![(TokenKind::LineComment, "// comment")]
        );
        assert_eq!(lex("/// doc"), vec![(TokenKind::DocComment, "/// doc")]);
    }

    #[test]
    fn test_whitespace() {
        assert_eq!(lex("  \t"), vec![(TokenKind::Whitespace, "  \t")]);
        assert_eq!(lex("\n"), vec![(TokenKind::Newline, "\n")]);
        assert_eq!(lex("\r\n"), vec![(TokenKind::Newline, "\r\n")]);
    }

    #[test]
    fn test_mixed() {
        let tokens = lex("{host localhost}");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::LBrace, "{"),
                (TokenKind::BareScalar, "host"),
                (TokenKind::Whitespace, " "),
                (TokenKind::BareScalar, "localhost"),
                (TokenKind::RBrace, "}"),
            ]
        );
    }

    #[test]
    fn test_heredoc() {
        let tokens = lex("<<EOF\nhello\nworld\nEOF");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::HeredocStart, "<<EOF\n"),
                (TokenKind::HeredocContent, "hello\nworld\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }
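
    // Added follow-ups to the basic heredoc test, based on lex_heredoc_content
    // above: an empty heredoc emits no HeredocContent token, and CRLF line
    // endings are accepted after the delimiter line.
    #[test]
    fn test_heredoc_empty_and_crlf() {
        assert_eq!(
            lex("<<EOF\nEOF"),
            vec![
                (TokenKind::HeredocStart, "<<EOF\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
        assert_eq!(
            lex("<<EOF\r\nx\r\nEOF"),
            vec![
                (TokenKind::HeredocStart, "<<EOF\r\n"),
                (TokenKind::HeredocContent, "x\r\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }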

    // parser[verify scalar.heredoc.syntax]
    #[test]
    fn test_heredoc_valid_delimiters() {
        // Single uppercase letter
        assert!(lex("<<A\nx\nA").iter().all(|t| t.0 != TokenKind::Error));
        // Multiple uppercase letters
        assert!(lex("<<EOF\nx\nEOF").iter().all(|t| t.0 != TokenKind::Error));
        // With digits after first char
        assert!(
            lex("<<MY123\nx\nMY123")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        // With underscores
        assert!(
            lex("<<MY_DELIM\nx\nMY_DELIM")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        // 16 chars (max allowed)
        assert!(
            lex("<<ABCDEFGHIJKLMNOP\nx\nABCDEFGHIJKLMNOP")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
    }
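
    // parser[verify scalar.heredoc.syntax] -- added sketch of the indented
    // closing delimiter path: per find_heredoc_delimiter above, the indent is
    // consumed as part of the HeredocEnd token.
    #[test]
    fn test_heredoc_indented_end() {
        assert_eq!(
            lex("<<EOF\nhello\n  EOF"),
            vec![
                (TokenKind::HeredocStart, "<<EOF\n"),
                (TokenKind::HeredocContent, "hello\n"),
                (TokenKind::HeredocEnd, "  EOF"),
            ]
        );
    }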

    // parser[verify scalar.heredoc.syntax]
    #[test]
    fn test_heredoc_must_start_uppercase() {
        // Starts with digit - error
        assert!(lex("<<123FOO").iter().any(|t| t.0 == TokenKind::Error));
        // Starts with underscore - error
        assert!(lex("<<_FOO").iter().any(|t| t.0 == TokenKind::Error));
        // Lowercase - not recognized as a heredoc at all
        let tokens = lex("<<foo");
        // This lexes as an Error token for `<<` followed by the bare scalar "foo"
        assert!(!tokens.iter().any(|t| t.0 == TokenKind::HeredocStart));
    }

    // parser[verify scalar.heredoc.syntax]
    #[test]
    fn test_heredoc_max_16_chars() {
        // 17 chars - error
        assert!(
            lex("<<ABCDEFGHIJKLMNOPQ\nx\nABCDEFGHIJKLMNOPQ")
                .iter()
                .any(|t| t.0 == TokenKind::Error)
        );
    }
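
    // parser[verify scalar.heredoc.lang] -- added sketch (tag follows this
    // file's existing convention): the optional `,lang` hint is consumed into
    // the HeredocStart token and leaves the content untouched.
    #[test]
    fn test_heredoc_lang_hint() {
        assert_eq!(
            lex("<<EOF,rust\nfn x() {}\nEOF"),
            vec![
                (TokenKind::HeredocStart, "<<EOF,rust\n"),
                (TokenKind::HeredocContent, "fn x() {}\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }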

    #[test]
    fn test_slash_in_bare_scalar() {
        // Single slash followed by text should be a bare scalar
        let tokens = lex("/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/foo")]);

        // Path-like value
        let tokens = lex("/usr/bin/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/usr/bin/foo")]);

        // But // is still a comment
        let tokens = lex("// comment");
        assert_eq!(tokens, vec![(TokenKind::LineComment, "// comment")]);
    }
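
    // parser[verify scalar.bare.chars] -- added sketch: `@` and `=` are
    // rejected by is_bare_scalar_start but accepted mid-scalar by
    // is_bare_scalar_char.
    #[test]
    fn test_at_and_equals_in_bare_scalar() {
        assert_eq!(lex("user@host"), vec![(TokenKind::BareScalar, "user@host")]);
        assert_eq!(lex("key=value"), vec![(TokenKind::BareScalar, "key=value")]);
        // A leading `@` is the At token instead
        assert_eq!(lex("@tag")[0], (TokenKind::At, "@"));
    }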

    #[test]
    fn test_unterminated_heredoc() {
        // Heredoc without closing delimiter should be an error
        let tokens = lex("<<EOF\nhello world\n");
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated heredoc"
        );
    }

    #[test]
    fn test_unterminated_string() {
        // String without closing quote should be an error
        let tokens = lex("\"hello");
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated string"
        );
    }
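
    // Added sketch: characters excluded by is_bare_scalar_start and not handled
    // by any match arm (e.g. a leading `=`) fall through to an Error token.
    #[test]
    fn test_unrecognized_start_char() {
        let tokens = lex("=x");
        assert_eq!(tokens[0], (TokenKind::Error, "="));
        assert_eq!(tokens[1], (TokenKind::BareScalar, "x"));
    }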
}