#[allow(unused_imports)]
use crate::trace;
use crate::{Span, Token, TokenKind};

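/// A hand-written lexer that walks the source string and produces
/// [`Token`]s one at a time. It keeps a slice of the unread input so
/// token text can be borrowed from the original source without copying.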
#[derive(Clone)]
pub struct Lexer<'src> {
    /// The complete input, kept for slicing out token text.
    source: &'src str,
    /// The unread suffix of `source`.
    remaining: &'src str,
    /// Byte offset of the next unread character.
    pos: u32,

    /// While `Some`, the lexer is inside a heredoc body.
    heredoc_state: Option<HeredocState>,
}

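/// Lexer state while a heredoc is open.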
#[derive(Debug, Clone)]
struct HeredocState {
    /// The delimiter that will terminate the heredoc (e.g. `EOF`).
    delimiter: String,
}

impl<'src> Lexer<'src> {
    pub fn new(source: &'src str) -> Self {
        Self {
            source,
            remaining: source,
            pos: 0,
            heredoc_state: None,
        }
    }

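    /// Current byte offset into the source.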
    #[inline]
    pub fn position(&self) -> u32 {
        self.pos
    }

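    /// True once all input has been consumed.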
    #[inline]
    pub fn is_eof(&self) -> bool {
        self.remaining.is_empty()
    }

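    /// Peeks at the next character without consuming it.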
    #[inline]
    fn peek(&self) -> Option<char> {
        self.remaining.chars().next()
    }

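    /// Peeks at the character `n` positions ahead (`peek_nth(0)` == `peek()`).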
    #[inline]
    fn peek_nth(&self, n: usize) -> Option<char> {
        self.remaining.chars().nth(n)
    }

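    /// Consumes one character, advancing `pos` by its UTF-8 length.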
    #[inline]
    fn advance(&mut self) -> Option<char> {
        let c = self.peek()?;
        self.pos += c.len_utf8() as u32;
        self.remaining = &self.remaining[c.len_utf8()..];
        Some(c)
    }

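    /// Consumes `n` bytes at once. `n` must land on a UTF-8 character
    /// boundary, or the slice of `remaining` below will panic.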
    #[inline]
    fn advance_by(&mut self, n: usize) {
        self.pos += n as u32;
        self.remaining = &self.remaining[n..];
    }

    #[inline]
    fn starts_with(&self, prefix: &str) -> bool {
        self.remaining.starts_with(prefix)
    }

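    /// Builds a token of `kind` spanning from `start` to the current
    /// position, borrowing its text from the source.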
    fn token(&self, kind: TokenKind, start: u32) -> Token<'src> {
        let span = Span::new(start, self.pos);
        let text = &self.source[start as usize..self.pos as usize];
        trace!("Token {:?} at {:?}: {:?}", kind, span, text);
        Token::new(kind, span, text)
    }

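    /// Returns the next token, or an `Eof` token once input is exhausted.
    /// While a heredoc is open, input is routed to `lex_heredoc_content`.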
    pub fn next_token(&mut self) -> Token<'src> {
        // Inside a heredoc, everything is content until the delimiter line.
        if let Some(state) = self.heredoc_state.clone() {
            return self.lex_heredoc_content(&state.delimiter);
        }

        if self.is_eof() {
            return self.token(TokenKind::Eof, self.pos);
        }

        let start = self.pos;
        let c = self.peek().unwrap();

        match c {
            '{' => {
                self.advance();
                self.token(TokenKind::LBrace, start)
            }
            '}' => {
                self.advance();
                self.token(TokenKind::RBrace, start)
            }
            '(' => {
                self.advance();
                self.token(TokenKind::LParen, start)
            }
            ')' => {
                self.advance();
                self.token(TokenKind::RParen, start)
            }
            ',' => {
                self.advance();
                self.token(TokenKind::Comma, start)
            }
            '>' => {
                self.advance();
                self.token(TokenKind::Gt, start)
            }
            '@' => {
                self.advance();
                self.token(TokenKind::At, start)
            }

            '"' => self.lex_quoted_scalar(),

            '/' if self.starts_with("///") => self.lex_doc_comment(),
            '/' if self.starts_with("//") => self.lex_line_comment(),
            '/' => self.lex_bare_scalar(),

            // `<<` followed by an uppercase letter opens a heredoc.
            '<' if self.starts_with("<<")
                && matches!(self.peek_nth(2), Some(c) if c.is_ascii_uppercase()) =>
            {
                self.lex_heredoc_start()
            }
            // `<<` with an invalid delimiter start is an error.
            '<' if self.starts_with("<<") => {
                self.advance();
                self.advance();
                self.token(TokenKind::Error, start)
            }

            'r' if matches!(self.peek_nth(1), Some('#' | '"')) => self.lex_raw_string(),

            ' ' | '\t' => self.lex_whitespace(),

            '\n' => {
                self.advance();
                self.token(TokenKind::Newline, start)
            }
            '\r' if self.peek_nth(1) == Some('\n') => {
                self.advance();
                self.advance();
                self.token(TokenKind::Newline, start)
            }

            _ if is_bare_scalar_start(c) => self.lex_bare_scalar(),

            _ => {
                self.advance();
                self.token(TokenKind::Error, start)
            }
        }
    }

    fn lex_whitespace(&mut self) -> Token<'src> {
        let start = self.pos;
        while let Some(c) = self.peek() {
            if c == ' ' || c == '\t' {
                self.advance();
            } else {
                break;
            }
        }
        self.token(TokenKind::Whitespace, start)
    }

    fn lex_bare_scalar(&mut self) -> Token<'src> {
        let start = self.pos;
        while let Some(c) = self.peek() {
            if is_bare_scalar_char(c) {
                self.advance();
            } else {
                break;
            }
        }
        self.token(TokenKind::BareScalar, start)
    }

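    /// Lexes a double-quoted scalar, honoring backslash escapes. An
    /// unterminated string yields an `Error` token.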
    fn lex_quoted_scalar(&mut self) -> Token<'src> {
        let start = self.pos;

        // Opening quote.
        self.advance();

        loop {
            match self.peek() {
                None => {
                    // Unterminated string.
                    return self.token(TokenKind::Error, start);
                }
                Some('"') => {
                    self.advance();
                    break;
                }
                Some('\\') => {
                    self.advance();
                    // Skip the escaped character so `\"` does not close the string.
                    if self.peek().is_some() {
                        self.advance();
                    }
                }
                Some(_) => {
                    self.advance();
                }
            }
        }

        self.token(TokenKind::QuotedScalar, start)
    }

    fn lex_line_comment(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume the leading `//`.
        self.advance();
        self.advance();

        // The comment runs to (but not including) the line terminator.
        while let Some(c) = self.peek() {
            if c == '\n' || c == '\r' {
                break;
            }
            self.advance();
        }

        self.token(TokenKind::LineComment, start)
    }

    fn lex_doc_comment(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume the leading `///`.
        self.advance();
        self.advance();
        self.advance();

        while let Some(c) = self.peek() {
            if c == '\n' || c == '\r' {
                break;
            }
            self.advance();
        }

        self.token(TokenKind::DocComment, start)
    }

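    /// Lexes a heredoc opener: `<<DELIM`, optionally followed by
    /// `,modifier` and a line terminator (both consumed into the token).
    /// The delimiter must start with an ASCII uppercase letter, may
    /// contain `A-Z`, `0-9`, and `_`, and is at most 16 bytes long. On
    /// success the delimiter is recorded so following calls lex heredoc
    /// content.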
    fn lex_heredoc_start(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume the `<<`.
        self.advance();
        self.advance();

        let delim_start = self.pos as usize;

        match self.peek() {
            Some(c) if c.is_ascii_uppercase() => {
                self.advance();
            }
            _ => {
                // Invalid first character: consume whatever delimiter-like
                // characters follow so the error token covers them.
                while let Some(c) = self.peek() {
                    if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
                        self.advance();
                    } else {
                        break;
                    }
                }
                return self.token(TokenKind::Error, start);
            }
        }

        while let Some(c) = self.peek() {
            if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let delimiter = &self.source[delim_start..self.pos as usize];

        if delimiter.len() > 16 {
            return self.token(TokenKind::Error, start);
        }

        // Optional `,modifier` suffix: a lowercase identifier that may also
        // contain digits, `_`, `.`, and `-`.
        if self.peek() == Some(',') {
            self.advance();
            if let Some(c) = self.peek()
                && c.is_ascii_lowercase()
            {
                self.advance();
                while let Some(c) = self.peek() {
                    if c.is_ascii_lowercase()
                        || c.is_ascii_digit()
                        || c == '_'
                        || c == '.'
                        || c == '-'
                    {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
        }

        // Include the line terminator in the `HeredocStart` token.
        if self.peek() == Some('\r') {
            self.advance();
        }
        if self.peek() == Some('\n') {
            self.advance();
        }

        self.heredoc_state = Some(HeredocState {
            delimiter: delimiter.to_string(),
        });

        self.token(TokenKind::HeredocStart, start)
    }

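    /// Checks whether the unread input, assumed to sit at the start of a
    /// line, is the closing delimiter line (optionally indented). Returns
    /// the indentation length in bytes if it is.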
    fn find_heredoc_delimiter(&self, delimiter: &str) -> Option<usize> {
        let indent_len = self
            .remaining
            .chars()
            .take_while(|c| *c == ' ' || *c == '\t')
            .count();

        let after_indent = &self.remaining[indent_len..];
        if let Some(after_delim) = after_indent.strip_prefix(delimiter)
            && (after_delim.is_empty()
                || after_delim.starts_with('\n')
                || after_delim.starts_with("\r\n"))
        {
            return Some(indent_len);
        }
        None
    }

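    /// Lexes inside an open heredoc. Emits one `HeredocContent` token
    /// covering whole lines up to the closing delimiter line, then
    /// `HeredocEnd` on the next call. Hitting end of input first yields
    /// an `Error` token and closes the heredoc.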
    fn lex_heredoc_content(&mut self, delimiter: &str) -> Token<'src> {
        let start = self.pos;

        // Already sitting on the closing delimiter line: emit `HeredocEnd`.
        if let Some(indent_len) = self.find_heredoc_delimiter(delimiter) {
            self.advance_by(indent_len + delimiter.len());
            self.heredoc_state = None;
            return self.token(TokenKind::HeredocEnd, start);
        }

        let mut found_end = false;
        while !self.is_eof() {
            // Consume one full line, including its terminator.
            while let Some(c) = self.peek() {
                if c == '\n' {
                    self.advance();
                    break;
                } else if c == '\r' && self.peek_nth(1) == Some('\n') {
                    self.advance();
                    self.advance();
                    break;
                }
                self.advance();
            }

            // Stop before the closing delimiter line so it becomes its own token.
            if self.find_heredoc_delimiter(delimiter).is_some() {
                found_end = true;
                break;
            }
        }

        if self.is_eof() && !found_end {
            // Ran out of input without seeing the delimiter.
            self.heredoc_state = None;
            return self.token(TokenKind::Error, start);
        }

        self.token(TokenKind::HeredocContent, start)
    }

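    /// Lexes a Rust-style raw string: `r"..."`, `r#"..."#`, and so on.
    /// The closing quote must be followed by as many `#`s as the opening
    /// had. Unterminated raw strings yield an `Error` token; the hash
    /// count is clamped at 255.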
    fn lex_raw_string(&mut self) -> Token<'src> {
        let start = self.pos;

        // Consume the leading `r`.
        self.advance();

        // Count the opening `#`s.
        let mut hash_count: u8 = 0;
        while self.peek() == Some('#') {
            hash_count = hash_count.saturating_add(1);
            self.advance();
        }

        if self.peek() == Some('"') {
            self.advance();
        } else {
            // `r` followed by `#`s but no opening quote.
            return self.token(TokenKind::Error, start);
        }

        loop {
            match self.peek() {
                None => {
                    // Unterminated raw string.
                    return self.token(TokenKind::Error, start);
                }
                Some('"') => {
                    // A quote only closes the string when followed by
                    // exactly `hash_count` `#`s.
                    let mut matched_hashes = 0u8;
                    let mut lookahead = 1;
                    while matched_hashes < hash_count {
                        if self.peek_nth(lookahead) == Some('#') {
                            matched_hashes += 1;
                            lookahead += 1;
                        } else {
                            break;
                        }
                    }

                    if matched_hashes == hash_count {
                        // Consume the quote and the closing `#`s.
                        self.advance();
                        for _ in 0..hash_count {
                            self.advance();
                        }
                        return self.token(TokenKind::RawScalar, start);
                    } else {
                        self.advance();
                    }
                }
                Some(_) => {
                    self.advance();
                }
            }
        }
    }
}


/// Iterates tokens until `Eof`, which itself is not yielded.
impl<'src> Iterator for Lexer<'src> {
    type Item = Token<'src>;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();
        if token.kind == TokenKind::Eof {
            None
        } else {
            Some(token)
        }
    }
}

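/// True for characters that may begin a bare scalar. Reserved punctuation
/// is excluded; a lone `/` still starts a bare scalar through the explicit
/// match arm in `next_token`, which checks for comments first.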
fn is_bare_scalar_start(c: char) -> bool {
    !matches!(c, '{' | '}' | '(' | ')' | ',' | '"' | '=' | '@' | '>' | '/') && !c.is_whitespace()
}

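/// True for characters that may continue a bare scalar. The set is wider
/// than the start set (`=`, `@`, and `/` are allowed) so values like URLs
/// and paths lex as one token.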
fn is_bare_scalar_char(c: char) -> bool {
    !matches!(c, '{' | '}' | '(' | ')' | ',' | '"' | '>') && !c.is_whitespace()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source` to completion, pairing each token kind with its text.
    fn lex(source: &str) -> Vec<(TokenKind, &str)> {
        Lexer::new(source).map(|t| (t.kind, t.text)).collect()
    }

    #[test]
    fn test_structural_tokens() {
        assert_eq!(lex("{"), vec![(TokenKind::LBrace, "{")]);
        assert_eq!(lex("}"), vec![(TokenKind::RBrace, "}")]);
        assert_eq!(lex("("), vec![(TokenKind::LParen, "(")]);
        assert_eq!(lex(")"), vec![(TokenKind::RParen, ")")]);
        assert_eq!(lex(","), vec![(TokenKind::Comma, ",")]);
        assert_eq!(lex(">"), vec![(TokenKind::Gt, ">")]);
        assert_eq!(lex("@"), vec![(TokenKind::At, "@")]);
    }

    #[test]
    fn test_bare_scalar() {
        assert_eq!(lex("hello"), vec![(TokenKind::BareScalar, "hello")]);
        assert_eq!(lex("42"), vec![(TokenKind::BareScalar, "42")]);
        assert_eq!(lex("true"), vec![(TokenKind::BareScalar, "true")]);
        assert_eq!(
            lex("https://example.com/path"),
            vec![(TokenKind::BareScalar, "https://example.com/path")]
        );
    }

    #[test]
    fn test_quoted_scalar() {
        assert_eq!(
            lex(r#""hello world""#),
            vec![(TokenKind::QuotedScalar, r#""hello world""#)]
        );
        assert_eq!(
            lex(r#""with \"escapes\"""#),
            vec![(TokenKind::QuotedScalar, r#""with \"escapes\"""#)]
        );
    }

    #[test]
    fn test_raw_scalar() {
        assert_eq!(
            lex(r#"r"hello""#),
            vec![(TokenKind::RawScalar, r#"r"hello""#)]
        );
        assert_eq!(
            lex(r##"r#"hello"#"##),
            vec![(TokenKind::RawScalar, r##"r#"hello"#"##)]
        );
    }

    #[test]
    fn test_comments() {
        assert_eq!(
            lex("// comment"),
            vec![(TokenKind::LineComment, "// comment")]
        );
        assert_eq!(lex("/// doc"), vec![(TokenKind::DocComment, "/// doc")]);
    }

    #[test]
    fn test_whitespace() {
        assert_eq!(lex(" \t"), vec![(TokenKind::Whitespace, " \t")]);
        assert_eq!(lex("\n"), vec![(TokenKind::Newline, "\n")]);
        assert_eq!(lex("\r\n"), vec![(TokenKind::Newline, "\r\n")]);
    }

    #[test]
    fn test_mixed() {
        let tokens = lex("{host localhost}");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::LBrace, "{"),
                (TokenKind::BareScalar, "host"),
                (TokenKind::Whitespace, " "),
                (TokenKind::BareScalar, "localhost"),
                (TokenKind::RBrace, "}"),
            ]
        );
    }

    #[test]
    fn test_heredoc() {
        let tokens = lex("<<EOF\nhello\nworld\nEOF");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::HeredocStart, "<<EOF\n"),
                (TokenKind::HeredocContent, "hello\nworld\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }

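    // Added case, based on `find_heredoc_delimiter` accepting indentation:
    // an indented closing delimiter ends the heredoc, and the indentation
    // is folded into the `HeredocEnd` token rather than the content.
    #[test]
    fn test_heredoc_indented_delimiter() {
        let tokens = lex("<<EOF\n  hi\n  EOF");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::HeredocStart, "<<EOF\n"),
                (TokenKind::HeredocContent, "  hi\n"),
                (TokenKind::HeredocEnd, "  EOF"),
            ]
        );
    }
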
    #[test]
    fn test_heredoc_valid_delimiters() {
        assert!(lex("<<A\nx\nA").iter().all(|t| t.0 != TokenKind::Error));
        assert!(lex("<<EOF\nx\nEOF").iter().all(|t| t.0 != TokenKind::Error));
        assert!(
            lex("<<MY123\nx\nMY123")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        assert!(
            lex("<<MY_DELIM\nx\nMY_DELIM")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        // 16 characters: exactly at the limit.
        assert!(
            lex("<<ABCDEFGHIJKLMNOP\nx\nABCDEFGHIJKLMNOP")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
    }

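    // Sketch of modifier coverage; `trim` is a hypothetical modifier name.
    // The lexer accepts any lowercase suffix after `,` without interpreting
    // it, so the whole opener lands in the `HeredocStart` token.
    #[test]
    fn test_heredoc_with_modifier() {
        let tokens = lex("<<EOF,trim\nx\nEOF");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::HeredocStart, "<<EOF,trim\n"),
                (TokenKind::HeredocContent, "x\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }
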
    #[test]
    fn test_heredoc_must_start_uppercase() {
        // Digits and underscores may not begin a delimiter.
        assert!(lex("<<123FOO").iter().any(|t| t.0 == TokenKind::Error));
        assert!(lex("<<_FOO").iter().any(|t| t.0 == TokenKind::Error));
        // A lowercase letter never even begins a heredoc opener.
        let tokens = lex("<<foo");
        assert!(!tokens.iter().any(|t| t.0 == TokenKind::HeredocStart));
    }

    #[test]
    fn test_heredoc_max_16_chars() {
        // 17 characters: one past the limit.
        assert!(
            lex("<<ABCDEFGHIJKLMNOPQ\nx\nABCDEFGHIJKLMNOPQ")
                .iter()
                .any(|t| t.0 == TokenKind::Error)
        );
    }

    #[test]
    fn test_slash_in_bare_scalar() {
        let tokens = lex("/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/foo")]);

        let tokens = lex("/usr/bin/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/usr/bin/foo")]);

        let tokens = lex("// comment");
        assert_eq!(tokens, vec![(TokenKind::LineComment, "// comment")]);
    }

    #[test]
    fn test_unterminated_heredoc() {
        let tokens = lex("<<EOF\nhello world\n");
        eprintln!("tokens = {:?}", tokens);
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated heredoc"
        );
    }

    #[test]
    fn test_unterminated_string() {
        let tokens = lex("\"hello");
        eprintln!("tokens = {:?}", tokens);
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated string"
        );
    }
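
    // Added case, assuming mismatched closing hashes should also error:
    // `r#"hello"` opens with one `#`, but the closing quote has none, so
    // the raw string never terminates.
    #[test]
    fn test_unterminated_raw_string() {
        let tokens = lex(r##"r#"hello""##);
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated raw string"
        );
    }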
}