1use crate::{Span, Token, TokenKind};
4use tracing::trace;
5
/// Streaming tokenizer over a borrowed source string.
///
/// Tokens borrow their text directly from `source` (zero-copy); the only
/// owned state is the active heredoc delimiter, if any.
#[derive(Clone)]
pub struct Tokenizer<'src> {
    // Full input; token text is sliced out of this by byte offset.
    source: &'src str,
    // Unconsumed tail of `source`.
    remaining: &'src str,
    // Byte offset of `remaining` within `source` (u32 caps input at 4 GiB).
    pos: u32,

    // `Some` while positioned inside a heredoc body; cleared once the
    // closing delimiter token has been produced.
    heredoc_state: Option<HeredocState>,
}
19
/// Bookkeeping for a heredoc whose opener has been tokenized but whose
/// closing delimiter has not yet been emitted.
#[derive(Debug, Clone)]
struct HeredocState {
    // The uppercase delimiter word that terminates the heredoc body.
    delimiter: String,
    // Leading whitespace (byte count) of the closing delimiter's line,
    // recorded when the content token is produced; `None` until then.
    closing_indent: Option<usize>,
}
29
30impl<'src> Tokenizer<'src> {
31 pub fn new(source: &'src str) -> Self {
33 Self {
34 source,
35 remaining: source,
36 pos: 0,
37 heredoc_state: None,
38 }
39 }
40
41 #[inline]
43 pub fn position(&self) -> u32 {
44 self.pos
45 }
46
47 #[inline]
49 pub fn is_eof(&self) -> bool {
50 self.remaining.is_empty()
51 }
52
53 #[inline]
57 pub fn heredoc_closing_indent(&self) -> Option<usize> {
58 self.heredoc_state.as_ref().and_then(|s| s.closing_indent)
59 }
60
61 #[inline]
63 fn peek(&self) -> Option<char> {
64 self.remaining.chars().next()
65 }
66
67 #[inline]
69 fn peek_nth(&self, n: usize) -> Option<char> {
70 self.remaining.chars().nth(n)
71 }
72
73 #[inline]
75 fn advance(&mut self) -> Option<char> {
76 let c = self.peek()?;
77 self.pos += c.len_utf8() as u32;
78 self.remaining = &self.remaining[c.len_utf8()..];
79 Some(c)
80 }
81
82 #[inline]
84 fn advance_by(&mut self, n: usize) {
85 self.pos += n as u32;
86 self.remaining = &self.remaining[n..];
87 }
88
89 #[inline]
91 fn starts_with(&self, prefix: &str) -> bool {
92 self.remaining.starts_with(prefix)
93 }
94
95 fn token(&self, kind: TokenKind, start: u32) -> Token<'src> {
97 let span = Span::new(start, self.pos);
98 let text = &self.source[start as usize..self.pos as usize];
99 trace!("Token {:?} at {:?}: {:?}", kind, span, text);
100 Token::new(kind, span, text)
101 }
102
103 pub fn next_token(&mut self) -> Token<'src> {
105 if let Some(ref state) = self.heredoc_state.clone() {
107 return self.tokenize_heredoc_content(&state.delimiter);
108 }
109
110 if self.is_eof() {
112 return self.token(TokenKind::Eof, self.pos);
113 }
114
115 let start = self.pos;
116 let c = self.peek().unwrap();
117
118 match c {
119 '{' => {
121 self.advance();
122 self.token(TokenKind::LBrace, start)
123 }
124 '}' => {
125 self.advance();
126 self.token(TokenKind::RBrace, start)
127 }
128 '(' => {
129 self.advance();
130 self.token(TokenKind::LParen, start)
131 }
132 ')' => {
133 self.advance();
134 self.token(TokenKind::RParen, start)
135 }
136 ',' => {
137 self.advance();
138 self.token(TokenKind::Comma, start)
139 }
140 '>' => {
141 self.advance();
142 self.token(TokenKind::Gt, start)
143 }
144 '@' => self.tokenize_at_or_tag(),
145
146 '"' => self.tokenize_quoted_scalar(),
148
149 '/' if self.starts_with("///") => self.tokenize_doc_comment(),
151 '/' if self.starts_with("//") => self.tokenize_line_comment(),
152 '/' => self.tokenize_bare_scalar(),
154
155 '<' if self.starts_with("<<")
158 && matches!(self.peek_nth(2), Some(c) if c.is_ascii_uppercase()) =>
159 {
160 self.tokenize_heredoc_start()
161 }
162 '<' if self.starts_with("<<") => {
164 let start = self.pos;
165 self.advance(); self.advance(); self.token(TokenKind::Error, start)
168 }
169
170 'r' if matches!(self.peek_nth(1), Some('#' | '"')) => self.tokenize_raw_string(),
172
173 ' ' | '\t' => self.tokenize_whitespace(),
175
176 '\n' => {
178 self.advance();
179 self.token(TokenKind::Newline, start)
180 }
181 '\r' if self.peek_nth(1) == Some('\n') => {
182 self.advance();
183 self.advance();
184 self.token(TokenKind::Newline, start)
185 }
186
187 _ if is_bare_scalar_start(c) => self.tokenize_bare_scalar(),
189
190 _ => {
192 self.advance();
193 self.token(TokenKind::Error, start)
194 }
195 }
196 }
197
198 fn tokenize_whitespace(&mut self) -> Token<'src> {
200 let start = self.pos;
201 while let Some(c) = self.peek() {
202 if c == ' ' || c == '\t' {
203 self.advance();
204 } else {
205 break;
206 }
207 }
208 self.token(TokenKind::Whitespace, start)
209 }
210
211 fn tokenize_bare_scalar(&mut self) -> Token<'src> {
213 let start = self.pos;
214 while let Some(c) = self.peek() {
215 if is_bare_scalar_char(c) {
216 self.advance();
217 } else {
218 break;
219 }
220 }
221 self.token(TokenKind::BareScalar, start)
222 }
223
224 fn tokenize_at_or_tag(&mut self) -> Token<'src> {
226 let start = self.pos;
227 self.advance(); match self.peek() {
231 Some(c) if c.is_ascii_alphabetic() || c == '_' => {
232 self.advance();
235 while let Some(c) = self.peek() {
236 if c == 'r' && matches!(self.peek_nth(1), Some('#' | '"')) {
239 break;
241 }
242 if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
243 self.advance();
244 } else {
245 break;
246 }
247 }
248 self.token(TokenKind::Tag, start)
249 }
250 _ => {
251 self.token(TokenKind::At, start)
253 }
254 }
255 }
256
257 fn tokenize_quoted_scalar(&mut self) -> Token<'src> {
259 let start = self.pos;
260
261 self.advance();
263
264 loop {
265 match self.peek() {
266 None => {
267 return self.token(TokenKind::Error, start);
269 }
270 Some('"') => {
271 self.advance();
272 break;
273 }
274 Some('\\') => {
275 self.advance();
277 if self.peek().is_some() {
278 self.advance();
279 }
280 }
281 Some(_) => {
282 self.advance();
283 }
284 }
285 }
286
287 self.token(TokenKind::QuotedScalar, start)
288 }
289
290 fn tokenize_line_comment(&mut self) -> Token<'src> {
293 let start = self.pos;
294
295 self.advance();
297 self.advance();
298
299 while let Some(c) = self.peek() {
301 if c == '\n' || c == '\r' {
302 break;
303 }
304 self.advance();
305 }
306
307 self.token(TokenKind::LineComment, start)
308 }
309
310 fn tokenize_doc_comment(&mut self) -> Token<'src> {
312 let start = self.pos;
313
314 self.advance();
316 self.advance();
317 self.advance();
318
319 while let Some(c) = self.peek() {
321 if c == '\n' || c == '\r' {
322 break;
323 }
324 self.advance();
325 }
326
327 self.token(TokenKind::DocComment, start)
328 }
329
330 fn tokenize_heredoc_start(&mut self) -> Token<'src> {
336 let start = self.pos;
337
338 self.advance();
340 self.advance();
341
342 let delim_start = self.pos as usize;
343
344 match self.peek() {
346 Some(c) if c.is_ascii_uppercase() => {
347 self.advance();
348 }
349 _ => {
350 while let Some(c) = self.peek() {
353 if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
354 self.advance();
355 } else {
356 break;
357 }
358 }
359 return self.token(TokenKind::Error, start);
360 }
361 }
362
363 while let Some(c) = self.peek() {
365 if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
366 self.advance();
367 } else {
368 break;
369 }
370 }
371
372 let delimiter = &self.source[delim_start..self.pos as usize];
373
374 if delimiter.len() > 16 {
376 return self.token(TokenKind::Error, start);
377 }
378
379 if self.peek() == Some(',') {
383 self.advance(); if let Some(c) = self.peek()
386 && c.is_ascii_lowercase()
387 {
388 self.advance();
389 while let Some(c) = self.peek() {
391 if c.is_ascii_lowercase()
392 || c.is_ascii_digit()
393 || c == '_'
394 || c == '.'
395 || c == '-'
396 {
397 self.advance();
398 } else {
399 break;
400 }
401 }
402 }
403 }
404
405 if self.peek() == Some('\r') {
407 self.advance();
408 }
409 if self.peek() == Some('\n') {
410 self.advance();
411 }
412
413 self.heredoc_state = Some(HeredocState {
415 delimiter: delimiter.to_string(),
416 closing_indent: None,
417 });
418
419 self.token(TokenKind::HeredocStart, start)
420 }
421
422 fn find_heredoc_delimiter(&self, delimiter: &str) -> Option<usize> {
426 let indent_len = self
428 .remaining
429 .chars()
430 .take_while(|c| *c == ' ' || *c == '\t')
431 .count();
432
433 let after_indent = &self.remaining[indent_len..];
435 if let Some(after_delim) = after_indent.strip_prefix(delimiter)
436 && (after_delim.is_empty()
437 || after_delim.starts_with('\n')
438 || after_delim.starts_with("\r\n"))
439 {
440 return Some(indent_len);
441 }
442 None
443 }
444
445 fn tokenize_heredoc_content(&mut self, delimiter: &str) -> Token<'src> {
449 let start = self.pos;
450
451 if let Some(indent_len) = self.find_heredoc_delimiter(delimiter) {
453 self.advance_by(indent_len + delimiter.len());
455 self.heredoc_state = None;
456 return self.token(TokenKind::HeredocEnd, start);
457 }
458
459 let mut found_end = false;
461 let mut closing_indent = 0usize;
462 while !self.is_eof() {
463 while let Some(c) = self.peek() {
465 if c == '\n' {
466 self.advance();
467 break;
468 } else if c == '\r' && self.peek_nth(1) == Some('\n') {
469 self.advance();
470 self.advance();
471 break;
472 }
473 self.advance();
474 }
475
476 if let Some(indent_len) = self.find_heredoc_delimiter(delimiter) {
478 found_end = true;
479 closing_indent = indent_len;
480 break;
481 }
482
483 if self.is_eof() {
484 break;
485 }
486 }
487
488 if start == self.pos
489 && found_end
490 && let Some(indent_len) = self.find_heredoc_delimiter(delimiter)
491 {
492 self.advance_by(indent_len + delimiter.len());
494 self.heredoc_state = None;
495 return self.token(TokenKind::HeredocEnd, start);
496 }
497
498 if self.is_eof() && !found_end {
502 self.heredoc_state = None;
503 return self.token(TokenKind::Error, start);
504 }
505
506 if let Some(ref mut state) = self.heredoc_state {
508 state.closing_indent = Some(closing_indent);
509 }
510
511 self.token(TokenKind::HeredocContent, start)
512 }
513
514 fn tokenize_raw_string(&mut self) -> Token<'src> {
518 let start = self.pos;
519
520 self.advance();
522
523 let mut hash_count: u8 = 0;
525 while self.peek() == Some('#') {
526 hash_count = hash_count.saturating_add(1);
527 self.advance();
528 }
529
530 if self.peek() == Some('"') {
532 self.advance();
533 } else {
534 return self.token(TokenKind::Error, start);
536 }
537
538 loop {
540 match self.peek() {
541 None => {
542 return self.token(TokenKind::Error, start);
544 }
545 Some('"') => {
546 let mut matched_hashes = 0u8;
548 let mut lookahead = 1;
549 while matched_hashes < hash_count {
550 if self.peek_nth(lookahead) == Some('#') {
551 matched_hashes += 1;
552 lookahead += 1;
553 } else {
554 break;
555 }
556 }
557
558 if matched_hashes == hash_count {
559 self.advance(); for _ in 0..hash_count {
562 self.advance(); }
564 return self.token(TokenKind::RawScalar, start);
566 } else {
567 self.advance();
569 }
570 }
571 Some(_) => {
572 self.advance();
573 }
574 }
575 }
576 }
577}
578
579impl<'src> Iterator for Tokenizer<'src> {
580 type Item = Token<'src>;
581
582 fn next(&mut self) -> Option<Self::Item> {
583 let token = self.next_token();
584 if token.kind == TokenKind::Eof {
585 None
586 } else {
587 Some(token)
588 }
589 }
590}
591
/// Whether `c` may begin a bare (unquoted) scalar.
///
/// Stricter than `is_bare_scalar_char`: `/`, `=`, and `@` are rejected at
/// the start (`/` would otherwise shadow comment dispatch; `@` opens tags)
/// but are permitted mid-scalar.
fn is_bare_scalar_start(c: char) -> bool {
    match c {
        '{' | '}' | '(' | ')' | ',' | '"' | '=' | '@' | '>' | '/' => false,
        _ => !c.is_whitespace(),
    }
}
599
/// Whether `c` may continue a bare scalar.
///
/// More permissive than `is_bare_scalar_start`: `/`, `=`, and `@` are
/// accepted mid-scalar so URLs and filesystem paths stay one token.
fn is_bare_scalar_char(c: char) -> bool {
    match c {
        '{' | '}' | '(' | ')' | ',' | '"' | '>' => false,
        _ => !c.is_whitespace(),
    }
}
608
#[cfg(test)]
mod tests {
    use super::*;
    use facet_testhelpers::test;

    /// Drives the tokenizer to completion, pairing each token's kind with
    /// its source text (EOF is not yielded by the iterator).
    fn tokenize(source: &str) -> Vec<(TokenKind, &str)> {
        Tokenizer::new(source).map(|t| (t.kind, t.text)).collect()
    }

    #[test]
    fn test_structural_tokens() {
        assert_eq!(tokenize("{"), vec![(TokenKind::LBrace, "{")]);
        assert_eq!(tokenize("}"), vec![(TokenKind::RBrace, "}")]);
        assert_eq!(tokenize("("), vec![(TokenKind::LParen, "(")]);
        assert_eq!(tokenize(")"), vec![(TokenKind::RParen, ")")]);
        assert_eq!(tokenize(","), vec![(TokenKind::Comma, ",")]);
        assert_eq!(tokenize(">"), vec![(TokenKind::Gt, ">")]);
        // A lone `@` (no identifier following) is an At token, not a Tag.
        assert_eq!(tokenize("@"), vec![(TokenKind::At, "@")]);
    }

    #[test]
    fn test_bare_scalar() {
        assert_eq!(tokenize("hello"), vec![(TokenKind::BareScalar, "hello")]);
        // Numbers and booleans are plain bare scalars at this layer.
        assert_eq!(tokenize("42"), vec![(TokenKind::BareScalar, "42")]);
        assert_eq!(tokenize("true"), vec![(TokenKind::BareScalar, "true")]);
        // `/` mid-scalar keeps URLs as a single token.
        assert_eq!(
            tokenize("https://example.com/path"),
            vec![(TokenKind::BareScalar, "https://example.com/path")]
        );
    }

    #[test]
    fn test_quoted_scalar() {
        // Token text includes the surrounding quotes.
        assert_eq!(
            tokenize(r#""hello world""#),
            vec![(TokenKind::QuotedScalar, r#""hello world""#)]
        );
        // Escaped quotes do not terminate the scalar.
        assert_eq!(
            tokenize(r#""with \"escapes\"""#),
            vec![(TokenKind::QuotedScalar, r#""with \"escapes\"""#)]
        );
    }

    #[test]
    fn test_raw_scalar() {
        assert_eq!(
            tokenize(r#"r"hello""#),
            vec![(TokenKind::RawScalar, r#"r"hello""#)]
        );
        // Hash-delimited raw string: closer must repeat the opener's hashes.
        assert_eq!(
            tokenize(r##"r#"hello"#"##),
            vec![(TokenKind::RawScalar, r##"r#"hello"#"##)]
        );
    }

    #[test]
    fn test_comments() {
        assert_eq!(
            tokenize("// comment"),
            vec![(TokenKind::LineComment, "// comment")]
        );
        assert_eq!(
            tokenize("/// doc"),
            vec![(TokenKind::DocComment, "/// doc")]
        );
    }

    #[test]
    fn test_whitespace() {
        // Spaces and tabs coalesce into one Whitespace token.
        assert_eq!(tokenize(" \t"), vec![(TokenKind::Whitespace, " \t")]);
        assert_eq!(tokenize("\n"), vec![(TokenKind::Newline, "\n")]);
        // CRLF is a single Newline token.
        assert_eq!(tokenize("\r\n"), vec![(TokenKind::Newline, "\r\n")]);
    }

    #[test]
    fn test_mixed() {
        let tokens = tokenize("{host localhost}");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::LBrace, "{"),
                (TokenKind::BareScalar, "host"),
                (TokenKind::Whitespace, " "),
                (TokenKind::BareScalar, "localhost"),
                (TokenKind::RBrace, "}"),
            ]
        );
    }

    #[test]
    fn test_heredoc() {
        let tokens = tokenize("<<EOF\nhello\nworld\nEOF");
        assert_eq!(
            tokens,
            vec![
                // The opener swallows its trailing newline.
                (TokenKind::HeredocStart, "<<EOF\n"),
                // Content stops before the delimiter's line.
                (TokenKind::HeredocContent, "hello\nworld\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }

    #[test]
    fn test_heredoc_valid_delimiters() {
        // Single uppercase letter.
        assert!(
            tokenize("<<A\nx\nA")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        assert!(
            tokenize("<<EOF\nx\nEOF")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        // Digits allowed after the first character.
        assert!(
            tokenize("<<MY123\nx\nMY123")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        // Underscores allowed after the first character.
        assert!(
            tokenize("<<MY_DELIM\nx\nMY_DELIM")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        // Exactly 16 characters: the maximum allowed length.
        assert!(
            tokenize("<<ABCDEFGHIJKLMNOP\nx\nABCDEFGHIJKLMNOP")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
    }

    #[test]
    fn test_heredoc_must_start_uppercase() {
        // Digit or underscore first: `<<` becomes an Error token.
        assert!(tokenize("<<123FOO").iter().any(|t| t.0 == TokenKind::Error));
        assert!(tokenize("<<_FOO").iter().any(|t| t.0 == TokenKind::Error));
        // Lowercase first: never recognized as a heredoc opener.
        let tokens = tokenize("<<foo");
        assert!(!tokens.iter().any(|t| t.0 == TokenKind::HeredocStart));
    }

    #[test]
    fn test_heredoc_max_16_chars() {
        // 17 characters exceeds the limit and is rejected.
        assert!(
            tokenize("<<ABCDEFGHIJKLMNOPQ\nx\nABCDEFGHIJKLMNOPQ")
                .iter()
                .any(|t| t.0 == TokenKind::Error)
        );
    }

    #[test]
    fn test_slash_in_bare_scalar() {
        // A single `/` starts a bare scalar (path), not a comment.
        let tokens = tokenize("/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/foo")]);

        let tokens = tokenize("/usr/bin/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/usr/bin/foo")]);

        // But `//` is still a line comment.
        let tokens = tokenize("// comment");
        assert_eq!(tokens, vec![(TokenKind::LineComment, "// comment")]);
    }

    #[test]
    fn test_attribute_syntax_tokens() {
        // `>` terminates a bare scalar and is its own Gt token.
        let tokens = tokenize("server host>localhost");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::BareScalar, "server"),
                (TokenKind::Whitespace, " "),
                (TokenKind::BareScalar, "host"),
                (TokenKind::Gt, ">"),
                (TokenKind::BareScalar, "localhost"),
            ]
        );
    }

    #[test]
    fn test_unterminated_heredoc() {
        let tokens = tokenize("<<EOF\nhello world\n");
        eprintln!("tokens = {:?}", tokens);
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated heredoc"
        );
    }

    #[test]
    fn test_unterminated_string() {
        let tokens = tokenize("\"hello");
        eprintln!("tokens = {:?}", tokens);
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated string"
        );
    }
}