1#![allow(dead_code)]
17
18use std::collections::VecDeque;
19
20use super::model::{YamlDiagnostic, diagnostic_codes};
21
/// A position in the scanner's input.
///
/// The derived `Default` is the origin (all fields zero), which is where a
/// freshly constructed `Scanner` places its cursor.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub(crate) struct Mark {
    /// Byte offset into the input string.
    pub index: usize,
    /// Zero-based line number; bumped by `Scanner::advance` at each line break.
    pub line: usize,
    /// Zero-based column, counted in characters (not bytes) since the last
    /// line break.
    pub column: usize,
}
30
/// A candidate implicit ("simple") mapping key that the scanner remembers
/// until a `:` confirms it or it gets discarded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SimpleKey {
    /// Absolute number of the token the candidate starts at: tokens already
    /// handed out plus tokens queued at the moment the candidate was saved.
    pub token_number: usize,
    /// When true, discarding the candidate without finding a `:` records a
    /// "required simple key not found" diagnostic.
    pub required: bool,
    /// Cursor position where the candidate begins.
    pub mark: Mark,
}
43
/// The presentation style a scalar token was written in.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ScalarStyle {
    /// Unquoted scalar.
    Plain,
    /// Scalar delimited by `'`.
    SingleQuoted,
    /// Scalar delimited by `"`.
    DoubleQuoted,
    /// Block scalar introduced by `|`.
    Literal,
    /// Block scalar introduced by `>`.
    Folded,
}
54
/// Kinds of non-semantic input that the scanner still emits as tokens.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TriviaKind {
    /// A run of spaces and/or tabs.
    Whitespace,
    /// One line break: `\n`, `\r\n`, or a lone `\r`.
    Newline,
    /// A `#` comment, up to but not including the line break.
    Comment,
}
63
/// Every kind of token the scanner can produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TokenKind {
    /// Zero-width token queued by `Scanner::new`.
    StreamStart,
    /// Zero-width token emitted once the input is exhausted.
    StreamEnd,
    /// A `---` marker.
    DocumentStart,
    /// A `...` marker.
    DocumentEnd,
    /// A `%` directive line (without its line break).
    Directive,
    /// Zero-width token emitted when a `-` entry opens a deeper indentation level.
    BlockSequenceStart,
    /// Zero-width token emitted when a key or value opens a deeper indentation level.
    BlockMappingStart,
    /// Zero-width token emitted for each indentation level that is unwound.
    BlockEnd,
    /// `[`
    FlowSequenceStart,
    /// `]`
    FlowSequenceEnd,
    /// `{`
    FlowMappingStart,
    /// `}`
    FlowMappingEnd,
    /// `-` introducing a block sequence entry.
    BlockEntry,
    /// `,` inside a flow collection.
    FlowEntry,
    /// Either an explicit `?` or a zero-width token inserted in front of a
    /// confirmed simple key.
    Key,
    /// `:`
    Value,
    /// `*name`
    Alias,
    /// `&name`
    Anchor,
    /// A `!...` tag.
    Tag,
    /// A scalar, tagged with the style it was written in.
    Scalar(ScalarStyle),
    /// Whitespace, line breaks and comments.
    Trivia(TriviaKind),
}
88
/// A token together with the input span it covers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct Token {
    /// What the token represents.
    pub kind: TokenKind,
    /// Position of the token's first byte.
    pub start: Mark,
    /// Position just past the token's last byte; equal to `start` for
    /// zero-width (synthetic) tokens.
    pub end: Mark,
}
95
/// Tokenizer state for one input string.
#[derive(Debug)]
pub(crate) struct Scanner<'a> {
    /// The text being scanned.
    input: &'a str,
    /// Current read position.
    cursor: Mark,
    /// Tokens produced but not yet returned by `next_token`.
    tokens: VecDeque<Token>,
    /// Number of tokens already returned by `next_token`.
    tokens_taken: usize,
    /// Column of the innermost open block collection; `-1` when none is open.
    indent: i32,
    /// Enclosing indentation levels, pushed by `add_indent` and popped by
    /// `unwind_indent`.
    indent_stack: Vec<i32>,
    /// Pending simple-key candidates, one slot per flow nesting level
    /// (indexed by `flow_level`).
    simple_keys: Vec<Option<SimpleKey>>,
    /// Number of currently open flow collections (`[` / `{`).
    flow_level: usize,
    /// Whether a simple key may start at the current position.
    allow_simple_key: bool,
    /// Diagnostics recorded so far, in the order they were found.
    diagnostics: Vec<YamlDiagnostic>,
    /// Set once `StreamEnd` has been queued; no more tokens are fetched after that.
    stream_end_emitted: bool,
}
125
126impl<'a> Scanner<'a> {
127 pub(crate) fn new(input: &'a str) -> Self {
128 let mut scanner = Self {
129 input,
130 cursor: Mark::default(),
131 tokens: VecDeque::new(),
132 tokens_taken: 0,
133 indent: -1,
134 indent_stack: Vec::new(),
135 simple_keys: vec![None],
138 flow_level: 0,
139 allow_simple_key: true,
140 diagnostics: Vec::new(),
141 stream_end_emitted: false,
142 };
143 let mark = scanner.cursor;
144 scanner.tokens.push_back(Token {
145 kind: TokenKind::StreamStart,
146 start: mark,
147 end: mark,
148 });
149 scanner
150 }
151
152 pub(crate) fn next_token(&mut self) -> Option<Token> {
153 while self.need_more_tokens() {
154 self.fetch_more_tokens();
155 }
156 let tok = self.tokens.pop_front();
157 if tok.is_some() {
158 self.tokens_taken += 1;
159 }
160 tok
161 }
162
163 fn need_more_tokens(&mut self) -> bool {
171 if self.stream_end_emitted {
172 return false;
173 }
174 if self.tokens.is_empty() {
175 return true;
176 }
177 self.stale_simple_keys();
178 matches!(
179 self.next_possible_simple_key_index(),
180 Some(min) if min == self.tokens_taken
181 )
182 }
183
184 fn next_possible_simple_key_index(&self) -> Option<usize> {
185 self.simple_keys
186 .iter()
187 .filter_map(|slot| slot.as_ref().map(|k| k.token_number))
188 .min()
189 }
190
191 fn fetch_more_tokens(&mut self) {
194 self.scan_trivia();
195 self.stale_simple_keys();
196 self.unwind_indent(self.cursor.column as i32);
197 if self.at_eof() {
198 self.fetch_stream_end();
199 return;
200 }
201 if self.flow_level == 0 && self.cursor.column == 0 {
204 if self.check_document_indicator(b"---") {
205 self.fetch_document_marker(TokenKind::DocumentStart);
206 return;
207 }
208 if self.check_document_indicator(b"...") {
209 self.fetch_document_marker(TokenKind::DocumentEnd);
210 return;
211 }
212 if self.peek_char() == Some('%') {
213 self.fetch_directive();
214 return;
215 }
216 }
217 match self.peek_char() {
218 Some('[') => {
219 self.fetch_flow_collection_start(TokenKind::FlowSequenceStart);
220 return;
221 }
222 Some('{') => {
223 self.fetch_flow_collection_start(TokenKind::FlowMappingStart);
224 return;
225 }
226 Some(']') => {
227 self.fetch_flow_collection_end(TokenKind::FlowSequenceEnd);
228 return;
229 }
230 Some('}') => {
231 self.fetch_flow_collection_end(TokenKind::FlowMappingEnd);
232 return;
233 }
234 Some(',') if self.flow_level > 0 => {
235 self.fetch_flow_entry();
236 return;
237 }
238 Some('-') if self.check_block_entry() => {
239 self.fetch_block_entry();
240 return;
241 }
242 Some('?') if self.check_key() => {
243 self.fetch_key();
244 return;
245 }
246 Some(':') if self.check_value() => {
247 self.fetch_value();
248 return;
249 }
250 Some('\'') => {
251 self.fetch_flow_scalar(ScalarStyle::SingleQuoted);
252 return;
253 }
254 Some('"') => {
255 self.fetch_flow_scalar(ScalarStyle::DoubleQuoted);
256 return;
257 }
258 Some('|') if self.flow_level == 0 => {
259 self.fetch_block_scalar(ScalarStyle::Literal);
260 return;
261 }
262 Some('>') if self.flow_level == 0 => {
263 self.fetch_block_scalar(ScalarStyle::Folded);
264 return;
265 }
266 Some('&') => {
267 self.fetch_anchor();
268 return;
269 }
270 Some('*') => {
271 self.fetch_alias();
272 return;
273 }
274 Some('!') => {
275 self.fetch_tag();
276 return;
277 }
278 _ => {}
279 }
280 self.fetch_plain_scalar();
282 }
283
284 fn fetch_flow_collection_start(&mut self, kind: TokenKind) {
285 self.save_simple_key();
291 let start = self.cursor;
292 self.advance();
293 let end = self.cursor;
294 self.flow_level += 1;
295 self.allow_simple_key = true;
298 self.simple_keys.push(None);
299 self.tokens.push_back(Token { kind, start, end });
300 }
301
302 fn fetch_flow_collection_end(&mut self, kind: TokenKind) {
303 let start = self.cursor;
304 self.advance();
305 let end = self.cursor;
306 if self.flow_level > 0 {
307 self.flow_level -= 1;
308 self.simple_keys.pop();
309 }
310 self.tokens.push_back(Token { kind, start, end });
311 }
312
313 fn fetch_flow_entry(&mut self) {
314 self.allow_simple_key = true;
317 self.remove_simple_key();
318 let start = self.cursor;
319 self.advance();
320 let end = self.cursor;
321 self.tokens.push_back(Token {
322 kind: TokenKind::FlowEntry,
323 start,
324 end,
325 });
326 }
327
328 fn fetch_block_entry(&mut self) {
329 if self.flow_level == 0 {
330 if !self.allow_simple_key {
331 self.push_diagnostic(
332 diagnostic_codes::LEX_BLOCK_ENTRY_NOT_ALLOWED,
333 "block sequence entry not allowed here",
334 );
335 }
336 if self.add_indent(self.cursor.column as i32) {
337 let mark = self.cursor;
338 self.tokens.push_back(Token {
339 kind: TokenKind::BlockSequenceStart,
340 start: mark,
341 end: mark,
342 });
343 }
344 }
345 self.allow_simple_key = true;
346 self.remove_simple_key();
347 let start = self.cursor;
348 self.advance();
349 let end = self.cursor;
350 self.tokens.push_back(Token {
351 kind: TokenKind::BlockEntry,
352 start,
353 end,
354 });
355 }
356
357 fn fetch_key(&mut self) {
358 if self.flow_level == 0 {
359 if !self.allow_simple_key {
360 self.push_diagnostic(
361 diagnostic_codes::LEX_KEY_INDICATOR_NOT_ALLOWED,
362 "explicit key indicator not allowed here",
363 );
364 }
365 if self.add_indent(self.cursor.column as i32) {
366 let mark = self.cursor;
367 self.tokens.push_back(Token {
368 kind: TokenKind::BlockMappingStart,
369 start: mark,
370 end: mark,
371 });
372 }
373 }
374 self.allow_simple_key = self.flow_level == 0;
377 self.remove_simple_key();
378 let start = self.cursor;
379 self.advance();
380 let end = self.cursor;
381 self.tokens.push_back(Token {
382 kind: TokenKind::Key,
383 start,
384 end,
385 });
386 }
387
388 fn fetch_value(&mut self) {
389 if let Some(key) = self.simple_keys[self.flow_level].take() {
390 let queue_pos = key.token_number.saturating_sub(self.tokens_taken);
395 self.tokens.insert(
396 queue_pos,
397 Token {
398 kind: TokenKind::Key,
399 start: key.mark,
400 end: key.mark,
401 },
402 );
403 if self.flow_level == 0 && self.add_indent(key.mark.column as i32) {
404 self.tokens.insert(
405 queue_pos,
406 Token {
407 kind: TokenKind::BlockMappingStart,
408 start: key.mark,
409 end: key.mark,
410 },
411 );
412 }
413 self.allow_simple_key = false;
414 } else {
415 if self.flow_level == 0 {
419 if !self.allow_simple_key {
420 self.push_diagnostic(
421 diagnostic_codes::LEX_VALUE_INDICATOR_NOT_ALLOWED,
422 "value indicator not allowed here",
423 );
424 }
425 if self.add_indent(self.cursor.column as i32) {
426 let mark = self.cursor;
427 self.tokens.push_back(Token {
428 kind: TokenKind::BlockMappingStart,
429 start: mark,
430 end: mark,
431 });
432 }
433 }
434 self.allow_simple_key = self.flow_level == 0;
435 self.remove_simple_key();
436 }
437 let start = self.cursor;
438 self.advance();
439 let end = self.cursor;
440 self.tokens.push_back(Token {
441 kind: TokenKind::Value,
442 start,
443 end,
444 });
445 }
446
447 fn fetch_plain_scalar(&mut self) {
462 self.save_simple_key();
463 self.allow_simple_key = false;
464 let start = self.cursor;
465 let min_indent = self.indent + 1;
466 let placeholder = matches!(self.input[start.index..].chars().next(), Some('!'));
476 loop {
477 let chunk_start = self.cursor.index;
478 self.consume_plain_chunk();
479 if self.cursor.index == chunk_start {
480 break;
481 }
482 let saved = self.cursor;
486 while matches!(self.peek_char(), Some(' ' | '\t')) {
487 self.advance();
488 }
489 match self.peek_char() {
490 None | Some('#') => {
491 self.cursor = saved;
492 break;
493 }
494 Some('\n' | '\r') => {
495 if !self.try_consume_plain_line_break(min_indent, placeholder) {
496 self.cursor = saved;
497 break;
498 }
499 }
500 Some(_) => {
501 }
504 }
505 }
506 let end = self.cursor;
507 if start.index == end.index {
508 self.advance();
513 let end = self.cursor;
514 self.tokens.push_back(Token {
515 kind: TokenKind::Scalar(ScalarStyle::Plain),
516 start,
517 end,
518 });
519 return;
520 }
521 self.tokens.push_back(Token {
522 kind: TokenKind::Scalar(ScalarStyle::Plain),
523 start,
524 end,
525 });
526 }
527
528 fn consume_plain_chunk(&mut self) {
532 loop {
533 match self.peek_char() {
534 None | Some('\n' | '\r' | ' ' | '\t') => break,
535 Some(':') => {
536 let next = self.peek_at(1);
537 if matches!(next, None | Some(' ' | '\t' | '\n' | '\r')) {
538 break;
539 }
540 if self.flow_level > 0 && matches!(next, Some(',' | ']' | '}')) {
541 break;
542 }
543 self.advance();
544 }
545 Some(',' | '[' | ']' | '{' | '}') if self.flow_level > 0 => break,
546 _ => {
547 self.advance();
548 }
549 }
550 }
551 }
552
553 fn try_consume_plain_line_break(&mut self, min_indent: i32, placeholder: bool) -> bool {
560 let saved = self.cursor;
561 self.consume_one_line_break();
562 loop {
563 while matches!(self.peek_char(), Some(' ' | '\t')) {
564 self.advance();
565 }
566 match self.peek_char() {
567 None => {
568 self.cursor = saved;
569 return false;
570 }
571 Some('\n' | '\r') => {
572 self.consume_one_line_break();
573 continue;
574 }
575 Some('#') => {
576 self.cursor = saved;
577 return false;
578 }
579 Some(_) => {
580 let col = self.cursor.column as i32;
581 if col < min_indent {
582 self.cursor = saved;
583 return false;
584 }
585 if self.flow_level == 0 {
586 if col == 0
588 && (self.check_document_indicator(b"---")
589 || self.check_document_indicator(b"..."))
590 {
591 self.cursor = saved;
592 return false;
593 }
594 let aborts = if placeholder {
604 matches!(self.peek_char(), Some('-' | '?' | ':'))
605 } else {
606 self.peek_char() == Some(':')
607 };
608 if aborts
609 && matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
610 {
611 self.cursor = saved;
612 return false;
613 }
614 } else if matches!(self.peek_char(), Some(',' | ']' | '}')) {
615 self.cursor = saved;
620 return false;
621 }
622 return true;
623 }
624 }
625 }
626 }
627
628 fn fetch_flow_scalar(&mut self, style: ScalarStyle) {
634 self.save_simple_key();
635 self.allow_simple_key = false;
636 let start = self.cursor;
637 let quote = match style {
638 ScalarStyle::SingleQuoted => '\'',
639 ScalarStyle::DoubleQuoted => '"',
640 _ => unreachable!("fetch_flow_scalar called with non-quoted style"),
641 };
642 self.advance();
644 let mut closed = false;
645 while let Some(c) = self.peek_char() {
646 if c == quote {
647 if style == ScalarStyle::SingleQuoted && self.peek_at(1) == Some('\'') {
648 self.advance();
651 self.advance();
652 continue;
653 }
654 self.advance();
655 closed = true;
656 break;
657 }
658 if style == ScalarStyle::DoubleQuoted && c == '\\' {
659 self.advance();
660 self.consume_double_quoted_escape();
661 continue;
662 }
663 if self.flow_level == 0
668 && self.cursor.column == 0
669 && (self.check_document_indicator(b"---") || self.check_document_indicator(b"..."))
670 {
671 break;
672 }
673 self.advance();
674 }
675 if !closed {
676 self.diagnostics.push(YamlDiagnostic {
677 code: diagnostic_codes::LEX_UNTERMINATED_QUOTED_SCALAR,
678 message: "unterminated quoted scalar",
679 byte_start: start.index,
680 byte_end: self.cursor.index,
681 });
682 }
683 let end = self.cursor;
684 self.tokens.push_back(Token {
685 kind: TokenKind::Scalar(style),
686 start,
687 end,
688 });
689 }
690
691 fn consume_double_quoted_escape(&mut self) {
698 let backslash_index = self.cursor.index.saturating_sub(1);
701 match self.peek_char() {
702 None => {
703 }
706 Some('\n') => {
707 self.advance();
708 }
709 Some('\r') => {
710 self.advance();
711 if self.peek_char() == Some('\n') {
712 self.advance();
713 }
714 }
715 Some('x') => {
716 self.advance();
717 self.consume_hex_digits(2, backslash_index);
718 }
719 Some('u') => {
720 self.advance();
721 self.consume_hex_digits(4, backslash_index);
722 }
723 Some('U') => {
724 self.advance();
725 self.consume_hex_digits(8, backslash_index);
726 }
727 Some(c) if Self::is_double_quoted_single_byte_escape(c) => {
728 self.advance();
729 }
730 Some(_) => {
731 let invalid_end = self.cursor.index + self.peek_char().unwrap().len_utf8();
732 self.diagnostics.push(YamlDiagnostic {
733 code: diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE,
734 message: "invalid double-quoted escape",
735 byte_start: backslash_index,
736 byte_end: invalid_end,
737 });
738 self.advance();
739 }
740 }
741 }
742
743 fn consume_hex_digits(&mut self, count: usize, backslash_index: usize) {
744 let mut consumed = 0;
745 while consumed < count {
746 match self.peek_char() {
747 Some(c) if c.is_ascii_hexdigit() => {
748 self.advance();
749 consumed += 1;
750 }
751 _ => break,
752 }
753 }
754 if consumed < count {
755 self.diagnostics.push(YamlDiagnostic {
756 code: diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE,
757 message: "incomplete hex escape in double-quoted scalar",
758 byte_start: backslash_index,
759 byte_end: self.cursor.index,
760 });
761 }
762 }
763
764 fn is_double_quoted_single_byte_escape(c: char) -> bool {
765 matches!(
767 c,
768 '0' | 'a'
769 | 'b'
770 | 't'
771 | '\t'
772 | 'n'
773 | 'v'
774 | 'f'
775 | 'r'
776 | 'e'
777 | ' '
778 | '"'
779 | '/'
780 | '\\'
781 | 'N'
782 | '_'
783 | 'L'
784 | 'P'
785 )
786 }
787
788 fn fetch_block_scalar(&mut self, style: ScalarStyle) {
799 self.allow_simple_key = true;
804 self.remove_simple_key();
805 let start = self.cursor;
806 let parent_indent = self.indent;
807 self.advance();
809 let mut explicit_increment: Option<u32> = None;
811 for _ in 0..2 {
812 match self.peek_char() {
813 Some('+' | '-') => {
814 self.advance();
815 }
816 Some(d @ '1'..='9') if explicit_increment.is_none() => {
817 explicit_increment = Some(d.to_digit(10).expect("hex digit"));
818 self.advance();
819 }
820 _ => break,
821 }
822 }
823 while matches!(self.peek_char(), Some(' ' | '\t')) {
825 self.advance();
826 }
827 if self.peek_char() == Some('#') {
829 while !matches!(self.peek_char(), None | Some('\n' | '\r')) {
830 self.advance();
831 }
832 }
833 match self.peek_char() {
837 Some('\n') => {
838 self.advance();
839 }
840 Some('\r') => {
841 self.advance();
842 if self.peek_char() == Some('\n') {
843 self.advance();
844 }
845 }
846 None => {
847 let end = self.cursor;
849 self.tokens.push_back(Token {
850 kind: TokenKind::Scalar(style),
851 start,
852 end,
853 });
854 return;
855 }
856 Some(_) => {
857 while !matches!(self.peek_char(), None | Some('\n' | '\r')) {
859 self.advance();
860 }
861 match self.peek_char() {
862 Some('\n') => {
863 self.advance();
864 }
865 Some('\r') => {
866 self.advance();
867 if self.peek_char() == Some('\n') {
868 self.advance();
869 }
870 }
871 _ => {}
872 }
873 }
874 }
875 let base = parent_indent.max(0);
881 let auto_floor = (parent_indent + 1).max(0);
882 let min_indent = match explicit_increment {
883 Some(m) => base + m as i32,
884 None => self
885 .auto_detect_block_scalar_indent()
886 .unwrap_or(auto_floor)
887 .max(auto_floor),
888 };
889 loop {
892 let line_start = self.cursor.index;
893 let bytes = self.input.as_bytes();
894 let mut probe = line_start;
895 while bytes.get(probe) == Some(&b' ') {
896 probe += 1;
897 }
898 let leading_spaces = probe - line_start;
899 match bytes.get(probe) {
900 None => break,
901 Some(b'\n' | b'\r') => {
902 while self.cursor.index < probe {
905 self.advance();
906 }
907 self.consume_one_line_break();
908 continue;
909 }
910 _ => {}
911 }
912 if (leading_spaces as i32) < min_indent {
913 break;
915 }
916 if leading_spaces == 0
917 && (bytes.get(probe..probe + 3) == Some(b"---")
918 || bytes.get(probe..probe + 3) == Some(b"..."))
919 && matches!(
920 bytes.get(probe + 3),
921 None | Some(b' ' | b'\t' | b'\n' | b'\r')
922 )
923 {
924 break;
926 }
927 while !matches!(self.peek_char(), None | Some('\n' | '\r')) {
929 self.advance();
930 }
931 self.consume_one_line_break();
932 if self.at_eof() {
933 break;
934 }
935 }
936 let end = self.cursor;
937 self.tokens.push_back(Token {
938 kind: TokenKind::Scalar(style),
939 start,
940 end,
941 });
942 }
943
944 fn auto_detect_block_scalar_indent(&self) -> Option<i32> {
948 let bytes = self.input.as_bytes();
949 let mut i = self.cursor.index;
950 while i < bytes.len() {
951 let line_start = i;
952 while bytes.get(i) == Some(&b' ') {
953 i += 1;
954 }
955 match bytes.get(i) {
956 None => return None,
957 Some(b'\n') => {
958 i += 1;
959 continue;
960 }
961 Some(b'\r') => {
962 i += 1;
963 if bytes.get(i) == Some(&b'\n') {
964 i += 1;
965 }
966 continue;
967 }
968 _ => {
969 return Some((i - line_start) as i32);
970 }
971 }
972 }
973 None
974 }
975
976 fn consume_one_line_break(&mut self) {
977 match self.peek_char() {
978 Some('\n') => {
979 self.advance();
980 }
981 Some('\r') => {
982 self.advance();
983 if self.peek_char() == Some('\n') {
984 self.advance();
985 }
986 }
987 _ => {}
988 }
989 }
990
991 fn fetch_stream_end(&mut self) {
992 if self.stream_end_emitted {
993 return;
994 }
995 self.unwind_indent(-1);
996 for slot in self.simple_keys.iter_mut() {
1000 if let Some(key) = slot.take()
1001 && key.required
1002 {
1003 self.diagnostics.push(YamlDiagnostic {
1004 code: diagnostic_codes::LEX_REQUIRED_SIMPLE_KEY_NOT_FOUND,
1005 message: "could not find expected ':' for required simple key",
1006 byte_start: key.mark.index,
1007 byte_end: key.mark.index,
1008 });
1009 }
1010 }
1011 self.allow_simple_key = false;
1012 self.stream_end_emitted = true;
1013 let mark = self.cursor;
1014 self.tokens.push_back(Token {
1015 kind: TokenKind::StreamEnd,
1016 start: mark,
1017 end: mark,
1018 });
1019 }
1020
1021 fn check_block_entry(&self) -> bool {
1022 matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
1023 }
1024
1025 fn check_key(&self) -> bool {
1031 matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
1032 }
1033
1034 fn check_value(&self) -> bool {
1039 if self.flow_level > 0 {
1040 return true;
1041 }
1042 matches!(self.peek_at(1), None | Some(' ' | '\t' | '\n' | '\r'))
1043 }
1044
1045 fn add_indent(&mut self, column: i32) -> bool {
1049 if self.indent < column {
1050 self.indent_stack.push(self.indent);
1051 self.indent = column;
1052 true
1053 } else {
1054 false
1055 }
1056 }
1057
1058 fn unwind_indent(&mut self, column: i32) {
1061 if self.flow_level > 0 {
1062 return;
1063 }
1064 while self.indent > column {
1065 let mark = self.cursor;
1066 self.indent = self.indent_stack.pop().unwrap_or(-1);
1067 self.tokens.push_back(Token {
1068 kind: TokenKind::BlockEnd,
1069 start: mark,
1070 end: mark,
1071 });
1072 }
1073 }
1074
1075 fn save_simple_key(&mut self) {
1082 if !self.allow_simple_key {
1083 return;
1084 }
1085 let required = self.flow_level == 0 && self.indent == self.cursor.column as i32;
1086 self.remove_simple_key();
1087 let token_number = self.tokens_taken + self.tokens.len();
1088 self.simple_keys[self.flow_level] = Some(SimpleKey {
1089 token_number,
1090 required,
1091 mark: self.cursor,
1092 });
1093 }
1094
1095 fn remove_simple_key(&mut self) {
1100 if let Some(key) = self.simple_keys[self.flow_level].take()
1101 && key.required
1102 {
1103 self.diagnostics.push(YamlDiagnostic {
1104 code: diagnostic_codes::LEX_REQUIRED_SIMPLE_KEY_NOT_FOUND,
1105 message: "could not find expected ':' for required simple key",
1106 byte_start: key.mark.index,
1107 byte_end: key.mark.index,
1108 });
1109 }
1110 }
1111
1112 fn stale_simple_keys(&mut self) {
1117 let line = self.cursor.line;
1118 for slot in self.simple_keys.iter_mut() {
1119 let stale = match slot {
1120 Some(key) => key.mark.line != line,
1121 None => false,
1122 };
1123 if stale
1124 && let Some(key) = slot.take()
1125 && key.required
1126 {
1127 self.diagnostics.push(YamlDiagnostic {
1128 code: diagnostic_codes::LEX_REQUIRED_SIMPLE_KEY_NOT_FOUND,
1129 message: "could not find expected ':' for required simple key",
1130 byte_start: key.mark.index,
1131 byte_end: key.mark.index,
1132 });
1133 }
1134 }
1135 }
1136
1137 fn push_diagnostic(&mut self, code: &'static str, message: &'static str) {
1138 self.diagnostics.push(YamlDiagnostic {
1139 code,
1140 message,
1141 byte_start: self.cursor.index,
1142 byte_end: self.cursor.index,
1143 });
1144 }
1145
1146 fn check_document_indicator(&self, marker: &[u8; 3]) -> bool {
1150 let bytes = self.input.as_bytes();
1151 let i = self.cursor.index;
1152 if bytes.get(i..i + 3) != Some(marker.as_slice()) {
1153 return false;
1154 }
1155 matches!(bytes.get(i + 3), None | Some(b' ' | b'\t' | b'\n' | b'\r'))
1156 }
1157
1158 fn fetch_document_marker(&mut self, kind: TokenKind) {
1159 self.unwind_indent(-1);
1170 self.remove_simple_key();
1171 self.allow_simple_key = false;
1172 let start = self.cursor;
1173 self.advance();
1174 self.advance();
1175 self.advance();
1176 let end = self.cursor;
1177 self.tokens.push_back(Token { kind, start, end });
1178 }
1179
1180 fn fetch_directive(&mut self) {
1184 let start = self.cursor;
1185 debug_assert_eq!(self.peek_char(), Some('%'));
1186 self.advance();
1187 while let Some(c) = self.peek_char() {
1188 if c == '\n' || c == '\r' {
1189 break;
1190 }
1191 self.advance();
1192 }
1193 let end = self.cursor;
1194 self.tokens.push_back(Token {
1195 kind: TokenKind::Directive,
1196 start,
1197 end,
1198 });
1199 }
1200
1201 fn fetch_anchor(&mut self) {
1206 self.save_simple_key();
1207 self.allow_simple_key = false;
1208 let start = self.cursor;
1209 debug_assert_eq!(self.peek_char(), Some('&'));
1210 self.advance();
1211 self.scan_anchor_name();
1212 let end = self.cursor;
1213 self.tokens.push_back(Token {
1214 kind: TokenKind::Anchor,
1215 start,
1216 end,
1217 });
1218 }
1219
1220 fn fetch_alias(&mut self) {
1223 self.save_simple_key();
1224 self.allow_simple_key = false;
1225 let start = self.cursor;
1226 debug_assert_eq!(self.peek_char(), Some('*'));
1227 self.advance();
1228 self.scan_anchor_name();
1229 let end = self.cursor;
1230 self.tokens.push_back(Token {
1231 kind: TokenKind::Alias,
1232 start,
1233 end,
1234 });
1235 }
1236
1237 fn fetch_tag(&mut self) {
1243 self.save_simple_key();
1244 self.allow_simple_key = false;
1245 let start = self.cursor;
1246 debug_assert_eq!(self.peek_char(), Some('!'));
1247 self.advance();
1248 if self.peek_char() == Some('<') {
1249 self.advance();
1253 while let Some(c) = self.peek_char() {
1254 self.advance();
1255 if c == '>' {
1256 break;
1257 }
1258 }
1259 } else {
1260 while let Some(c) = self.peek_char() {
1264 match c {
1265 ' ' | '\t' | '\n' | '\r' => break,
1266 ',' | '[' | ']' | '{' | '}' if self.flow_level > 0 => break,
1267 _ => {
1268 self.advance();
1269 }
1270 }
1271 }
1272 }
1273 let end = self.cursor;
1274 self.tokens.push_back(Token {
1275 kind: TokenKind::Tag,
1276 start,
1277 end,
1278 });
1279 }
1280
1281 fn scan_anchor_name(&mut self) {
1286 while let Some(c) = self.peek_char() {
1287 match c {
1288 ' ' | '\t' | '\n' | '\r' => break,
1289 ',' | '[' | ']' | '{' | '}' => break,
1290 _ => {
1291 self.advance();
1292 }
1293 }
1294 }
1295 }
1296
1297 fn scan_trivia(&mut self) {
1301 while !self.at_eof() {
1302 match self.peek_char() {
1303 Some(' ' | '\t') => self.scan_whitespace_run(),
1304 Some('\n' | '\r') => self.scan_newline(),
1305 Some('#') => self.scan_comment(),
1306 _ => break,
1307 }
1308 }
1309 }
1310
1311 fn scan_whitespace_run(&mut self) {
1312 let start = self.cursor;
1313 while matches!(self.peek_char(), Some(' ' | '\t')) {
1314 self.advance();
1315 }
1316 let end = self.cursor;
1317 self.tokens.push_back(Token {
1318 kind: TokenKind::Trivia(TriviaKind::Whitespace),
1319 start,
1320 end,
1321 });
1322 }
1323
1324 fn scan_newline(&mut self) {
1325 let start = self.cursor;
1326 match self.peek_char() {
1327 Some('\n') => {
1328 self.advance();
1329 }
1330 Some('\r') => {
1331 self.advance();
1332 if self.peek_char() == Some('\n') {
1333 self.advance();
1334 }
1335 }
1336 _ => unreachable!("scan_newline called on non-newline char"),
1337 }
1338 let end = self.cursor;
1339 if self.flow_level == 0 {
1344 self.allow_simple_key = true;
1345 }
1346 self.tokens.push_back(Token {
1347 kind: TokenKind::Trivia(TriviaKind::Newline),
1348 start,
1349 end,
1350 });
1351 }
1352
1353 fn scan_comment(&mut self) {
1354 let start = self.cursor;
1355 debug_assert_eq!(self.peek_char(), Some('#'));
1356 self.advance();
1357 while let Some(c) = self.peek_char() {
1358 if c == '\n' || c == '\r' {
1359 break;
1360 }
1361 self.advance();
1362 }
1363 let end = self.cursor;
1364 self.tokens.push_back(Token {
1365 kind: TokenKind::Trivia(TriviaKind::Comment),
1366 start,
1367 end,
1368 });
1369 }
1370
1371 pub(crate) fn diagnostics(&self) -> &[YamlDiagnostic] {
1372 &self.diagnostics
1373 }
1374
1375 pub(crate) fn cursor(&self) -> Mark {
1376 self.cursor
1377 }
1378
1379 pub(crate) fn at_eof(&self) -> bool {
1380 self.cursor.index >= self.input.len()
1381 }
1382
1383 fn remaining(&self) -> &str {
1384 &self.input[self.cursor.index..]
1385 }
1386
1387 pub(crate) fn peek_char(&self) -> Option<char> {
1388 self.remaining().chars().next()
1389 }
1390
1391 pub(crate) fn peek_at(&self, offset: usize) -> Option<char> {
1394 self.remaining().chars().nth(offset)
1395 }
1396
1397 pub(crate) fn advance(&mut self) -> Option<char> {
1401 let c = self.peek_char()?;
1402 self.cursor.index += c.len_utf8();
1403 match c {
1404 '\n' => {
1405 self.cursor.line += 1;
1406 self.cursor.column = 0;
1407 }
1408 '\r' => {
1409 if self.peek_char() != Some('\n') {
1413 self.cursor.line += 1;
1414 self.cursor.column = 0;
1415 }
1416 }
1417 _ => {
1418 self.cursor.column += 1;
1419 }
1420 }
1421 Some(c)
1422 }
1423}
1424
/// Summary produced by [`shadow_scanner_check`] for one input.
#[derive(Debug, Clone)]
pub struct ShadowScannerReport {
    /// True when the non-stream tokens cover the input exactly: no overlap,
    /// no gap, and coverage reaching `input_len`.
    pub byte_complete: bool,
    /// Total number of tokens produced, stream markers included.
    pub token_count: usize,
    /// Codes of all diagnostics the scanner recorded, in order.
    pub diagnostic_codes: Vec<&'static str>,
    /// Highest end offset reached by any non-stream token.
    pub last_token_end: usize,
    /// Length of the input in bytes.
    pub input_len: usize,
    /// Offset up to which the input was covered when the first gap was
    /// detected, if any.
    pub gap_at: Option<usize>,
    /// True when some token started before the end of the preceding coverage.
    pub overlapping: bool,
}
1448
1449pub fn shadow_scanner_check(input: &str) -> ShadowScannerReport {
1454 let mut scanner = Scanner::new(input);
1455 let mut tokens = Vec::new();
1456 while let Some(tok) = scanner.next_token() {
1457 tokens.push(tok);
1458 }
1459 let mut cursor = 0usize;
1460 let mut overlapping = false;
1461 let mut gap_at: Option<usize> = None;
1462 for tok in &tokens {
1463 match tok.kind {
1464 TokenKind::StreamStart | TokenKind::StreamEnd => {}
1465 _ => {
1466 if tok.start.index < cursor {
1467 overlapping = true;
1468 } else if tok.start.index > cursor && gap_at.is_none() {
1469 gap_at = Some(cursor);
1470 }
1471 if tok.end.index > cursor {
1472 cursor = tok.end.index;
1473 }
1474 }
1475 }
1476 }
1477 let byte_complete = !overlapping && gap_at.is_none() && cursor == input.len();
1478 ShadowScannerReport {
1479 byte_complete,
1480 token_count: tokens.len(),
1481 diagnostic_codes: scanner.diagnostics.iter().map(|d| d.code).collect(),
1482 last_token_end: cursor,
1483 input_len: input.len(),
1484 gap_at,
1485 overlapping,
1486 }
1487}
1488
1489#[cfg(test)]
1490mod tests {
1491 use super::*;
1492
1493 #[test]
1494 fn empty_input_emits_stream_start_then_stream_end() {
1495 let mut scanner = Scanner::new("");
1496 assert_eq!(
1497 scanner.next_token().map(|t| t.kind),
1498 Some(TokenKind::StreamStart)
1499 );
1500 assert_eq!(
1501 scanner.next_token().map(|t| t.kind),
1502 Some(TokenKind::StreamEnd)
1503 );
1504 assert_eq!(scanner.next_token(), None);
1505 }
1506
1507 #[test]
1508 fn first_and_last_tokens_are_always_stream_markers() {
1509 let mut scanner = Scanner::new("foo: bar\n");
1510 assert_eq!(
1511 scanner.next_token().map(|t| t.kind),
1512 Some(TokenKind::StreamStart)
1513 );
1514 let mut last = None;
1515 while let Some(tok) = scanner.next_token() {
1516 last = Some(tok);
1517 }
1518 assert_eq!(last.map(|t| t.kind), Some(TokenKind::StreamEnd));
1519 }
1520
1521 #[test]
1522 fn stream_end_marks_cursor_position_after_trivia_only_input() {
1523 let input = " \n";
1524 let mut scanner = Scanner::new(input);
1525 let mut last = None;
1527 while let Some(tok) = scanner.next_token() {
1528 last = Some(tok);
1529 }
1530 let end = last.expect("stream end");
1531 assert_eq!(end.kind, TokenKind::StreamEnd);
1532 assert_eq!(end.start.index, input.len());
1533 assert_eq!(end.end.index, input.len());
1534 }
1535
1536 #[test]
1537 fn diagnostics_start_empty() {
1538 let scanner = Scanner::new("");
1539 assert!(scanner.diagnostics().is_empty());
1540 }
1541
1542 #[test]
1543 fn cursor_starts_at_origin() {
1544 let scanner = Scanner::new("anything");
1545 assert_eq!(
1546 scanner.cursor(),
1547 Mark {
1548 index: 0,
1549 line: 0,
1550 column: 0
1551 }
1552 );
1553 }
1554
1555 #[test]
1556 fn at_eof_is_true_for_empty_input() {
1557 let scanner = Scanner::new("");
1558 assert!(scanner.at_eof());
1559 assert_eq!(scanner.peek_char(), None);
1560 }
1561
1562 #[test]
1563 fn peek_does_not_advance_cursor() {
1564 let scanner = Scanner::new("abc");
1565 assert_eq!(scanner.peek_char(), Some('a'));
1566 assert_eq!(scanner.peek_at(1), Some('b'));
1567 assert_eq!(scanner.peek_at(2), Some('c'));
1568 assert_eq!(scanner.peek_at(3), None);
1569 assert_eq!(scanner.cursor().index, 0);
1570 }
1571
1572 #[test]
1573 fn advance_moves_through_ascii_one_column_per_char() {
1574 let mut scanner = Scanner::new("abc");
1575 assert_eq!(scanner.advance(), Some('a'));
1576 assert_eq!(
1577 scanner.cursor(),
1578 Mark {
1579 index: 1,
1580 line: 0,
1581 column: 1
1582 }
1583 );
1584 assert_eq!(scanner.advance(), Some('b'));
1585 assert_eq!(
1586 scanner.cursor(),
1587 Mark {
1588 index: 2,
1589 line: 0,
1590 column: 2
1591 }
1592 );
1593 assert_eq!(scanner.advance(), Some('c'));
1594 assert_eq!(
1595 scanner.cursor(),
1596 Mark {
1597 index: 3,
1598 line: 0,
1599 column: 3
1600 }
1601 );
1602 assert_eq!(scanner.advance(), None);
1603 assert!(scanner.at_eof());
1604 }
1605
1606 #[test]
1607 fn lf_increments_line_and_resets_column() {
1608 let mut scanner = Scanner::new("a\nb");
1609 scanner.advance(); scanner.advance(); assert_eq!(
1612 scanner.cursor(),
1613 Mark {
1614 index: 2,
1615 line: 1,
1616 column: 0
1617 }
1618 );
1619 scanner.advance(); assert_eq!(
1621 scanner.cursor(),
1622 Mark {
1623 index: 3,
1624 line: 1,
1625 column: 1
1626 }
1627 );
1628 }
1629
1630 #[test]
1631 fn crlf_counts_as_one_line_break() {
1632 let mut scanner = Scanner::new("a\r\nb");
1633 scanner.advance(); scanner.advance(); assert_eq!(scanner.cursor().line, 0);
1636 assert_eq!(scanner.cursor().index, 2);
1637 scanner.advance(); assert_eq!(
1639 scanner.cursor(),
1640 Mark {
1641 index: 3,
1642 line: 1,
1643 column: 0
1644 }
1645 );
1646 scanner.advance(); assert_eq!(
1648 scanner.cursor(),
1649 Mark {
1650 index: 4,
1651 line: 1,
1652 column: 1
1653 }
1654 );
1655 }
1656
1657 #[test]
1658 fn lone_cr_takes_its_own_line_break() {
1659 let mut scanner = Scanner::new("a\rb");
1660 scanner.advance(); scanner.advance(); assert_eq!(
1663 scanner.cursor(),
1664 Mark {
1665 index: 2,
1666 line: 1,
1667 column: 0
1668 }
1669 );
1670 scanner.advance(); assert_eq!(
1672 scanner.cursor(),
1673 Mark {
1674 index: 3,
1675 line: 1,
1676 column: 1
1677 }
1678 );
1679 }
1680
1681 #[test]
1682 fn multibyte_utf8_advances_index_by_byte_length_and_column_by_one() {
1683 let mut scanner = Scanner::new("é!");
1685 scanner.advance();
1686 assert_eq!(
1687 scanner.cursor(),
1688 Mark {
1689 index: 2,
1690 line: 0,
1691 column: 1
1692 }
1693 );
1694 scanner.advance();
1695 assert_eq!(
1696 scanner.cursor(),
1697 Mark {
1698 index: 3,
1699 line: 0,
1700 column: 2
1701 }
1702 );
1703 }
1704
1705 #[test]
1706 fn mixed_line_endings_track_correctly() {
1707 let mut scanner = Scanner::new("a\nb\r\nc\rd");
1709 while scanner.advance().is_some() {}
1710 assert_eq!(scanner.cursor().line, 3);
1711 assert_eq!(scanner.cursor().column, 1);
1712 assert_eq!(scanner.cursor().index, 8);
1713 }
1714
1715 fn collect_tokens(input: &str) -> Vec<Token> {
1716 let mut scanner = Scanner::new(input);
1717 let mut out = Vec::new();
1718 while let Some(tok) = scanner.next_token() {
1719 out.push(tok);
1720 }
1721 out
1722 }
1723
1724 fn trivia_kinds(tokens: &[Token]) -> Vec<TriviaKind> {
1725 tokens
1726 .iter()
1727 .filter_map(|t| match t.kind {
1728 TokenKind::Trivia(k) => Some(k),
1729 _ => None,
1730 })
1731 .collect()
1732 }
1733
1734 fn assert_byte_complete(input: &str, tokens: &[Token]) {
1735 let mut cursor = 0usize;
1738 for tok in tokens {
1739 match tok.kind {
1740 TokenKind::StreamStart | TokenKind::StreamEnd => {
1741 assert_eq!(tok.start.index, tok.end.index, "synthetic token has extent");
1742 }
1743 _ => {
1744 assert_eq!(tok.start.index, cursor, "token starts at expected position");
1745 assert!(tok.end.index >= tok.start.index);
1746 cursor = tok.end.index;
1747 }
1748 }
1749 }
1750 assert_eq!(cursor, input.len(), "all bytes covered");
1751 }
1752
1753 #[test]
1754 fn pure_whitespace_yields_one_whitespace_trivia_token() {
1755 let tokens = collect_tokens(" \t ");
1756 assert_eq!(
1757 trivia_kinds(&tokens),
1758 vec![TriviaKind::Whitespace],
1759 "whitespace coalesces into a single run"
1760 );
1761 assert_byte_complete(" \t ", &tokens);
1762 }
1763
1764 #[test]
1765 fn newline_emits_one_newline_per_logical_break() {
1766 let input = "\n\r\n\r";
1767 let tokens = collect_tokens(input);
1768 assert_eq!(
1769 trivia_kinds(&tokens),
1770 vec![
1771 TriviaKind::Newline,
1772 TriviaKind::Newline,
1773 TriviaKind::Newline
1774 ],
1775 );
1776 assert_byte_complete(input, &tokens);
1777 }
1778
1779 #[test]
1780 fn comment_runs_to_end_of_line_excluding_break() {
1781 let input = "# hello\n# next\n";
1782 let tokens = collect_tokens(input);
1783 assert_eq!(
1784 trivia_kinds(&tokens),
1785 vec![
1786 TriviaKind::Comment,
1787 TriviaKind::Newline,
1788 TriviaKind::Comment,
1789 TriviaKind::Newline,
1790 ],
1791 );
1792 let comment_tok = tokens
1794 .iter()
1795 .find(|t| matches!(t.kind, TokenKind::Trivia(TriviaKind::Comment)))
1796 .unwrap();
1797 assert_eq!(
1798 &input[comment_tok.start.index..comment_tok.end.index],
1799 "# hello"
1800 );
1801 assert_byte_complete(input, &tokens);
1802 }
1803
1804 #[test]
1805 fn whitespace_then_comment_then_newline_separates_into_three_tokens() {
1806 let input = " # comment\n";
1807 let tokens = collect_tokens(input);
1808 assert_eq!(
1809 trivia_kinds(&tokens),
1810 vec![
1811 TriviaKind::Whitespace,
1812 TriviaKind::Comment,
1813 TriviaKind::Newline
1814 ],
1815 );
1816 assert_byte_complete(input, &tokens);
1817 }
1818
1819 #[test]
1820 fn pure_trivia_input_round_trips_byte_complete() {
1821 let input = " \t# c1\r\n\n # c2\n\r";
1825 let tokens = collect_tokens(input);
1826 assert_byte_complete(input, &tokens);
1827 assert!(matches!(
1828 tokens.last().map(|t| t.kind),
1829 Some(TokenKind::StreamEnd),
1830 ));
1831 }
1832
1833 #[test]
1834 fn empty_input_emits_only_stream_markers() {
1835 let tokens = collect_tokens("");
1836 assert_eq!(tokens.len(), 2);
1837 assert_eq!(tokens[0].kind, TokenKind::StreamStart);
1838 assert_eq!(tokens[1].kind, TokenKind::StreamEnd);
1839 }
1840
1841 fn meaningful_kinds(tokens: &[Token]) -> Vec<TokenKind> {
1842 tokens
1843 .iter()
1844 .map(|t| t.kind)
1845 .filter(|k| !matches!(k, TokenKind::Trivia(_)))
1846 .collect()
1847 }
1848
1849 #[test]
1850 fn document_start_marker_at_column_zero_emits_token() {
1851 let input = "---\n";
1852 let tokens = collect_tokens(input);
1853 assert_eq!(
1854 meaningful_kinds(&tokens),
1855 vec![
1856 TokenKind::StreamStart,
1857 TokenKind::DocumentStart,
1858 TokenKind::StreamEnd
1859 ],
1860 );
1861 assert_byte_complete(input, &tokens);
1862 }
1863
1864 #[test]
1865 fn document_end_marker_at_column_zero_emits_token() {
1866 let input = "...\n";
1867 let tokens = collect_tokens(input);
1868 assert_eq!(
1869 meaningful_kinds(&tokens),
1870 vec![
1871 TokenKind::StreamStart,
1872 TokenKind::DocumentEnd,
1873 TokenKind::StreamEnd
1874 ],
1875 );
1876 assert_byte_complete(input, &tokens);
1877 }
1878
1879 #[test]
1880 fn document_marker_at_eof_without_trailing_break_still_emits() {
1881 let input = "---";
1882 let tokens = collect_tokens(input);
1883 assert_eq!(
1884 meaningful_kinds(&tokens),
1885 vec![
1886 TokenKind::StreamStart,
1887 TokenKind::DocumentStart,
1888 TokenKind::StreamEnd
1889 ],
1890 );
1891 }
1892
1893 #[test]
1894 fn three_dashes_followed_by_non_break_is_not_a_marker() {
1895 let tokens = collect_tokens("---abc\n");
1897 let kinds = meaningful_kinds(&tokens);
1898 assert!(!kinds.contains(&TokenKind::DocumentStart), "got {kinds:?}",);
1899 assert!(
1900 kinds.contains(&TokenKind::Scalar(ScalarStyle::Plain)),
1901 "got {kinds:?}",
1902 );
1903 }
1904
1905 #[test]
1906 fn three_dashes_indented_is_not_a_marker() {
1907 let tokens = collect_tokens(" ---\n");
1909 let kinds = meaningful_kinds(&tokens);
1910 assert!(!kinds.contains(&TokenKind::DocumentStart), "got {kinds:?}",);
1911 }
1912
1913 #[test]
1914 fn directive_at_column_zero_emits_directive_token() {
1915 let input = "%YAML 1.2\n";
1916 let tokens = collect_tokens(input);
1917 let directive = tokens
1918 .iter()
1919 .find(|t| matches!(t.kind, TokenKind::Directive))
1920 .expect("directive token");
1921 assert_eq!(
1922 &input[directive.start.index..directive.end.index],
1923 "%YAML 1.2",
1924 );
1925 assert_byte_complete(input, &tokens);
1926 }
1927
1928 #[test]
1929 fn directive_indented_is_not_recognized() {
1930 let tokens = collect_tokens(" %YAML 1.2\n");
1932 let kinds = meaningful_kinds(&tokens);
1933 assert!(!kinds.contains(&TokenKind::Directive), "got {kinds:?}",);
1934 }
1935
1936 #[test]
1937 fn document_start_then_marker_on_new_line() {
1938 let input = "---\n...\n";
1940 let tokens = collect_tokens(input);
1941 assert_eq!(
1942 meaningful_kinds(&tokens),
1943 vec![
1944 TokenKind::StreamStart,
1945 TokenKind::DocumentStart,
1946 TokenKind::DocumentEnd,
1947 TokenKind::StreamEnd,
1948 ],
1949 );
1950 assert_byte_complete(input, &tokens);
1951 }
1952
1953 #[test]
1954 fn directive_followed_by_doc_start_emits_both_in_order() {
1955 let input = "%YAML 1.2\n---\n";
1956 let tokens = collect_tokens(input);
1957 assert_eq!(
1958 meaningful_kinds(&tokens),
1959 vec![
1960 TokenKind::StreamStart,
1961 TokenKind::Directive,
1962 TokenKind::DocumentStart,
1963 TokenKind::StreamEnd,
1964 ],
1965 );
1966 assert_byte_complete(input, &tokens);
1967 }
1968
1969 #[test]
1970 fn document_marker_followed_by_space_emits_marker_then_content_scalar() {
1971 let input = "--- foo\n";
1972 let tokens = collect_tokens(input);
1973 let kinds = meaningful_kinds(&tokens);
1974 assert_eq!(kinds[0], TokenKind::StreamStart);
1975 assert_eq!(kinds[1], TokenKind::DocumentStart);
1976 assert_eq!(kinds[2], TokenKind::Scalar(ScalarStyle::Plain));
1978 assert_eq!(*kinds.last().unwrap(), TokenKind::StreamEnd);
1979 assert_byte_complete(input, &tokens);
1980 }
1981
1982 #[test]
1983 fn empty_flow_sequence_emits_start_then_end() {
1984 let input = "[]";
1985 let tokens = collect_tokens(input);
1986 assert_eq!(
1987 meaningful_kinds(&tokens),
1988 vec![
1989 TokenKind::StreamStart,
1990 TokenKind::FlowSequenceStart,
1991 TokenKind::FlowSequenceEnd,
1992 TokenKind::StreamEnd,
1993 ],
1994 );
1995 assert_byte_complete(input, &tokens);
1996 }
1997
1998 #[test]
1999 fn empty_flow_mapping_emits_start_then_end() {
2000 let input = "{}";
2001 let tokens = collect_tokens(input);
2002 assert_eq!(
2003 meaningful_kinds(&tokens),
2004 vec![
2005 TokenKind::StreamStart,
2006 TokenKind::FlowMappingStart,
2007 TokenKind::FlowMappingEnd,
2008 TokenKind::StreamEnd,
2009 ],
2010 );
2011 assert_byte_complete(input, &tokens);
2012 }
2013
2014 #[test]
2015 fn nested_flow_sequence_brackets_emit_in_order() {
2016 let input = "[[]]";
2017 let tokens = collect_tokens(input);
2018 assert_eq!(
2019 meaningful_kinds(&tokens),
2020 vec![
2021 TokenKind::StreamStart,
2022 TokenKind::FlowSequenceStart,
2023 TokenKind::FlowSequenceStart,
2024 TokenKind::FlowSequenceEnd,
2025 TokenKind::FlowSequenceEnd,
2026 TokenKind::StreamEnd,
2027 ],
2028 );
2029 assert_byte_complete(input, &tokens);
2030 }
2031
2032 #[test]
2033 fn nested_flow_mixed_brackets_emit_in_order() {
2034 let input = "[{}]";
2035 let tokens = collect_tokens(input);
2036 assert_eq!(
2037 meaningful_kinds(&tokens),
2038 vec![
2039 TokenKind::StreamStart,
2040 TokenKind::FlowSequenceStart,
2041 TokenKind::FlowMappingStart,
2042 TokenKind::FlowMappingEnd,
2043 TokenKind::FlowSequenceEnd,
2044 TokenKind::StreamEnd,
2045 ],
2046 );
2047 assert_byte_complete(input, &tokens);
2048 }
2049
2050 #[test]
2051 fn comma_inside_flow_emits_flow_entry() {
2052 let input = "[,,]";
2053 let tokens = collect_tokens(input);
2054 assert_eq!(
2055 meaningful_kinds(&tokens),
2056 vec![
2057 TokenKind::StreamStart,
2058 TokenKind::FlowSequenceStart,
2059 TokenKind::FlowEntry,
2060 TokenKind::FlowEntry,
2061 TokenKind::FlowSequenceEnd,
2062 TokenKind::StreamEnd,
2063 ],
2064 );
2065 assert_byte_complete(input, &tokens);
2066 }
2067
2068 #[test]
2069 fn comma_outside_flow_is_not_a_flow_entry() {
2070 let tokens = collect_tokens(",");
2072 let kinds = meaningful_kinds(&tokens);
2073 assert!(!kinds.contains(&TokenKind::FlowEntry), "got {kinds:?}");
2074 }
2075
2076 #[test]
2077 fn doc_markers_inside_flow_context_are_not_recognized() {
2078 let tokens = collect_tokens("[---]");
2081 let kinds = meaningful_kinds(&tokens);
2082 assert!(!kinds.contains(&TokenKind::DocumentStart), "got {kinds:?}");
2083 assert_eq!(kinds[1], TokenKind::FlowSequenceStart);
2084 }
2085
2086 #[test]
2087 fn flow_brackets_with_whitespace_emit_trivia_between() {
2088 let input = "[ , ]";
2089 let tokens = collect_tokens(input);
2090 assert_eq!(
2092 tokens
2093 .iter()
2094 .map(|t| t.kind)
2095 .filter(|k| !matches!(k, TokenKind::StreamStart | TokenKind::StreamEnd))
2096 .collect::<Vec<_>>(),
2097 vec![
2098 TokenKind::FlowSequenceStart,
2099 TokenKind::Trivia(TriviaKind::Whitespace),
2100 TokenKind::FlowEntry,
2101 TokenKind::Trivia(TriviaKind::Whitespace),
2102 TokenKind::FlowSequenceEnd,
2103 ],
2104 );
2105 assert_byte_complete(input, &tokens);
2106 }
2107
2108 #[test]
2109 fn block_mapping_implicit_key_splices_block_mapping_start_and_key() {
2110 let input = "key: value";
2114 let tokens = collect_tokens(input);
2115 assert_eq!(
2116 meaningful_kinds(&tokens),
2117 vec![
2118 TokenKind::StreamStart,
2119 TokenKind::BlockMappingStart,
2120 TokenKind::Key,
2121 TokenKind::Scalar(ScalarStyle::Plain),
2122 TokenKind::Value,
2123 TokenKind::Scalar(ScalarStyle::Plain),
2124 TokenKind::BlockEnd,
2125 TokenKind::StreamEnd,
2126 ],
2127 );
2128 assert_byte_complete(input, &tokens);
2129 }
2130
2131 #[test]
2132 fn block_sequence_emits_block_sequence_start_then_entries() {
2133 let input = "- a\n- b\n";
2134 let tokens = collect_tokens(input);
2135 assert_eq!(
2136 meaningful_kinds(&tokens),
2137 vec![
2138 TokenKind::StreamStart,
2139 TokenKind::BlockSequenceStart,
2140 TokenKind::BlockEntry,
2141 TokenKind::Scalar(ScalarStyle::Plain),
2142 TokenKind::BlockEntry,
2143 TokenKind::Scalar(ScalarStyle::Plain),
2144 TokenKind::BlockEnd,
2145 TokenKind::StreamEnd,
2146 ],
2147 );
2148 assert_byte_complete(input, &tokens);
2149 }
2150
2151 #[test]
2152 fn explicit_key_indicator_emits_key_and_value_without_splice() {
2153 let input = "? a\n: b\n";
2157 let tokens = collect_tokens(input);
2158 let kinds = meaningful_kinds(&tokens);
2159 assert_eq!(
2160 kinds,
2161 vec![
2162 TokenKind::StreamStart,
2163 TokenKind::BlockMappingStart,
2164 TokenKind::Key,
2165 TokenKind::Scalar(ScalarStyle::Plain),
2166 TokenKind::Value,
2167 TokenKind::Scalar(ScalarStyle::Plain),
2168 TokenKind::BlockEnd,
2169 TokenKind::StreamEnd,
2170 ],
2171 );
2172 assert_byte_complete(input, &tokens);
2173 }
2174
2175 #[test]
2176 fn multi_line_plain_scalar_does_not_confirm_simple_key_on_next_line() {
2177 let input = "a\nb: c\n";
2183 let tokens = collect_tokens(input);
2184 let kinds = meaningful_kinds(&tokens);
2185 let scalar_pos = kinds
2188 .iter()
2189 .position(|&k| k == TokenKind::Scalar(ScalarStyle::Plain))
2190 .expect("plain scalar present");
2191 if let Some(key_pos) = kinds.iter().position(|&k| k == TokenKind::Key) {
2192 assert!(
2193 scalar_pos < key_pos,
2194 "multi-line scalar must precede any key: {kinds:?}",
2195 );
2196 }
2197 let scalar = tokens
2199 .iter()
2200 .find(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2201 .unwrap();
2202 assert_eq!(&input[scalar.start.index..scalar.end.index], "a\nb");
2203 }
2204
2205 #[test]
2206 fn flow_mapping_with_implicit_key_emits_only_flow_indicators() {
2207 let input = "{a: b}";
2210 let tokens = collect_tokens(input);
2211 let kinds = meaningful_kinds(&tokens);
2212 assert_eq!(
2213 kinds,
2214 vec![
2215 TokenKind::StreamStart,
2216 TokenKind::FlowMappingStart,
2217 TokenKind::Key,
2218 TokenKind::Scalar(ScalarStyle::Plain),
2219 TokenKind::Value,
2220 TokenKind::Scalar(ScalarStyle::Plain),
2221 TokenKind::FlowMappingEnd,
2222 TokenKind::StreamEnd,
2223 ],
2224 );
2225 assert!(
2226 !kinds.contains(&TokenKind::BlockMappingStart),
2227 "got {kinds:?}",
2228 );
2229 assert_byte_complete(input, &tokens);
2230 }
2231
2232 #[test]
2233 fn flow_explicit_key_indicator_emits_key_token() {
2234 let input = "{? a: b}";
2237 let tokens = collect_tokens(input);
2238 let kinds = meaningful_kinds(&tokens);
2239 assert_eq!(kinds[0], TokenKind::StreamStart);
2240 assert_eq!(kinds[1], TokenKind::FlowMappingStart);
2241 assert_eq!(kinds[2], TokenKind::Key);
2242 assert!(kinds.contains(&TokenKind::Value));
2245 assert_byte_complete(input, &tokens);
2246 }
2247
2248 #[test]
2249 fn nested_block_mapping_emits_block_end_on_dedent() {
2250 let input = "outer:\n inner: x\ny: z\n";
2256 let tokens = collect_tokens(input);
2257 let kinds = meaningful_kinds(&tokens);
2258 let block_ends = kinds.iter().filter(|&&k| k == TokenKind::BlockEnd).count();
2259 assert_eq!(block_ends, 2, "got {kinds:?}");
2262 assert_byte_complete(input, &tokens);
2263 }
2264
2265 #[test]
2266 fn nested_block_sequence_inside_mapping_unwinds_correctly() {
2267 let input = "items:\n - a\n - b\nstatus: ok\n";
2275 let tokens = collect_tokens(input);
2276 let kinds = meaningful_kinds(&tokens);
2277 let key_positions: Vec<_> = kinds
2280 .iter()
2281 .enumerate()
2282 .filter_map(|(i, &k)| (k == TokenKind::Key).then_some(i))
2283 .collect();
2284 assert_eq!(key_positions.len(), 2, "expected 2 keys: {kinds:?}");
2285 let second_key = key_positions[1];
2286 let preceding_block_end = kinds[..second_key]
2287 .iter()
2288 .rposition(|&k| k == TokenKind::BlockEnd);
2289 assert!(
2290 preceding_block_end.is_some(),
2291 "BlockEnd must precede second key: {kinds:?}",
2292 );
2293 let n = kinds.len();
2295 assert_eq!(kinds[n - 1], TokenKind::StreamEnd);
2296 assert_eq!(kinds[n - 2], TokenKind::BlockEnd);
2297 assert_byte_complete(input, &tokens);
2298 }
2299
2300 #[test]
2301 fn value_indicator_with_no_simple_key_emits_block_mapping_start() {
2302 let input = ": value\n";
2306 let tokens = collect_tokens(input);
2307 let kinds = meaningful_kinds(&tokens);
2308 assert_eq!(kinds[0], TokenKind::StreamStart);
2309 assert_eq!(kinds[1], TokenKind::BlockMappingStart);
2310 assert_eq!(kinds[2], TokenKind::Value);
2311 assert!(!kinds[..3].contains(&TokenKind::Key), "got {kinds:?}",);
2313 assert_byte_complete(input, &tokens);
2314 }
2315
2316 #[test]
2317 fn block_mapping_unwinds_indents_at_stream_end() {
2318 let input = "a:\n b: c";
2323 let tokens = collect_tokens(input);
2324 let kinds = meaningful_kinds(&tokens);
2325 let n = kinds.len();
2327 assert_eq!(kinds[n - 1], TokenKind::StreamEnd);
2328 assert_eq!(kinds[n - 2], TokenKind::BlockEnd);
2329 assert_eq!(kinds[n - 3], TokenKind::BlockEnd);
2330 assert_byte_complete(input, &tokens);
2331 }
2332
2333 #[test]
2334 fn colon_inside_plain_scalar_token_does_not_break_scalar() {
2335 let input = "https://example.com";
2338 let tokens = collect_tokens(input);
2339 let scalar = tokens
2340 .iter()
2341 .find(|t| matches!(t.kind, TokenKind::Scalar(_)))
2342 .expect("plain scalar token");
2343 assert_eq!(
2344 &input[scalar.start.index..scalar.end.index],
2345 "https://example.com",
2346 );
2347 assert_byte_complete(input, &tokens);
2348 }
2349
2350 #[test]
2351 fn diagnostics_remain_empty_for_well_formed_inputs() {
2352 for input in ["key: value", "- a\n- b\n", "{a: b, c: d}", "? k\n: v\n"] {
2353 let mut scanner = Scanner::new(input);
2354 while scanner.next_token().is_some() {}
2355 assert!(
2356 scanner.diagnostics().is_empty(),
2357 "{input:?} produced unexpected diagnostics: {:?}",
2358 scanner.diagnostics(),
2359 );
2360 }
2361 }
2362
2363 fn find_scalar(tokens: &[Token]) -> &Token {
2364 tokens
2365 .iter()
2366 .find(|t| matches!(t.kind, TokenKind::Scalar(_)))
2367 .expect("expected scalar token")
2368 }
2369
2370 #[test]
2371 fn single_quoted_scalar_emits_token_spanning_quotes() {
2372 let input = "'hello'";
2373 let tokens = collect_tokens(input);
2374 let scalar = find_scalar(&tokens);
2375 assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::SingleQuoted));
2376 assert_eq!(&input[scalar.start.index..scalar.end.index], "'hello'");
2377 assert_byte_complete(input, &tokens);
2378 }
2379
2380 #[test]
2381 fn double_quoted_scalar_emits_token_spanning_quotes() {
2382 let input = "\"hello\"";
2383 let tokens = collect_tokens(input);
2384 let scalar = find_scalar(&tokens);
2385 assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::DoubleQuoted));
2386 assert_eq!(&input[scalar.start.index..scalar.end.index], "\"hello\"");
2387 assert_byte_complete(input, &tokens);
2388 }
2389
2390 #[test]
2391 fn single_quoted_scalar_treats_doubled_quote_as_escape() {
2392 let input = "'it''s'";
2395 let tokens = collect_tokens(input);
2396 let scalars: Vec<_> = tokens
2397 .iter()
2398 .filter(|t| matches!(t.kind, TokenKind::Scalar(_)))
2399 .collect();
2400 assert_eq!(scalars.len(), 1, "got {:?}", tokens);
2401 assert_eq!(
2402 &input[scalars[0].start.index..scalars[0].end.index],
2403 "'it''s'",
2404 );
2405 }
2406
2407 #[test]
2408 fn double_quoted_scalar_with_escaped_quote_does_not_terminate_early() {
2409 let input = "\"a\\\"b\"";
2412 let tokens = collect_tokens(input);
2413 let scalars: Vec<_> = tokens
2414 .iter()
2415 .filter(|t| matches!(t.kind, TokenKind::Scalar(_)))
2416 .collect();
2417 assert_eq!(scalars.len(), 1, "got {tokens:?}");
2418 assert_eq!(
2419 &input[scalars[0].start.index..scalars[0].end.index],
2420 "\"a\\\"b\"",
2421 );
2422 assert_byte_complete(input, &tokens);
2423 }
2424
2425 #[test]
2426 fn double_quoted_scalar_recognises_common_single_byte_escapes() {
2427 let input = "\"\\n\\t\\r\\0\\\\\\\"\"";
2429 let tokens = collect_tokens(input);
2430 let scalar = find_scalar(&tokens);
2431 assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::DoubleQuoted));
2432 assert_eq!(scalar.start.index, 0);
2434 assert_eq!(scalar.end.index, input.len());
2435 let mut scanner = Scanner::new(input);
2436 while scanner.next_token().is_some() {}
2437 assert!(scanner.diagnostics().is_empty());
2438 }
2439
2440 #[test]
2441 fn double_quoted_scalar_recognises_hex_escapes() {
2442 let input = "\"\\x41\\u00E9\\U0001F600\"";
2444 let mut scanner = Scanner::new(input);
2445 while scanner.next_token().is_some() {}
2446 assert!(
2447 scanner.diagnostics().is_empty(),
2448 "got {:?}",
2449 scanner.diagnostics()
2450 );
2451 }
2452
2453 #[test]
2454 fn double_quoted_scalar_with_invalid_escape_emits_diagnostic() {
2455 let input = "\"\\q\"";
2456 let mut scanner = Scanner::new(input);
2457 while scanner.next_token().is_some() {}
2458 assert_eq!(
2459 scanner.diagnostics().len(),
2460 1,
2461 "got {:?}",
2462 scanner.diagnostics(),
2463 );
2464 assert_eq!(
2465 scanner.diagnostics()[0].code,
2466 diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE,
2467 );
2468 }
2469
2470 #[test]
2471 fn double_quoted_scalar_with_short_hex_escape_emits_diagnostic() {
2472 let input = "\"\\x4\"";
2475 let mut scanner = Scanner::new(input);
2476 while scanner.next_token().is_some() {}
2477 assert!(
2478 scanner
2479 .diagnostics()
2480 .iter()
2481 .any(|d| d.code == diagnostic_codes::LEX_INVALID_DOUBLE_QUOTED_ESCAPE),
2482 "got {:?}",
2483 scanner.diagnostics(),
2484 );
2485 }
2486
2487 #[test]
2488 fn double_quoted_scalar_spans_multiple_lines() {
2489 let input = "\"line1\nline2\"";
2491 let tokens = collect_tokens(input);
2492 let scalar = find_scalar(&tokens);
2493 assert_eq!(scalar.kind, TokenKind::Scalar(ScalarStyle::DoubleQuoted));
2494 assert_eq!(scalar.start.index, 0);
2498 assert_eq!(scalar.end.index, input.len());
2499 }
2500
2501 #[test]
2502 fn line_continuation_escape_consumes_newline_inside_quoted_scalar() {
2503 let input = "\"a\\\nb\"";
2506 let mut scanner = Scanner::new(input);
2507 while scanner.next_token().is_some() {}
2508 assert!(
2509 scanner.diagnostics().is_empty(),
2510 "got {:?}",
2511 scanner.diagnostics(),
2512 );
2513 }
2514
2515 #[test]
2516 fn unterminated_quoted_scalar_emits_diagnostic() {
2517 for input in ["'oops", "\"oops"] {
2518 let mut scanner = Scanner::new(input);
2519 while scanner.next_token().is_some() {}
2520 assert!(
2521 scanner
2522 .diagnostics()
2523 .iter()
2524 .any(|d| d.code == diagnostic_codes::LEX_UNTERMINATED_QUOTED_SCALAR),
2525 "{input:?} produced {:?}",
2526 scanner.diagnostics(),
2527 );
2528 }
2529 }
2530
2531 #[test]
2532 fn quoted_scalar_can_be_implicit_key() {
2533 let input = "\"key\": value";
2534 let tokens = collect_tokens(input);
2535 let kinds = meaningful_kinds(&tokens);
2536 assert_eq!(
2537 kinds,
2538 vec![
2539 TokenKind::StreamStart,
2540 TokenKind::BlockMappingStart,
2541 TokenKind::Key,
2542 TokenKind::Scalar(ScalarStyle::DoubleQuoted),
2543 TokenKind::Value,
2544 TokenKind::Scalar(ScalarStyle::Plain),
2545 TokenKind::BlockEnd,
2546 TokenKind::StreamEnd,
2547 ],
2548 );
2549 assert_byte_complete(input, &tokens);
2550 }
2551
2552 #[test]
2553 fn multi_line_quoted_scalar_cannot_be_implicit_key() {
2554 let input = "\"line1\nline2\": value\n";
2559 let tokens = collect_tokens(input);
2560 let kinds = meaningful_kinds(&tokens);
2561 assert_eq!(kinds[0], TokenKind::StreamStart);
2565 assert_eq!(kinds[1], TokenKind::Scalar(ScalarStyle::DoubleQuoted));
2566 assert_eq!(kinds[2], TokenKind::BlockMappingStart);
2567 assert_eq!(kinds[3], TokenKind::Value);
2568 assert!(!kinds[..3].contains(&TokenKind::Key), "got {kinds:?}",);
2569 }
2570
2571 #[test]
2572 fn quoted_scalar_inside_flow_mapping_terminates_at_closing_quote() {
2573 let input = "{\"a\": \"b\"}";
2574 let tokens = collect_tokens(input);
2575 let kinds = meaningful_kinds(&tokens);
2576 assert_eq!(
2577 kinds,
2578 vec![
2579 TokenKind::StreamStart,
2580 TokenKind::FlowMappingStart,
2581 TokenKind::Key,
2582 TokenKind::Scalar(ScalarStyle::DoubleQuoted),
2583 TokenKind::Value,
2584 TokenKind::Scalar(ScalarStyle::DoubleQuoted),
2585 TokenKind::FlowMappingEnd,
2586 TokenKind::StreamEnd,
2587 ],
2588 );
2589 assert_byte_complete(input, &tokens);
2590 }
2591
2592 #[test]
2593 fn literal_block_scalar_at_top_level_spans_to_eof() {
2594 let input = "|\n hello\n world\n";
2595 let tokens = collect_tokens(input);
2596 let scalar = tokens
2597 .iter()
2598 .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2599 .expect("literal scalar");
2600 assert_eq!(scalar.start.index, 0);
2603 assert_eq!(scalar.end.index, input.len());
2604 assert_byte_complete(input, &tokens);
2605 }
2606
2607 #[test]
2608 fn folded_block_scalar_emits_folded_style() {
2609 let input = ">\n hello\n";
2610 let tokens = collect_tokens(input);
2611 assert!(
2612 tokens
2613 .iter()
2614 .any(|t| t.kind == TokenKind::Scalar(ScalarStyle::Folded)),
2615 "got {tokens:?}",
2616 );
2617 }
2618
2619 #[test]
2620 fn block_scalar_terminates_on_dedent_to_parent_indent() {
2621 let input = "key: |\n line1\n line2\nnext: x\n";
2630 let tokens = collect_tokens(input);
2631 let kinds = meaningful_kinds(&tokens);
2632 let scalar = tokens
2635 .iter()
2636 .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2637 .expect("literal scalar");
2638 let next_idx = input.find("next:").expect("next key in fixture");
2639 assert!(
2640 scalar.end.index <= next_idx,
2641 "scalar should end before `next:` at {next_idx}: scalar ends at {}",
2642 scalar.end.index,
2643 );
2644 let key_count = kinds.iter().filter(|&&k| k == TokenKind::Key).count();
2646 assert_eq!(key_count, 2, "got {kinds:?}");
2647 }
2648
2649 #[test]
2650 fn block_scalar_with_keep_chomping_indicator_in_header() {
2651 let input = "|+\n text\n\n";
2652 let tokens = collect_tokens(input);
2653 let scalar = tokens
2654 .iter()
2655 .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2656 .expect("literal scalar");
2657 assert_eq!(scalar.start.index, 0);
2660 assert_eq!(scalar.end.index, input.len());
2661 assert_byte_complete(input, &tokens);
2662 }
2663
2664 #[test]
2665 fn block_scalar_with_explicit_indent_indicator_uses_that_indent() {
2666 let input = "key: |2\n hi\nbye: x\n";
2670 let tokens = collect_tokens(input);
2671 let scalar = tokens
2672 .iter()
2673 .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2674 .expect("literal scalar");
2675 let bye_idx = input.find("bye:").expect("bye key in fixture");
2676 assert!(
2677 scalar.end.index <= bye_idx,
2678 "scalar must end before `bye`: {} vs {}",
2679 scalar.end.index,
2680 bye_idx,
2681 );
2682 assert_byte_complete(input, &tokens);
2683 }
2684
2685 #[test]
2686 fn block_scalar_at_eof_without_trailing_newline_still_emits() {
2687 let input = "|\n text";
2688 let tokens = collect_tokens(input);
2689 let scalar = tokens
2690 .iter()
2691 .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2692 .expect("literal scalar");
2693 assert_eq!(scalar.end.index, input.len());
2694 }
2695
2696 #[test]
2697 fn block_scalar_with_internal_blank_lines_includes_them() {
2698 let input = "|\n a\n\n b\n";
2700 let tokens = collect_tokens(input);
2701 let scalar = tokens
2702 .iter()
2703 .find(|t| t.kind == TokenKind::Scalar(ScalarStyle::Literal))
2704 .expect("literal scalar");
2705 assert_eq!(scalar.end.index, input.len());
2706 assert_byte_complete(input, &tokens);
2707 }
2708
2709 #[test]
2710 fn pipe_inside_flow_context_is_part_of_plain_scalar_not_block() {
2711 let input = "[|]";
2713 let tokens = collect_tokens(input);
2714 let kinds = meaningful_kinds(&tokens);
2715 assert!(
2718 !kinds.contains(&TokenKind::Scalar(ScalarStyle::Literal)),
2719 "got {kinds:?}",
2720 );
2721 assert_eq!(kinds[1], TokenKind::FlowSequenceStart);
2722 assert!(kinds.contains(&TokenKind::Scalar(ScalarStyle::Plain)));
2723 }
2724
2725 #[test]
2726 fn block_scalar_terminates_on_document_marker() {
2727 let input = "|\n text\n---\nnext\n";
2728 let tokens = collect_tokens(input);
2729 let kinds = meaningful_kinds(&tokens);
2730 assert!(kinds.contains(&TokenKind::DocumentStart), "got {kinds:?}");
2732 }
2733
2734 #[test]
2735 fn plain_scalar_with_internal_whitespace_is_one_token() {
2736 let input = "hello world";
2737 let tokens = collect_tokens(input);
2738 let scalars: Vec<_> = tokens
2739 .iter()
2740 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2741 .collect();
2742 assert_eq!(scalars.len(), 1, "got {tokens:?}");
2743 assert_eq!(
2744 &input[scalars[0].start.index..scalars[0].end.index],
2745 "hello world",
2746 );
2747 assert_byte_complete(input, &tokens);
2748 }
2749
2750 #[test]
2751 fn plain_scalar_with_multiple_internal_spaces_is_one_token() {
2752 let input = "a b c";
2753 let tokens = collect_tokens(input);
2754 let scalars: Vec<_> = tokens
2755 .iter()
2756 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2757 .collect();
2758 assert_eq!(scalars.len(), 1, "got {tokens:?}");
2759 assert_eq!(
2760 &input[scalars[0].start.index..scalars[0].end.index],
2761 "a b c",
2762 );
2763 }
2764
2765 #[test]
2766 fn plain_scalar_drops_trailing_whitespace_before_eof() {
2767 let input = "hello ";
2769 let tokens = collect_tokens(input);
2770 let scalar = tokens
2771 .iter()
2772 .find(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2773 .expect("plain scalar");
2774 assert_eq!(&input[scalar.start.index..scalar.end.index], "hello");
2775 assert!(
2777 tokens
2778 .iter()
2779 .any(|t| t.kind == TokenKind::Trivia(TriviaKind::Whitespace)),
2780 "expected trailing whitespace as trivia: {tokens:?}",
2781 );
2782 assert_byte_complete(input, &tokens);
2783 }
2784
2785 #[test]
2786 fn plain_scalar_drops_trailing_whitespace_before_comment() {
2787 let input = "hello # comment";
2790 let tokens = collect_tokens(input);
2791 let scalar = tokens
2792 .iter()
2793 .find(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2794 .expect("plain scalar");
2795 assert_eq!(&input[scalar.start.index..scalar.end.index], "hello");
2796 assert!(
2797 tokens
2798 .iter()
2799 .any(|t| t.kind == TokenKind::Trivia(TriviaKind::Comment)),
2800 "expected comment trivia: {tokens:?}",
2801 );
2802 }
2803
2804 #[test]
2805 fn colon_inside_url_does_not_break_plain_scalar() {
2806 let input = "url: https://example.com\n";
2809 let tokens = collect_tokens(input);
2810 let scalars: Vec<_> = tokens
2811 .iter()
2812 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2813 .map(|t| &input[t.start.index..t.end.index])
2814 .collect();
2815 assert_eq!(scalars, vec!["url", "https://example.com"]);
2816 }
2817
2818 #[test]
2819 fn multi_line_plain_scalar_continues_under_indent() {
2820 let input = "key: hello\n world\n";
2823 let tokens = collect_tokens(input);
2824 let plain_scalars: Vec<_> = tokens
2825 .iter()
2826 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2827 .collect();
2828 assert_eq!(plain_scalars.len(), 2, "got {tokens:?}");
2830 let value = plain_scalars[1];
2832 assert!(
2833 input[value.start.index..value.end.index].contains("hello"),
2834 "scalar text: {:?}",
2835 &input[value.start.index..value.end.index],
2836 );
2837 assert!(
2838 input[value.start.index..value.end.index].contains("world"),
2839 "scalar text: {:?}",
2840 &input[value.start.index..value.end.index],
2841 );
2842 }
2843
2844 #[test]
2845 fn plain_scalar_terminates_at_blank_line_continuation() {
2846 let input = "key: hello\n\n world\n";
2848 let tokens = collect_tokens(input);
2849 let plain_scalars: Vec<_> = tokens
2850 .iter()
2851 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2852 .map(|t| &input[t.start.index..t.end.index])
2853 .collect();
2854 let merged = plain_scalars.iter().any(|s| s.contains("world"));
2859 assert!(
2860 merged || plain_scalars.contains(&"world"),
2861 "got {plain_scalars:?}"
2862 );
2863 }
2864
2865 #[test]
2866 fn plain_scalar_terminates_on_dedent() {
2867 let input = "outer:\n hello\nnext: x\n";
2871 let tokens = collect_tokens(input);
2872 let kinds = meaningful_kinds(&tokens);
2873 let key_count = kinds.iter().filter(|&&k| k == TokenKind::Key).count();
2875 assert_eq!(key_count, 2, "got {kinds:?}");
2876 let plain_count = kinds
2878 .iter()
2879 .filter(|&&k| k == TokenKind::Scalar(ScalarStyle::Plain))
2880 .count();
2881 assert_eq!(plain_count, 4, "got {kinds:?}");
2882 }
2883
2884 #[test]
2885 fn plain_scalar_terminates_on_following_block_entry_indicator() {
2886 let input = "outer:\n - a\n - b\n";
2890 let tokens = collect_tokens(input);
2891 let kinds = meaningful_kinds(&tokens);
2892 let block_entry_count = kinds
2896 .iter()
2897 .filter(|&&k| k == TokenKind::BlockEntry)
2898 .count();
2899 assert!(block_entry_count >= 1, "got {kinds:?}");
2900 }
2901
2902 #[test]
2903 fn more_indented_dash_line_folds_into_plain_scalar() {
2904 let input = "- single multiline\n - sequence entry\n";
2910 let tokens = collect_tokens(input);
2911 let kinds = meaningful_kinds(&tokens);
2912 let block_entry_count = kinds
2913 .iter()
2914 .filter(|&&k| k == TokenKind::BlockEntry)
2915 .count();
2916 assert_eq!(block_entry_count, 1, "got {kinds:?}");
2917 let plain_scalars: Vec<_> = tokens
2918 .iter()
2919 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2920 .collect();
2921 assert_eq!(plain_scalars.len(), 1, "got {tokens:?}");
2922 let value = plain_scalars[0];
2923 assert_eq!(
2924 &input[value.start.index..value.end.index],
2925 "single multiline\n - sequence entry",
2926 );
2927 }
2928
2929 #[test]
2930 fn flow_context_plain_scalar_does_not_absorb_terminator_line_break() {
2931 let input = "{a: 42\n}\n";
2936 let tokens = collect_tokens(input);
2937 let scalars: Vec<_> = tokens
2938 .iter()
2939 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2940 .map(|t| &input[t.start.index..t.end.index])
2941 .collect();
2942 assert!(scalars.contains(&"42"), "got {scalars:?}");
2943 assert_byte_complete(input, &tokens);
2944 }
2945
2946 #[test]
2947 fn plain_scalar_in_flow_context_terminates_on_flow_indicators() {
2948 let input = "[a b, c]";
2949 let tokens = collect_tokens(input);
2950 let plain_scalars: Vec<_> = tokens
2951 .iter()
2952 .filter(|t| matches!(t.kind, TokenKind::Scalar(ScalarStyle::Plain)))
2953 .map(|t| &input[t.start.index..t.end.index])
2954 .collect();
2955 assert_eq!(plain_scalars, vec!["a b", "c"]);
2958 }
2959
2960 #[test]
2961 fn multi_line_plain_scalar_does_not_register_as_simple_key() {
2962 let input = "hello\n world: value\n";
2970 let tokens = collect_tokens(input);
2971 let kinds = meaningful_kinds(&tokens);
2972 let scalar_pos = kinds
2974 .iter()
2975 .position(|&k| k == TokenKind::Scalar(ScalarStyle::Plain));
2976 let key_pos = kinds.iter().position(|&k| k == TokenKind::Key);
2977 assert!(scalar_pos.is_some(), "no scalar: {kinds:?}");
2978 if let Some(k) = key_pos {
2985 let s = scalar_pos.unwrap();
2986 assert!(s < k, "multi-line scalar must precede any key: {kinds:?}",);
2987 }
2988 }
2989
2990 #[test]
2991 fn plain_scalar_preserves_single_line_simple_key_behaviour() {
2992 let input = "hello world: value\n";
2996 let tokens = collect_tokens(input);
2997 let kinds = meaningful_kinds(&tokens);
2998 assert_eq!(
2999 kinds,
3000 vec![
3001 TokenKind::StreamStart,
3002 TokenKind::BlockMappingStart,
3003 TokenKind::Key,
3004 TokenKind::Scalar(ScalarStyle::Plain),
3005 TokenKind::Value,
3006 TokenKind::Scalar(ScalarStyle::Plain),
3007 TokenKind::BlockEnd,
3008 TokenKind::StreamEnd,
3009 ],
3010 );
3011 }
3012}