1#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use alloc::{
13 borrow::{Cow, ToOwned},
14 collections::VecDeque,
15 string::String,
16 vec::Vec,
17};
18use core::{char, fmt};
19
20use crate::{
21 char_traits::{
22 as_hex, is_anchor_char, is_blank_or_breakz, is_bom, is_break, is_breakz, is_flow, is_hex,
23 is_tag_char, is_uri_char,
24 },
25 input::{BorrowedInput, SkipTabs},
26};
27
/// Maximum distance, in characters, between the start of a candidate simple key and the
/// current scanner position. `stale_simple_keys` stops treating a candidate as possible once
/// the distance exceeds this value.
const SIMPLE_KEY_MAX_LOOKAHEAD: usize = 1024;
30
/// Character encoding carried by [`TokenType::StreamStart`].
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
    /// UTF-8 encoding.
    Utf8,
}
37
/// Presentation style of a scalar, carried by [`TokenType::Scalar`].
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash, PartialOrd, Ord)]
pub enum ScalarStyle {
    /// Plain (unquoted) scalar.
    Plain,
    /// Scalar written between single quotes.
    SingleQuoted,
    /// Scalar written between double quotes.
    DoubleQuoted,

    /// Literal block scalar.
    Literal,
    /// Folded block scalar.
    Folded,
}
62
/// Character and byte offsets of a position in the input.
///
/// Equality only looks at the character offset: two values with the same `chars` compare
/// equal even when their `bytes` differ (or when only one of them has a byte offset).
#[derive(Clone, Copy, Debug, Default)]
pub struct MarkerOffsets {
    /// Offset counted in characters.
    chars: usize,
    /// Offset counted in bytes, when it is known.
    bytes: Option<usize>,
}

impl PartialEq for MarkerOffsets {
    fn eq(&self, rhs: &Self) -> bool {
        // `bytes` is deliberately left out of the comparison.
        let Self { chars: lhs_chars, .. } = *self;
        let Self { chars: rhs_chars, .. } = *rhs;
        lhs_chars == rhs_chars
    }
}

impl Eq for MarkerOffsets {}
87
88#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
90pub struct Marker {
91 offsets: MarkerOffsets,
93 line: usize,
95 col: usize,
97}
98
99impl Marker {
100 #[must_use]
102 pub fn new(index: usize, line: usize, col: usize) -> Marker {
103 Marker {
104 offsets: MarkerOffsets {
105 chars: index,
106 bytes: None,
107 },
108 line,
109 col,
110 }
111 }
112
113 #[must_use]
115 pub fn with_byte_offset(mut self, byte_offset: Option<usize>) -> Marker {
116 self.offsets.bytes = byte_offset;
117 self
118 }
119
120 #[must_use]
122 pub fn index(&self) -> usize {
123 self.offsets.chars
124 }
125
126 #[must_use]
128 pub fn byte_offset(&self) -> Option<usize> {
129 self.offsets.bytes
130 }
131
132 #[must_use]
134 pub fn line(&self) -> usize {
135 self.line
136 }
137
138 #[must_use]
140 pub fn col(&self) -> usize {
141 self.col
142 }
143}
144
145#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
147pub struct Span {
148 pub start: Marker,
150 pub end: Marker,
152
153 pub indent: Option<usize>,
158
159 pub tag_start: Option<Marker>,
168}
169
170impl Span {
171 #[must_use]
173 pub fn new(start: Marker, end: Marker) -> Span {
174 Span {
175 start,
176 end,
177 indent: None,
178 tag_start: None,
179 }
180 }
181
182 #[must_use]
189 pub fn empty(mark: Marker) -> Span {
190 Span {
191 start: mark,
192 end: mark,
193 indent: None,
194 tag_start: None,
195 }
196 }
197
198 #[must_use]
200 pub fn with_indent(mut self, indent: Option<usize>) -> Span {
201 self.indent = indent;
202 self
203 }
204
205 #[must_use]
207 pub fn with_tag_start(mut self, tag_start: Option<Marker>) -> Span {
208 self.tag_start = tag_start;
209 self
210 }
211
212 #[must_use]
218 pub fn tag_start(&self) -> Option<Marker> {
219 self.tag_start
220 }
221
222 #[must_use]
224 pub fn len(&self) -> usize {
225 self.end.index() - self.start.index()
226 }
227
228 #[must_use]
230 pub fn is_empty(&self) -> bool {
231 self.len() == 0
232 }
233
234 #[must_use]
236 pub fn byte_range(&self) -> Option<core::ops::Range<usize>> {
237 let start = self.start.byte_offset()?;
238 let end = self.end.byte_offset()?;
239 Some(start..end)
240 }
241
242 #[must_use]
245 pub fn slice<'source>(&self, source: &'source str) -> Option<&'source str> {
246 source.get(self.byte_range()?)
247 }
248}
249
/// Placement of a comment relative to surrounding content.
///
/// The comment scanner in this file assigns [`Placement::Free`] to a comment that starts while
/// only whitespace has been seen on its line, and [`Placement::Right`] otherwise.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub enum Placement {
    /// The comment sits above the item it belongs to.
    Above,
    /// The comment follows other content on the same line.
    Right,
    /// The comment is not attached to anything in particular (the default).
    #[default]
    Free,
    /// NOTE(review): presumably the last comment of a container or document; confirm against
    /// the code that assigns it.
    Last,
}
288
289#[derive(Clone, PartialEq, Debug, Eq)]
295pub struct Comment<'input> {
296 pub span: Span,
298 pub text: Cow<'input, str>,
302 pub placement: Placement,
304}
305
306impl<'input> Comment<'input> {
307 #[must_use]
312 pub fn new(span: Span, text: impl Into<Cow<'input, str>>) -> Self {
313 Self {
314 span,
315 text: text.into(),
316 placement: Placement::Free,
317 }
318 }
319
320 #[must_use]
322 pub fn with_placement(mut self, placement: Placement) -> Self {
323 self.placement = placement;
324 self
325 }
326
327 #[must_use]
331 pub fn trimmed_text(&self) -> &str {
332 self.text.trim()
333 }
334}
335
336impl AsRef<str> for Comment<'_> {
337 fn as_ref(&self) -> &str {
338 self.text.as_ref()
339 }
340}
341
342#[derive(Clone, PartialEq, Debug, Eq)]
344pub struct ScanError {
345 mark: Marker,
347 info: String,
349}
350
351impl ScanError {
352 #[must_use]
354 #[cold]
355 pub fn new(loc: Marker, info: String) -> ScanError {
356 ScanError { mark: loc, info }
357 }
358
359 #[must_use]
361 #[cold]
362 pub fn new_str(loc: Marker, info: &str) -> ScanError {
363 ScanError {
364 mark: loc,
365 info: info.to_owned(),
366 }
367 }
368
369 #[cold]
370 pub(crate) fn into_result<T>(self) -> Result<T, ScanError> {
371 Err(self)
372 }
373
374 #[must_use]
376 pub fn marker(&self) -> &Marker {
377 &self.mark
378 }
379
380 #[must_use]
382 pub fn info(&self) -> &str {
383 self.info.as_ref()
384 }
385}
386
387impl fmt::Display for ScanError {
388 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
389 write!(
390 f,
391 "{} at char {} line {} column {}",
392 self.info,
393 self.mark.index(),
394 self.mark.line(),
395 self.mark.col() + 1
396 )
397 }
398}
399
400impl core::error::Error for ScanError {}
401
/// The kind of a [`Token`], together with any data it carries.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType<'input> {
    /// Start of the stream, with its encoding.
    StreamStart(TEncoding),
    /// End of the stream.
    StreamEnd,
    /// A version directive.
    VersionDirective(
        /// Major version number.
        u32,
        /// Minor version number.
        u32,
    ),
    /// A tag directive.
    TagDirective(
        /// Tag handle.
        Cow<'input, str>,
        /// Tag prefix.
        Cow<'input, str>,
    ),
    /// Document start marker.
    DocumentStart,
    /// Document end marker.
    DocumentEnd,
    /// Start of a block sequence.
    BlockSequenceStart,
    /// Start of a block mapping.
    BlockMappingStart,
    /// End of a block collection.
    BlockEnd,
    /// Start of a flow sequence (`[`).
    FlowSequenceStart,
    /// End of a flow sequence (`]`).
    FlowSequenceEnd,
    /// Start of a flow mapping (`{`).
    FlowMappingStart,
    /// End of a flow mapping (`}`).
    FlowMappingEnd,
    /// Entry of a block sequence (`-`).
    BlockEntry,
    /// Entry separator of a flow collection (`,`).
    FlowEntry,
    /// Key of a mapping.
    Key,
    /// Value of a mapping.
    Value,
    /// An alias, with its name.
    Alias(Cow<'input, str>),
    /// An anchor, with its name.
    Anchor(Cow<'input, str>),
    /// A tag.
    Tag(
        /// Tag handle.
        Cow<'input, str>,
        /// Tag suffix.
        Cow<'input, str>,
    ),
    /// A scalar, with its style and value.
    Scalar(ScalarStyle, Cow<'input, str>),
    /// A comment.
    Comment(
        /// The comment, including its span and placement.
        Comment<'input>,
    ),
    /// A directive other than the version and tag directives.
    ReservedDirective(
        /// Directive name.
        String,
        /// Directive parameters.
        Vec<String>,
    ),
}
482
/// A token produced by the scanner: where it was found and what it is.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token<'input>(
    /// Region of the input covered by the token.
    pub Span,
    /// Kind of the token and the data it carries.
    pub TokenType<'input>,
);
491
492#[derive(Clone, PartialEq, Debug, Eq)]
497pub(crate) struct QueuedComment<'input> {
498 pub(crate) text: Cow<'input, str>,
499 pub(crate) placement: Placement,
500}
501
502impl<'input> QueuedComment<'input> {
503 fn into_public(self, span: Span) -> Comment<'input> {
504 Comment::new(span, self.text).with_placement(self.placement)
505 }
506}
507
508impl<'input> From<Comment<'input>> for QueuedComment<'input> {
509 fn from(comment: Comment<'input>) -> Self {
510 Self {
511 text: comment.text,
512 placement: comment.placement,
513 }
514 }
515}
516
/// Queue-side mirror of [`TokenType`].
///
/// Every variant matches the public variant of the same name; the only difference is that
/// `Comment` carries a [`QueuedComment`], which has no span of its own.
#[derive(Clone, PartialEq, Debug, Eq)]
pub(crate) enum QueuedTokenType<'input> {
    StreamStart(TEncoding),
    StreamEnd,
    VersionDirective(u32, u32),
    TagDirective(Cow<'input, str>, Cow<'input, str>),
    DocumentStart,
    DocumentEnd,
    BlockSequenceStart,
    BlockMappingStart,
    BlockEnd,
    FlowSequenceStart,
    FlowSequenceEnd,
    FlowMappingStart,
    FlowMappingEnd,
    BlockEntry,
    FlowEntry,
    Key,
    Value,
    Alias(Cow<'input, str>),
    Anchor(Cow<'input, str>),
    Tag(Cow<'input, str>, Cow<'input, str>),
    Scalar(ScalarStyle, Cow<'input, str>),
    /// Comment without a span; the span is taken from the enclosing `QueuedToken`.
    Comment(QueuedComment<'input>),
    ReservedDirective(String, Vec<String>),
}
547
548impl<'input> QueuedTokenType<'input> {
549 fn into_public(self, span: Span) -> TokenType<'input> {
550 match self {
551 Self::StreamStart(encoding) => TokenType::StreamStart(encoding),
552 Self::StreamEnd => TokenType::StreamEnd,
553 Self::VersionDirective(major, minor) => TokenType::VersionDirective(major, minor),
554 Self::TagDirective(handle, prefix) => TokenType::TagDirective(handle, prefix),
555 Self::DocumentStart => TokenType::DocumentStart,
556 Self::DocumentEnd => TokenType::DocumentEnd,
557 Self::BlockSequenceStart => TokenType::BlockSequenceStart,
558 Self::BlockMappingStart => TokenType::BlockMappingStart,
559 Self::BlockEnd => TokenType::BlockEnd,
560 Self::FlowSequenceStart => TokenType::FlowSequenceStart,
561 Self::FlowSequenceEnd => TokenType::FlowSequenceEnd,
562 Self::FlowMappingStart => TokenType::FlowMappingStart,
563 Self::FlowMappingEnd => TokenType::FlowMappingEnd,
564 Self::BlockEntry => TokenType::BlockEntry,
565 Self::FlowEntry => TokenType::FlowEntry,
566 Self::Key => TokenType::Key,
567 Self::Value => TokenType::Value,
568 Self::Alias(name) => TokenType::Alias(name),
569 Self::Anchor(name) => TokenType::Anchor(name),
570 Self::Tag(handle, suffix) => TokenType::Tag(handle, suffix),
571 Self::Scalar(style, value) => TokenType::Scalar(style, value),
572 Self::Comment(comment) => TokenType::Comment(comment.into_public(span)),
573 Self::ReservedDirective(name, params) => TokenType::ReservedDirective(name, params),
574 }
575 }
576}
577
578impl<'input> From<TokenType<'input>> for QueuedTokenType<'input> {
579 fn from(token: TokenType<'input>) -> Self {
580 match token {
581 TokenType::StreamStart(encoding) => Self::StreamStart(encoding),
582 TokenType::StreamEnd => Self::StreamEnd,
583 TokenType::VersionDirective(major, minor) => Self::VersionDirective(major, minor),
584 TokenType::TagDirective(handle, prefix) => Self::TagDirective(handle, prefix),
585 TokenType::DocumentStart => Self::DocumentStart,
586 TokenType::DocumentEnd => Self::DocumentEnd,
587 TokenType::BlockSequenceStart => Self::BlockSequenceStart,
588 TokenType::BlockMappingStart => Self::BlockMappingStart,
589 TokenType::BlockEnd => Self::BlockEnd,
590 TokenType::FlowSequenceStart => Self::FlowSequenceStart,
591 TokenType::FlowSequenceEnd => Self::FlowSequenceEnd,
592 TokenType::FlowMappingStart => Self::FlowMappingStart,
593 TokenType::FlowMappingEnd => Self::FlowMappingEnd,
594 TokenType::BlockEntry => Self::BlockEntry,
595 TokenType::FlowEntry => Self::FlowEntry,
596 TokenType::Key => Self::Key,
597 TokenType::Value => Self::Value,
598 TokenType::Alias(name) => Self::Alias(name),
599 TokenType::Anchor(name) => Self::Anchor(name),
600 TokenType::Tag(handle, suffix) => Self::Tag(handle, suffix),
601 TokenType::Scalar(style, value) => Self::Scalar(style, value),
602 TokenType::Comment(comment) => Self::Comment(comment.into()),
603 TokenType::ReservedDirective(name, params) => Self::ReservedDirective(name, params),
604 }
605 }
606}
607
608#[derive(Clone, PartialEq, Debug, Eq)]
610pub(crate) struct QueuedToken<'input>(pub(crate) Span, pub(crate) QueuedTokenType<'input>);
611
612impl<'input> QueuedToken<'input> {
613 fn into_public(self) -> Token<'input> {
614 Token(self.0, self.1.into_public(self.0))
615 }
616}
617
618impl<'input> From<Token<'input>> for QueuedToken<'input> {
619 fn from(token: Token<'input>) -> Self {
620 Self(token.0, token.1.into())
621 }
622}
623
624#[derive(Clone, PartialEq, Debug, Eq)]
659struct SimpleKey {
660 possible: bool,
673 required: bool,
682 token_number: usize,
688 mark: Marker,
690}
691
692impl SimpleKey {
693 fn new(mark: Marker) -> SimpleKey {
695 SimpleKey {
696 possible: false,
697 required: false,
698 token_number: 0,
699 mark,
700 }
701 }
702}
703
/// An entry of the scanner's indentation stack (`Scanner::indents`).
#[derive(Clone, Debug, Default)]
struct Indent {
    /// The indentation level; signed like `Scanner::indent`, which starts at `-1`.
    indent: isize,
    /// NOTE(review): presumably whether leaving this level must emit a `BlockEnd` token;
    /// confirm against the indentation unrolling code.
    needs_block_end: bool,
}
728
/// State of implicit-mapping detection for one flow level
/// (see `Scanner::implicit_flow_mapping_states`).
#[derive(Debug, PartialEq)]
enum ImplicitMappingState {
    /// An implicit mapping may start here.
    Possible,
    /// Currently inside an implicit mapping.
    /// NOTE(review): the meaning of the `u8` payload is not visible in this part of the file.
    Inside(u8),
}
762
/// The scanner: turns an input into a stream of [`Token`]s.
///
/// It implements `Iterator<Item = Token>`; iteration ends at the end of the stream or at the
/// first error, which can then be read back with `get_error`.
#[derive(Debug)]
#[allow(clippy::struct_excessive_bools)]
pub struct Scanner<'input, T> {
    /// The input being scanned.
    input: T,
    /// Current position in the input; kept up to date by the `skip_*` helpers.
    mark: Marker,
    /// Tokens scanned but not yet handed out.
    tokens: VecDeque<QueuedToken<'input>>,
    /// Error that stopped iteration, if any.
    error: Option<ScanError>,
    /// Error held back while a comment token sits at the front of the queue.
    deferred_error: Option<ScanError>,
    /// Value of `input.may_contain_comments()` captured at construction.
    comments_possible: bool,

    /// Whether the stream-start token has been produced.
    stream_start_produced: bool,
    /// Whether the stream-end token has been handed out.
    stream_end_produced: bool,
    /// Whether a byte order mark may still be skipped; cleared once content is fetched.
    document_prefix_allowed: bool,
    /// Character index at which a `:` in flow context is accepted as a value indicator
    /// even without a following blank or flow character.
    adjacent_value_allowed_at: usize,
    /// Whether a simple key may start at the current position.
    simple_key_allowed: bool,
    /// Candidate simple keys.
    simple_keys: smallvec::SmallVec<[SimpleKey; 8]>,
    /// Current indentation level; starts at `-1`.
    indent: isize,
    /// Stack of enclosing indentation levels.
    indents: smallvec::SmallVec<[Indent; 8]>,
    /// Flow nesting depth; `0` is tested wherever block-only syntax is handled.
    flow_level: u8,
    /// Number of tokens handed out so far.
    tokens_parsed: usize,
    /// Whether the queue is known to hold a token that can be handed out.
    token_available: bool,
    /// Whether only whitespace has been consumed since the last line break.
    leading_whitespace: bool,
    /// NOTE(review): presumably one flag per flow level; confirm against the flow handling.
    flow_mapping_started: smallvec::SmallVec<[bool; 8]>,
    /// Implicit-mapping detection states (see [`ImplicitMappingState`]).
    implicit_flow_mapping_states: smallvec::SmallVec<[ImplicitMappingState; 8]>,
    /// Position recorded when a comment interrupted a plain scalar; consumed by
    /// `skip_to_next_token`, which may report it as an error.
    interrupted_plain_by_comment: Option<Marker>,
    /// While set, a tab met by `skip_to_next_token` is an error; cleared at the first
    /// character other than space, line break or `#`.
    explicit_key_tab_check_pending: bool,
    /// Positions and characters of flow brackets.
    /// NOTE(review): presumably the currently open ones; confirm.
    flow_markers: smallvec::SmallVec<[(Marker, char); 8]>,
    /// Reusable scratch buffer (leading line break).
    buf_leading_break: String,
    /// Reusable scratch buffer (trailing line breaks).
    buf_trailing_breaks: String,
    /// Reusable scratch buffer (whitespace).
    buf_whitespaces: String,
}
865
866impl<'input, T: BorrowedInput<'input>> Iterator for Scanner<'input, T> {
867 type Item = Token<'input>;
868
869 fn next(&mut self) -> Option<Self::Item> {
870 if self.error.is_some() {
871 return None;
872 }
873 match self.next_token() {
874 Ok(Some(tok)) => {
875 debug_print!(
876 " \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
877 tok.1,
878 tok.0
879 );
880 Some(tok)
881 }
882 Ok(tok) => tok,
883 Err(e) => self.stop_after_error(e),
884 }
885 }
886}
887
/// Result of a scanning step that produces no value: `Ok(())` or a [`ScanError`].
pub type ScanResult = Result<(), ScanError>;
890
/// Accumulator for a scalar that is either a byte range of the input or an owned string.
#[derive(Debug)]
enum FlowScalarBuf {
    /// The scalar is the byte range `start..end`, plus an optional run of whitespace that
    /// has been seen but not yet accepted into the range.
    Borrowed {
        /// First byte of the range.
        start: usize,
        /// End of the accepted range.
        end: usize,
        /// Start of the pending whitespace run, if there is one.
        pending_ws_start: Option<usize>,
        /// End of the pending whitespace run.
        pending_ws_end: usize,
    },
    /// The scalar content has been copied into a `String`.
    Owned(String),
}

impl FlowScalarBuf {
    /// Creates an empty borrowed buffer positioned at `start`, with no pending whitespace.
    #[inline]
    fn new_borrowed(start: usize) -> Self {
        FlowScalarBuf::Borrowed {
            start,
            end: start,
            pending_ws_end: start,
            pending_ws_start: None,
        }
    }

    /// Creates an empty owned buffer.
    #[inline]
    fn new_owned() -> Self {
        FlowScalarBuf::Owned(String::new())
    }

    /// Returns the owned string, or `None` while the buffer is still borrowed.
    #[inline]
    fn as_owned_mut(&mut self) -> Option<&mut String> {
        if let Self::Owned(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Accepts pending whitespace: the range end moves to the end of the pending run.
    ///
    /// Does nothing on an owned buffer or when no whitespace is pending.
    #[inline]
    fn commit_pending_ws(&mut self) {
        match self {
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                // `take` clears the pending marker as part of the test.
                if pending_ws_start.take().is_some() {
                    *end = *pending_ws_end;
                }
            }
            Self::Owned(_) => {}
        }
    }

    /// Records a whitespace run `ws_start..ws_end` as pending.
    ///
    /// The start of an already pending run is kept; the end is always updated. Does nothing
    /// on an owned buffer.
    #[inline]
    fn note_pending_ws(&mut self, ws_start: usize, ws_end: usize) {
        match self {
            Self::Borrowed {
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_start = pending_ws_start.or(Some(ws_start));
                *pending_ws_end = ws_end;
            }
            Self::Owned(_) => {}
        }
    }

    /// Drops pending whitespace and resets the pending end to the accepted range end.
    ///
    /// Does nothing on an owned buffer.
    #[inline]
    fn discard_pending_ws(&mut self) {
        match self {
            Self::Borrowed {
                end,
                pending_ws_start,
                pending_ws_end,
                ..
            } => {
                *pending_ws_end = *end;
                *pending_ws_start = None;
            }
            Self::Owned(_) => {}
        }
    }
}
976
977impl<'input, T: BorrowedInput<'input>> Scanner<'input, T> {
978 #[inline]
979 fn promote_flow_scalar_buf_to_owned(
980 &self,
981 start_mark: &Marker,
982 buf: &mut FlowScalarBuf,
983 ) -> Result<(), ScanError> {
984 let FlowScalarBuf::Borrowed {
985 start,
986 end,
987 pending_ws_start: _,
988 pending_ws_end: _,
989 } = *buf
990 else {
991 return Ok(());
992 };
993
994 let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
995 ScanError::new_str(
996 *start_mark,
997 "internal error: input advertised offsets but did not provide a slice",
998 )
999 })?;
1000 *buf = FlowScalarBuf::Owned(slice.to_owned());
1001 Ok(())
1002 }
    /// Asks the input for the byte range `start..end` as a slice that lives as long as
    /// `'input`. Returns whatever `slice_borrowed` returns, i.e. `None` when the input
    /// cannot lend such a slice.
    #[inline]
    fn try_borrow_slice(&self, start: usize, end: usize) -> Option<&'input str> {
        self.input.slice_borrowed(start, end)
    }
1012
1013 fn scan_tag_handle_directive_cow(
1018 &mut self,
1019 mark: &Marker,
1020 ) -> Result<Cow<'input, str>, ScanError> {
1021 let Some(start) = self.input.byte_offset() else {
1022 return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1023 };
1024
1025 if self.input.look_ch() != '!' {
1026 return Err(ScanError::new_str(
1027 *mark,
1028 "while scanning a tag, did not find expected '!'",
1029 ));
1030 }
1031
1032 self.skip_non_blank();
1034
1035 self.input.lookahead(1);
1038 while self.input.next_is_alpha() {
1039 self.skip_non_blank();
1040 self.input.lookahead(1);
1041 }
1042
1043 if self.input.peek() == '!' {
1045 self.skip_non_blank();
1046 }
1047
1048 let Some(end) = self.input.byte_offset() else {
1049 return Ok(Cow::Owned(self.scan_tag_handle(true, mark)?));
1051 };
1052
1053 let Some(slice) = self.try_borrow_slice(start, end) else {
1054 let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1056 ScanError::new_str(
1057 *mark,
1058 "internal error: input advertised slicing but did not provide a slice",
1059 )
1060 })?;
1061 if !slice.ends_with('!') && slice != "!" {
1062 return Err(ScanError::new_str(
1063 *mark,
1064 "while parsing a tag directive, did not find expected '!'",
1065 ));
1066 }
1067 return Ok(Cow::Owned(slice.to_owned()));
1068 };
1069
1070 if !slice.ends_with('!') && slice != "!" {
1071 return Err(ScanError::new_str(
1072 *mark,
1073 "while parsing a tag directive, did not find expected '!'",
1074 ));
1075 }
1076
1077 Ok(Cow::Borrowed(slice))
1078 }
1079
1080 fn scan_tag_prefix_directive_cow(
1085 &mut self,
1086 start_mark: &Marker,
1087 ) -> Result<Cow<'input, str>, ScanError> {
1088 let Some(start) = self.input.byte_offset() else {
1089 return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1090 };
1091
1092 if self.input.look_ch() == '!' {
1094 self.skip_non_blank();
1095 } else if !is_tag_char(self.input.peek()) {
1096 return Err(ScanError::new_str(
1097 *start_mark,
1098 "invalid global tag character",
1099 ));
1100 } else if self.input.peek() == '%' {
1101 } else {
1103 self.skip_non_blank();
1104 }
1105
1106 while is_uri_char(self.input.look_ch()) {
1108 if self.input.peek() == '%' {
1109 break;
1110 }
1111 self.skip_non_blank();
1112 }
1113
1114 if self.input.peek() == '%' {
1116 let current = self
1117 .input
1118 .byte_offset()
1119 .expect("byte_offset() must remain available once enabled");
1120 let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
1121 slice.to_owned()
1122 } else {
1123 String::new()
1124 };
1125
1126 while is_uri_char(self.input.look_ch()) {
1127 if self.input.peek() == '%' {
1128 out.push(self.scan_uri_escapes(start_mark)?);
1129 } else {
1130 out.push(self.input.peek());
1131 self.skip_non_blank();
1132 }
1133 }
1134 return Ok(Cow::Owned(out));
1135 }
1136
1137 let Some(end) = self.input.byte_offset() else {
1138 return Ok(Cow::Owned(self.scan_tag_prefix(start_mark)?));
1139 };
1140
1141 let Some(slice) = self.try_borrow_slice(start, end) else {
1142 let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
1144 ScanError::new_str(
1145 *start_mark,
1146 "internal error: input advertised slicing but did not provide a slice",
1147 )
1148 })?;
1149 return Ok(Cow::Owned(slice.to_owned()));
1150 };
1151
1152 Ok(Cow::Borrowed(slice))
1153 }
1154 pub fn new(input: T) -> Self {
1156 let initial_byte_offset = input.byte_offset();
1157 let comments_possible = input.may_contain_comments();
1158 Scanner {
1159 input,
1160 mark: Marker::new(0, 1, 0).with_byte_offset(initial_byte_offset),
1161 tokens: VecDeque::with_capacity(64),
1162 error: None,
1163 deferred_error: None,
1164 comments_possible,
1165
1166 stream_start_produced: false,
1167 stream_end_produced: false,
1168 document_prefix_allowed: true,
1169 adjacent_value_allowed_at: 0,
1170 simple_key_allowed: true,
1171 simple_keys: smallvec::SmallVec::new(),
1172 indent: -1,
1173 indents: smallvec::SmallVec::new(),
1174 flow_level: 0,
1175 tokens_parsed: 0,
1176 token_available: false,
1177 leading_whitespace: true,
1178 flow_mapping_started: smallvec::SmallVec::new(),
1179 implicit_flow_mapping_states: smallvec::SmallVec::new(),
1180 flow_markers: smallvec::SmallVec::new(),
1181 interrupted_plain_by_comment: None,
1182 explicit_key_tab_check_pending: false,
1183
1184 buf_leading_break: String::with_capacity(128),
1185 buf_trailing_breaks: String::with_capacity(128),
1186 buf_whitespaces: String::with_capacity(128),
1187 }
1188 }
1189
1190 #[inline]
1195 pub fn get_error(&self) -> Option<ScanError> {
1196 self.error.clone().or_else(|| self.deferred_error.clone())
1197 }
1198
    /// Stores `error` so that the iterator stops, and returns `None`.
    #[cold]
    fn stop_after_error(&mut self, error: ScanError) -> Option<Token<'input>> {
        self.error = Some(error);
        None
    }
1204
    /// Builds the "simple key expected" error at the current position.
    #[cold]
    fn simple_key_expected(&self) -> ScanError {
        ScanError::new_str(self.mark, "simple key expected")
    }
1209
    /// Builds the error reporting that `bracket`, found at `mark`, was never closed.
    #[cold]
    fn unclosed_bracket(mark: Marker, bracket: char) -> ScanError {
        ScanError::new(mark, format!("unclosed bracket '{bracket}'"))
    }
1214
1215 #[inline]
1217 fn skip_blank(&mut self) {
1218 self.input.skip();
1219
1220 self.mark.offsets.chars += 1;
1221 self.mark.col += 1;
1222 self.mark.offsets.bytes = self.input.byte_offset();
1223 }
1224
1225 #[inline]
1227 fn skip_non_blank(&mut self) {
1228 self.input.skip();
1229
1230 self.mark.offsets.chars += 1;
1231 self.mark.col += 1;
1232 self.mark.offsets.bytes = self.input.byte_offset();
1233 self.leading_whitespace = false;
1234 }
1235
1236 #[inline]
1241 fn skip_bom(&mut self) {
1242 self.input.skip();
1243
1244 self.mark.offsets.chars += 1;
1245 self.mark.offsets.bytes = self.input.byte_offset();
1246 }
1247
1248 #[inline]
1254 fn skip_comment_char(&mut self) {
1255 self.input.skip();
1256
1257 self.mark.offsets.chars += 1;
1258 self.mark.col += 1;
1259 self.mark.offsets.bytes = self.input.byte_offset();
1260 }
1261
1262 #[inline]
1264 fn skip_n_non_blank(&mut self, count: usize) {
1265 for _ in 0..count {
1266 self.input.skip();
1267 self.mark.offsets.chars += 1;
1268 self.mark.col += 1;
1269 }
1270 self.mark.offsets.bytes = self.input.byte_offset();
1271 self.leading_whitespace = false;
1272 }
1273
1274 #[inline]
1276 fn skip_nl(&mut self) {
1277 self.input.skip();
1278
1279 self.mark.offsets.chars += 1;
1280 self.mark.col = 0;
1281 self.mark.line += 1;
1282 self.mark.offsets.bytes = self.input.byte_offset();
1283 self.leading_whitespace = true;
1284 }
1285
1286 #[inline]
1288 fn skip_linebreak(&mut self) {
1289 if self.input.next_2_are('\r', '\n') {
1290 self.skip_blank();
1293 self.skip_nl();
1294 } else if self.input.next_is_break() {
1295 self.skip_nl();
1296 }
1297 }
1298
1299 #[cfg(test)]
1300 fn scan_comment_token(&mut self) -> Result<Token<'input>, ScanError> {
1301 Ok(self.scan_comment_queued_token()?.into_public())
1302 }
1303
1304 fn scan_comment_queued_token(&mut self) -> Result<QueuedToken<'input>, ScanError> {
1305 let start_mark = self.mark;
1306 debug_assert_eq!(self.input.peek(), '#');
1307 let placement = if self.leading_whitespace {
1308 Placement::Free
1309 } else {
1310 Placement::Right
1311 };
1312
1313 self.skip_comment_char();
1314
1315 let text = if let Some(start) = self.input.byte_offset() {
1316 let n = self.input.skip_while_non_breakz();
1318 self.mark.offsets.chars += n;
1319 self.mark.col += n;
1320 let byte_offset = self.input.byte_offset();
1321 self.mark.offsets.bytes = byte_offset;
1322 let end = byte_offset.expect("byte_offset must remain available once enabled");
1323
1324 if let Some(slice) = self.try_borrow_slice(start, end) {
1325 Cow::Borrowed(slice)
1326 } else if let Some(slice) = self.input.slice_bytes(start, end) {
1327 Cow::Owned(slice.to_owned())
1329 } else {
1330 return Err(ScanError::new_str(
1331 start_mark,
1332 "internal error: input advertised offsets but did not provide a slice",
1333 ));
1334 }
1335 } else {
1336 let mut owned = String::new();
1338 while !is_breakz(self.input.look_ch()) {
1339 owned.push(self.input.peek());
1340 self.skip_comment_char();
1341 }
1342 Cow::Owned(owned)
1343 };
1344
1345 let end_mark = self.mark;
1346 let span = Span::new(start_mark, end_mark);
1347 Ok(QueuedToken(
1348 span,
1349 QueuedTokenType::Comment(QueuedComment { text, placement }),
1350 ))
1351 }
1352
1353 fn push_comment_token(&mut self) -> ScanResult {
1354 let token = self.scan_comment_queued_token()?;
1355 self.tokens.push_back(token);
1356 Ok(())
1357 }
1358
1359 fn skip_comment(&mut self) {
1360 debug_assert_eq!(self.input.peek(), '#');
1361
1362 self.skip_comment_char();
1363 let n = self.input.skip_while_non_breakz();
1364 self.mark.offsets.chars += n;
1365 self.mark.col += n;
1366 self.mark.offsets.bytes = self.input.byte_offset();
1367 }
1368
    /// Whether the stream-start token has been produced.
    #[inline]
    pub fn stream_started(&self) -> bool {
        self.stream_start_produced
    }

    /// Whether the stream-end token has been handed out.
    #[inline]
    pub fn stream_ended(&self) -> bool {
        self.stream_end_produced
    }

    /// Current position of the scanner in the input.
    #[inline]
    pub fn mark(&self) -> Marker {
        self.mark
    }

    /// Value of `may_contain_comments()` reported by the input at construction.
    #[inline]
    pub(crate) fn comments_possible(&self) -> bool {
        self.comments_possible
    }
1392
    /// Consumes the line break at the current position and appends a single `'\n'` to `s`,
    /// whichever break sequence was consumed.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
        self.skip_break();
        s.push('\n');
    }
1404
1405 #[inline]
1410 fn skip_break(&mut self) {
1411 let c = self.input.peek();
1412 let nc = self.input.peek_nth(1);
1413 debug_assert!(is_break(c));
1414 if c == '\r' && nc == '\n' {
1415 self.skip_blank();
1416 }
1417 self.skip_nl();
1418 }
1419
1420 fn insert_token(&mut self, pos: usize, tok: Token<'input>) {
1422 let old_len = self.tokens.len();
1423 assert!(pos <= old_len);
1424 self.tokens.insert(pos, tok.into());
1425 }
1426
1427 fn simple_key_token_index(&self, sk: &SimpleKey, mark: Marker) -> Result<usize, ScanError> {
1428 let Some(index) = sk.token_number.checked_sub(self.tokens_parsed) else {
1429 return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1430 };
1431 if index > self.tokens.len() {
1432 return Err(ScanError::new_str(mark, "simple key is no longer valid"));
1433 }
1434 Ok(index)
1435 }
1436
    /// Marks the current position as one where a simple key may start.
    #[inline]
    fn allow_simple_key(&mut self) {
        self.simple_key_allowed = true;
    }

    /// Marks the current position as one where a simple key may not start.
    #[inline]
    fn disallow_simple_key(&mut self) {
        self.simple_key_allowed = false;
    }
1446
    /// Scans forward and queues the next token(s).
    ///
    /// The first call only produces the stream-start token. Later calls skip whitespace and
    /// comments (returning early when a comment token was queued), expire stale simple keys,
    /// unroll the indentation to the current column, and then dispatch on the next
    /// character(s) to the matching `fetch_*` method.
    ///
    /// # Errors
    /// Returns the error of the helper that failed, or an error for invalid content after a
    /// document end marker, invalid indentation, a BOM inside a document, or one of the
    /// characters `%`, `@` and `` ` `` where a token must start.
    pub fn fetch_next_token(&mut self) -> ScanResult {
        self.input.lookahead(1);

        if !self.stream_start_produced {
            self.fetch_stream_start();
            return Ok(());
        }
        // `true` means a comment token was queued; hand it out before scanning further.
        if self.skip_to_next_token(true)? {
            return Ok(());
        }

        debug_print!(
            " \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
            self.mark,
            self.input.peek()
        );

        self.stale_simple_keys()?;

        let mark = self.mark;
        self.unroll_indent(mark.col as isize);

        self.input.lookahead(4);

        if self.input.next_is_z() {
            self.fetch_stream_end()?;
            return Ok(());
        }

        // Directives and document markers are only looked for at column 0.
        if self.mark.col == 0 {
            if self.input.next_char_is('%') {
                return self.fetch_directive();
            } else if self.input.next_is_document_start() {
                return self.fetch_document_indicator(TokenType::DocumentStart);
            } else if self.input.next_is_document_end() {
                self.fetch_document_indicator(TokenType::DocumentEnd)?;
                // Only whitespace may follow a document end marker on its line.
                self.skip_ws_to_eol(SkipTabs::Yes)?;
                if !self.input.next_is_breakz() {
                    return Err(ScanError::new_str(
                        self.mark,
                        "invalid content after document end marker",
                    ));
                }
                return Ok(());
            }
        }

        // Anything fetched from here on is content, so a BOM is no longer skipped.
        if self.document_prefix_allowed {
            self.document_prefix_allowed = false;
        }

        // Content left of the current indentation is only accepted in flow context, and
        // only for the characters `]`, `}` and `,`.
        if (self.mark.col as isize) < self.indent {
            self.input.lookahead(1);
            let c = self.input.peek();
            if self.flow_level == 0 || !matches!(c, ']' | '}' | ',') {
                return Err(ScanError::new_str(self.mark, "invalid indentation"));
            }
        }

        let c = self.input.peek();
        let nc = self.input.peek_nth(1);
        // Arm order matters: guarded arms fall through to later arms when the guard fails.
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
            ',' => self.fetch_flow_entry(),
            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
            ':' if is_blank_or_breakz(nc) => self.fetch_value(),
            ':' if self.flow_level > 0
                && (is_flow(nc) || self.mark.index() == self.adjacent_value_allowed_at) =>
            {
                self.fetch_flow_value()
            }
            // `true` selects an alias, `false` an anchor.
            '*' => self.fetch_anchor(true),
            '&' => self.fetch_anchor(false),
            '!' => self.fetch_tag(),
            // Block scalars are only recognised outside flow context.
            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
            // `true` selects the single-quoted form, `false` the double-quoted one.
            '\'' => self.fetch_flow_scalar(true),
            '"' => self.fetch_flow_scalar(false),
            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
                self.fetch_plain_scalar()
            }
            c if is_bom(c) => Err(ScanError::new_str(
                self.mark,
                "a BOM must not appear inside a document",
            )),
            '%' | '@' | '`' => Err(ScanError::new(
                self.mark,
                format!("unexpected character: `{c}'"),
            )),
            _ => self.fetch_plain_scalar(),
        }
    }
1553
    /// Returns the next token in its queued form, scanning more input when needed.
    ///
    /// Returns `Ok(None)` once the stream-end token has been handed out.
    ///
    /// An error raised while a comment token sits at the front of the queue is deferred:
    /// the comment is handed out first and the error is returned by a later call, once the
    /// front of the queue is no longer a comment.
    ///
    /// # Errors
    /// Returns a deferred or freshly raised scanning error, or an error when no token is
    /// available although the stream has not ended.
    pub(crate) fn next_queued_token(&mut self) -> Result<Option<QueuedToken<'input>>, ScanError> {
        if self.deferred_error.is_some() {
            // Report the deferred error as soon as no comment is waiting in front of it.
            if !matches!(
                self.tokens.front().map(|token| &token.1),
                Some(QueuedTokenType::Comment(_))
            ) {
                if let Some(error) = self.deferred_error.take() {
                    return error.into_result();
                }
            }
            // A comment is still queued: hand it out without scanning any further.
            self.token_available = true;
        }

        if self.stream_end_produced {
            return Ok(None);
        }

        if !self.token_available {
            if let Err(error) = self.fetch_more_tokens() {
                if matches!(
                    self.tokens.front().map(|token| &token.1),
                    Some(QueuedTokenType::Comment(_))
                ) {
                    // Keep the error for later so the queued comment is not lost.
                    self.deferred_error = Some(error);
                } else {
                    return Err(error);
                }
            }
        }
        let Some(t) = self.tokens.pop_front() else {
            return Err(ScanError::new_str(
                self.mark,
                "did not find expected next token",
            ));
        };
        self.token_available = false;
        self.tokens_parsed += 1;

        let is_stream_end = matches!(t.1, QueuedTokenType::StreamEnd);
        if is_stream_end {
            self.stream_end_produced = true;
        }
        Ok(Some(t))
    }
1602
1603 pub fn next_token(&mut self) -> Result<Option<Token<'input>>, ScanError> {
1608 Ok(self.next_queued_token()?.map(QueuedToken::into_public))
1609 }
1610
1611 pub fn fetch_more_tokens(&mut self) -> ScanResult {
1616 let mut need_more;
1617 loop {
1618 if self.tokens.is_empty() {
1619 need_more = true;
1620 } else {
1621 need_more = false;
1622 self.stale_simple_keys()?;
1624 if !matches!(
1625 self.tokens.front().map(|token| &token.1),
1626 Some(QueuedTokenType::Comment(_))
1627 ) {
1628 for sk in &self.simple_keys {
1630 if sk.possible && sk.token_number == self.tokens_parsed {
1631 need_more = true;
1632 break;
1633 }
1634 }
1635 }
1636 }
1637
1638 if let Some(token) = self.tokens.back() {
1641 if matches!(
1642 token.1,
1643 QueuedTokenType::DocumentEnd | QueuedTokenType::DocumentStart
1644 ) {
1645 break;
1646 }
1647 }
1648
1649 if !need_more {
1650 break;
1651 }
1652 self.fetch_next_token()?;
1653 }
1654 self.token_available = true;
1655
1656 Ok(())
1657 }
1658
1659 fn stale_simple_keys(&mut self) -> ScanResult {
1668 for sk in &mut self.simple_keys {
1669 let is_line_stale = self.flow_level == 0 && sk.mark.line < self.mark.line;
1670 let is_length_stale =
1673 self.mark.index().saturating_sub(sk.mark.index()) > SIMPLE_KEY_MAX_LOOKAHEAD;
1674
1675 if sk.possible && (is_line_stale || is_length_stale) {
1676 if sk.required {
1677 return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
1678 }
1679 sk.possible = false;
1680 }
1681 }
1682 Ok(())
1683 }
1684
    /// Skips whitespace, line breaks, an allowed byte order mark and comments, up to the
    /// next character that can start a token.
    ///
    /// Every comment met on the way is pushed to the token queue. When `stop_after_comment`
    /// is `true` the function returns `Ok(true)` right after queueing a comment (and
    /// consuming the line break that follows it, if any); otherwise it returns `Ok(false)`
    /// once the next token start is reached.
    ///
    /// # Errors
    /// Fails on a tab while an explicit-key tab check is pending, on a tab used as block
    /// indentation before content, on a comment scanning error, or when a comment
    /// interrupted a plain scalar that continues on the line right after it.
    fn skip_to_next_token(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
        // Consumes a line break; outside flow context a new line allows a simple key again.
        let consume_linebreak = |this: &mut Self| {
            this.input.lookahead(2);
            this.skip_linebreak();
            if this.flow_level == 0 {
                this.allow_simple_key();
            }
        };

        loop {
            let ch = self.input.look_ch();
            if self.explicit_key_tab_check_pending {
                match ch {
                    '\t' => {
                        return Err(ScanError::new_str(
                            self.mark(),
                            "tabs disallowed in this context",
                        ));
                    }
                    // Spaces, breaks and comments keep the check pending.
                    ' ' | '\n' | '\r' | '#' => {}
                    _ => self.explicit_key_tab_check_pending = false,
                }
            }

            match ch {
                '\t' => {
                    // A tab left of the current indentation, with only whitespace before
                    // it on the line, is tolerated only if the rest of the line is blank.
                    if self.is_within_block()
                        && self.leading_whitespace
                        && (self.mark.col as isize) < self.indent
                    {
                        self.skip_ws_to_eol(SkipTabs::Yes)?;

                        if !self.input.next_is_breakz() {
                            return Err(ScanError::new_str(
                                self.mark,
                                "tabs disallowed within this context (block indentation)",
                            ));
                        }

                        if matches!(self.input.look_ch(), '\n' | '\r') {
                            consume_linebreak(self);
                        }
                    } else {
                        self.skip_blank();
                    }
                }

                ' ' => self.skip_blank(),

                '\n' | '\r' => consume_linebreak(self),

                // A BOM is skipped only at column 0, outside flow context, and while the
                // document prefix is still allowed.
                c if is_bom(c)
                    && self.document_prefix_allowed
                    && self.flow_level == 0
                    && self.mark.col == 0 =>
                {
                    self.skip_bom();
                }

                '#' => {
                    self.push_comment_token()?;

                    if matches!(self.input.look_ch(), '\n' | '\r') {
                        consume_linebreak(self);
                    }
                    if stop_after_comment {
                        return Ok(true);
                    }
                }

                _ => break,
            }
        }

        // `take` makes this check run at most once per recorded interruption.
        if let Some(err_mark) = self.interrupted_plain_by_comment.take() {
            let is_immediate_next_line = self.mark.line == err_mark.line + 1;

            if self.flow_level == 0
                && is_immediate_next_line
                && (self.mark.col as isize) > self.indent
            {
                self.input.lookahead(4);

                // Plain-scalar content right below the comment would have continued the
                // interrupted scalar, which is an error.
                if !self.input.next_is_z()
                    && !self.input.next_is_document_indicator()
                    && self.input.next_can_be_plain_scalar(false)
                {
                    return Err(ScanError::new_str(
                        err_mark,
                        "comment intercepting the multiline text",
                    ));
                }
            }
        }

        Ok(false)
    }
1807
1808 fn skip_yaml_whitespace(&mut self, stop_after_comment: bool) -> Result<bool, ScanError> {
1816 let mut need_whitespace = true;
1817 loop {
1818 match self.input.look_ch() {
1819 ' ' => {
1820 self.skip_blank();
1821
1822 need_whitespace = false;
1823 }
1824 '\n' | '\r' => {
1825 self.input.lookahead(2);
1826 self.skip_linebreak();
1827 if self.flow_level == 0 {
1828 self.allow_simple_key();
1829 }
1830 need_whitespace = false;
1831 }
1832 '#' => {
1833 if need_whitespace {
1834 self.skip_comment();
1835 } else {
1836 self.push_comment_token()?;
1837 if stop_after_comment {
1838 return Ok(true);
1839 }
1840 }
1841 }
1842 _ => break,
1843 }
1844 }
1845
1846 if need_whitespace {
1847 Err(ScanError::new_str(self.mark(), "expected whitespace"))
1848 } else {
1849 Ok(false)
1850 }
1851 }
1852
1853 fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
1854 debug_assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
1855
1856 if !self.comments_possible {
1857 let (chars_consumed, result) = self.input.skip_ws_to_eol(skip_tabs);
1858 self.mark.col += chars_consumed;
1859 self.mark.offsets.chars += chars_consumed;
1860 self.mark.offsets.bytes = self.input.byte_offset();
1861 return result.map_err(|msg| ScanError::new_str(self.mark, msg));
1862 }
1863
1864 let (chars_consumed, whitespace) = self.input.skip_ws_to_eol_blanks(skip_tabs);
1865 self.mark.col += chars_consumed;
1866 self.mark.offsets.chars += chars_consumed;
1867 self.mark.offsets.bytes = self.input.byte_offset();
1868
1869 if self.input.look_ch() != '#' {
1870 return Ok(whitespace);
1871 }
1872
1873 if !whitespace.found_tabs() && !whitespace.has_valid_yaml_ws() {
1874 return Err(ScanError::new_str(
1875 self.mark,
1876 "comments must be separated from other tokens by whitespace",
1877 ));
1878 }
1879
1880 self.push_comment_token()?;
1881 Ok(whitespace)
1882 }
1883
1884 fn fetch_stream_start(&mut self) {
1885 let mark = self.mark;
1886 self.indent = -1;
1887 self.stream_start_produced = true;
1888 self.allow_simple_key();
1889 self.tokens
1890 .push_back(Token(Span::empty(mark), TokenType::StreamStart(TEncoding::Utf8)).into());
1891 self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1892 }
1893
1894 fn fetch_stream_end(&mut self) -> ScanResult {
1895 if self.mark.col != 0 {
1897 self.mark.col = 0;
1898 self.mark.line += 1;
1899 }
1900
1901 if let Some((mark, bracket)) = self.flow_markers.pop() {
1902 return Err(Self::unclosed_bracket(mark, bracket));
1903 }
1904
1905 for sk in &mut self.simple_keys {
1908 if sk.required && sk.possible {
1909 return Err(self.simple_key_expected());
1910 }
1911 sk.possible = false;
1912 }
1913
1914 self.unroll_indent(-1);
1915 self.remove_simple_key()?;
1916 self.disallow_simple_key();
1917
1918 self.tokens
1919 .push_back(Token(Span::empty(self.mark), TokenType::StreamEnd).into());
1920 Ok(())
1921 }
1922
1923 fn fetch_directive(&mut self) -> ScanResult {
1924 self.unroll_indent(-1);
1925 self.remove_simple_key()?;
1926
1927 self.disallow_simple_key();
1928
1929 let token_index = self.tokens.len();
1930 let tok = self.scan_directive()?;
1931 self.insert_token(token_index, tok);
1932
1933 Ok(())
1934 }
1935
1936 fn scan_directive(&mut self) -> Result<Token<'input>, ScanError> {
1937 let start_mark = self.mark;
1938 self.skip_non_blank();
1939
1940 let name = self.scan_directive_name()?;
1941 let tok = match name.as_ref() {
1942 "YAML" => self.scan_version_directive_value(&start_mark)?,
1943 "TAG" => self.scan_tag_directive_value(&start_mark)?,
1944 _ => {
1945 let mut params = Vec::new();
1946 while self.input.next_is_blank() {
1947 let n_blanks = self.input.skip_while_blank();
1948 self.mark.offsets.chars += n_blanks;
1949 self.mark.col += n_blanks;
1950 self.mark.offsets.bytes = self.input.byte_offset();
1951
1952 if !is_blank_or_breakz(self.input.peek()) {
1953 let mut param = String::new();
1954 let n_chars = self.input.fetch_while_is_yaml_non_space(&mut param);
1955 self.mark.offsets.chars += n_chars;
1956 self.mark.col += n_chars;
1957 self.mark.offsets.bytes = self.input.byte_offset();
1958 params.push(param);
1959 }
1960 }
1961
1962 Token(
1963 Span::new(start_mark, self.mark),
1964 TokenType::ReservedDirective(name, params),
1965 )
1966 }
1967 };
1968
1969 self.skip_ws_to_eol(SkipTabs::Yes)?;
1970
1971 if self.input.next_is_breakz() {
1972 self.input.lookahead(2);
1973 self.skip_linebreak();
1974 Ok(tok)
1975 } else {
1976 Err(ScanError::new_str(
1977 start_mark,
1978 "while scanning a directive, did not find expected comment or line break",
1979 ))
1980 }
1981 }
1982
1983 fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
1984 let n_blanks = self.input.skip_while_blank();
1985 self.mark.offsets.chars += n_blanks;
1986 self.mark.col += n_blanks;
1987 self.mark.offsets.bytes = self.input.byte_offset();
1988
1989 let major = self.scan_version_directive_number(mark)?;
1990
1991 if self.input.peek() != '.' {
1992 return Err(ScanError::new_str(
1993 *mark,
1994 "while scanning a YAML directive, did not find expected digit or '.' character",
1995 ));
1996 }
1997 self.skip_non_blank();
1998
1999 let minor = self.scan_version_directive_number(mark)?;
2000
2001 Ok(Token(
2002 Span::new(*mark, self.mark),
2003 TokenType::VersionDirective(major, minor),
2004 ))
2005 }
2006
2007 fn scan_directive_name(&mut self) -> Result<String, ScanError> {
2008 let start_mark = self.mark;
2009 let mut string = String::new();
2010
2011 let n_chars = self.input.fetch_while_is_yaml_non_space(&mut string);
2012 self.mark.offsets.chars += n_chars;
2013 self.mark.col += n_chars;
2014 self.mark.offsets.bytes = self.input.byte_offset();
2015
2016 if string.is_empty() {
2017 return Err(ScanError::new_str(
2018 start_mark,
2019 "while scanning a directive, could not find expected directive name",
2020 ));
2021 }
2022
2023 if !is_blank_or_breakz(self.input.peek()) {
2024 return Err(ScanError::new_str(
2025 start_mark,
2026 "while scanning a directive, found unexpected non-alphabetical character",
2027 ));
2028 }
2029
2030 Ok(string)
2031 }
2032
2033 fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
2034 let mut val = 0u32;
2035 let mut length = 0usize;
2036 while let Some(digit) = self.input.look_ch().to_digit(10) {
2037 if length + 1 > 9 {
2038 return Err(ScanError::new_str(
2039 *mark,
2040 "while scanning a YAML directive, found extremely long version number",
2041 ));
2042 }
2043 length += 1;
2044 val = val * 10 + digit;
2045 self.skip_non_blank();
2046 }
2047
2048 if length == 0 {
2049 return Err(ScanError::new_str(
2050 *mark,
2051 "while scanning a YAML directive, did not find expected version number",
2052 ));
2053 }
2054
2055 Ok(val)
2056 }
2057
2058 fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token<'input>, ScanError> {
2059 let n_blanks = self.input.skip_while_blank();
2060 self.mark.offsets.chars += n_blanks;
2061 self.mark.col += n_blanks;
2062 self.mark.offsets.bytes = self.input.byte_offset();
2063
2064 let handle = self.scan_tag_handle_directive_cow(mark)?;
2065
2066 let n_blanks = self.input.skip_while_blank();
2067 self.mark.offsets.chars += n_blanks;
2068 self.mark.col += n_blanks;
2069 self.mark.offsets.bytes = self.input.byte_offset();
2070
2071 let prefix = self.scan_tag_prefix_directive_cow(mark)?;
2072
2073 self.input.lookahead(1);
2074
2075 if self.input.next_is_blank_or_breakz() {
2076 Ok(Token(
2077 Span::new(*mark, self.mark),
2078 TokenType::TagDirective(handle, prefix),
2079 ))
2080 } else {
2081 Err(ScanError::new_str(
2082 *mark,
2083 "while scanning TAG, did not find expected whitespace or line break",
2084 ))
2085 }
2086 }
2087
2088 fn fetch_tag(&mut self) -> ScanResult {
2089 self.save_simple_key();
2090 self.disallow_simple_key();
2091
2092 let tok = self.scan_tag()?;
2093 self.tokens.push_back(tok.into());
2094 Ok(())
2095 }
2096
2097 fn scan_tag(&mut self) -> Result<Token<'input>, ScanError> {
2098 let start_mark = self.mark;
2099
2100 self.input.lookahead(2);
2102
2103 if self.input.byte_offset().is_none() {
2105 return self.scan_tag_owned(&start_mark);
2106 }
2107
2108 let (handle, suffix): (Cow<'input, str>, Cow<'input, str>) =
2109 if self.input.nth_char_is(1, '<') {
2110 let suffix = self.scan_verbatim_tag(&start_mark)?;
2112 (Cow::Owned(String::new()), Cow::Owned(suffix))
2113 } else {
2114 let handle = self.scan_tag_handle_cow(&start_mark)?;
2116 if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2118 let suffix = self.scan_tag_shorthand_suffix_cow(&start_mark, true)?;
2120 (handle, suffix)
2121 } else {
2122 let remaining_suffix =
2127 self.scan_tag_shorthand_suffix_cow(&start_mark, false)?;
2128
2129 let suffix = if handle.len() > 1 {
2131 if remaining_suffix.is_empty() {
2132 match handle {
2134 Cow::Borrowed(s) => Cow::Borrowed(&s[1..]),
2135 Cow::Owned(s) => Cow::Owned(s[1..].to_owned()),
2136 }
2137 } else {
2138 let mut combined = handle[1..].to_owned();
2140 combined.push_str(&remaining_suffix);
2141 Cow::Owned(combined)
2142 }
2143 } else {
2144 remaining_suffix
2146 };
2147
2148 if suffix.is_empty() {
2151 (Cow::Borrowed(""), Cow::Borrowed("!"))
2152 } else {
2153 (Cow::Borrowed("!"), suffix)
2154 }
2155 }
2156 };
2157
2158 if is_blank_or_breakz(self.input.look_ch())
2159 || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2160 {
2161 Ok(Token(
2164 Span::new(start_mark, self.mark),
2165 TokenType::Tag(handle, suffix),
2166 ))
2167 } else {
2168 Err(ScanError::new_str(
2169 start_mark,
2170 "while scanning a tag, did not find expected whitespace or line break",
2171 ))
2172 }
2173 }
2174
2175 fn scan_tag_owned(&mut self, start_mark: &Marker) -> Result<Token<'input>, ScanError> {
2177 let mut handle = String::new();
2178 let mut suffix;
2179
2180 if self.input.nth_char_is(1, '<') {
2181 suffix = self.scan_verbatim_tag(start_mark)?;
2182 } else {
2183 handle = self.scan_tag_handle(false, start_mark)?;
2185 if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
2187 let is_secondary_handle = handle == "!!";
2189 suffix =
2190 self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", start_mark)?;
2191 } else {
2192 suffix = self.scan_tag_shorthand_suffix(false, false, &handle, start_mark)?;
2193 "!".clone_into(&mut handle);
2194 if suffix.is_empty() {
2197 handle.clear();
2198 "!".clone_into(&mut suffix);
2199 }
2200 }
2201 }
2202
2203 if is_blank_or_breakz(self.input.look_ch())
2204 || (self.flow_level > 0 && matches!(self.input.peek(), ',' | ']' | '}'))
2205 {
2206 Ok(Token(
2209 Span::new(*start_mark, self.mark),
2210 TokenType::Tag(handle.into(), suffix.into()),
2211 ))
2212 } else {
2213 Err(ScanError::new_str(
2214 *start_mark,
2215 "while scanning a tag, did not find expected whitespace or line break",
2216 ))
2217 }
2218 }
2219
2220 fn scan_tag_handle_cow(&mut self, mark: &Marker) -> Result<Cow<'input, str>, ScanError> {
2225 let Some(start) = self.input.byte_offset() else {
2226 return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2227 };
2228
2229 if self.input.look_ch() != '!' {
2230 return Err(ScanError::new_str(
2231 *mark,
2232 "while scanning a tag, did not find expected '!'",
2233 ));
2234 }
2235
2236 self.skip_non_blank();
2238
2239 self.input.lookahead(1);
2241 while self.input.next_is_alpha() {
2242 self.skip_non_blank();
2243 self.input.lookahead(1);
2244 }
2245
2246 if self.input.peek() == '!' {
2248 self.skip_non_blank();
2249 }
2250
2251 let Some(end) = self.input.byte_offset() else {
2252 return Ok(Cow::Owned(self.scan_tag_handle(false, mark)?));
2253 };
2254
2255 if let Some(slice) = self.try_borrow_slice(start, end) {
2256 Ok(Cow::Borrowed(slice))
2257 } else {
2258 let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2259 ScanError::new_str(
2260 *mark,
2261 "internal error: input advertised slicing but did not provide a slice",
2262 )
2263 })?;
2264 Ok(Cow::Owned(slice.to_owned()))
2265 }
2266 }
2267
2268 fn scan_tag_shorthand_suffix_cow(
2272 &mut self,
2273 mark: &Marker,
2274 require_non_empty: bool,
2275 ) -> Result<Cow<'input, str>, ScanError> {
2276 let Some(start) = self.input.byte_offset() else {
2277 return Ok(Cow::Owned(
2278 self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2279 ));
2280 };
2281
2282 while is_tag_char(self.input.look_ch()) {
2284 if self.input.peek() == '%' {
2285 let current = self
2287 .input
2288 .byte_offset()
2289 .expect("byte_offset() must remain available once enabled");
2290 let mut out = if let Some(slice) = self.input.slice_bytes(start, current) {
2291 slice.to_owned()
2292 } else {
2293 String::new()
2294 };
2295
2296 while is_tag_char(self.input.look_ch()) {
2298 if self.input.peek() == '%' {
2299 out.push(self.scan_uri_escapes(mark)?);
2300 } else {
2301 out.push(self.input.peek());
2302 self.skip_non_blank();
2303 }
2304 }
2305 return Ok(Cow::Owned(out));
2306 }
2307 self.skip_non_blank();
2308 }
2309
2310 let Some(end) = self.input.byte_offset() else {
2311 return Ok(Cow::Owned(
2312 self.scan_tag_shorthand_suffix(false, false, "", mark)?,
2313 ));
2314 };
2315
2316 if require_non_empty && start == end {
2317 return Err(ScanError::new_str(
2318 *mark,
2319 "while parsing a tag, did not find expected tag URI",
2320 ));
2321 }
2322
2323 if let Some(slice) = self.try_borrow_slice(start, end) {
2324 Ok(Cow::Borrowed(slice))
2325 } else {
2326 let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
2327 ScanError::new_str(
2328 *mark,
2329 "internal error: input advertised slicing but did not provide a slice",
2330 )
2331 })?;
2332 Ok(Cow::Owned(slice.to_owned()))
2333 }
2334 }
2335
2336 fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
2337 let mut string = String::new();
2338 if self.input.look_ch() != '!' {
2339 return Err(ScanError::new_str(
2340 *mark,
2341 "while scanning a tag, did not find expected '!'",
2342 ));
2343 }
2344
2345 string.push(self.input.peek());
2346 self.skip_non_blank();
2347
2348 let n_chars = self.input.fetch_while_is_alpha(&mut string);
2349 self.mark.offsets.chars += n_chars;
2350 self.mark.col += n_chars;
2351 self.mark.offsets.bytes = self.input.byte_offset();
2352
2353 if self.input.peek() == '!' {
2355 string.push(self.input.peek());
2356 self.skip_non_blank();
2357 } else if directive && string != "!" {
2358 return Err(ScanError::new_str(
2362 *mark,
2363 "while parsing a tag directive, did not find expected '!'",
2364 ));
2365 }
2366 Ok(string)
2367 }
2368
2369 fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2375 let mut string = String::new();
2376
2377 if self.input.look_ch() == '!' {
2378 string.push(self.input.peek());
2380 self.skip_non_blank();
2381 } else if !is_tag_char(self.input.peek()) {
2382 return Err(ScanError::new_str(
2384 *start_mark,
2385 "invalid global tag character",
2386 ));
2387 } else if self.input.peek() == '%' {
2388 string.push(self.scan_uri_escapes(start_mark)?);
2390 } else {
2391 string.push(self.input.peek());
2393 self.skip_non_blank();
2394 }
2395
2396 while is_uri_char(self.input.look_ch()) {
2397 if self.input.peek() == '%' {
2398 string.push(self.scan_uri_escapes(start_mark)?);
2399 } else {
2400 string.push(self.input.peek());
2401 self.skip_non_blank();
2402 }
2403 }
2404
2405 Ok(string)
2406 }
2407
2408 fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
2412 self.skip_non_blank();
2414 self.skip_non_blank();
2415
2416 let mut string = String::new();
2417 while is_uri_char(self.input.look_ch()) {
2418 if self.input.peek() == '%' {
2419 string.push(self.scan_uri_escapes(start_mark)?);
2420 } else {
2421 string.push(self.input.peek());
2422 self.skip_non_blank();
2423 }
2424 }
2425
2426 if string.is_empty() {
2427 return Err(ScanError::new_str(
2428 *start_mark,
2429 "while parsing a tag, did not find expected tag URI",
2430 ));
2431 }
2432
2433 if self.input.peek() != '>' {
2434 return Err(ScanError::new_str(
2435 *start_mark,
2436 "while scanning a verbatim tag, did not find the expected '>'",
2437 ));
2438 }
2439 self.skip_non_blank();
2440
2441 Ok(string)
2442 }
2443
2444 fn scan_tag_shorthand_suffix(
2445 &mut self,
2446 _directive: bool,
2447 _is_secondary: bool,
2448 head: &str,
2449 mark: &Marker,
2450 ) -> Result<String, ScanError> {
2451 let mut length = head.len();
2452 let mut string = String::new();
2453
2454 if length > 1 {
2457 string.extend(head.chars().skip(1));
2458 }
2459
2460 while is_tag_char(self.input.look_ch()) {
2461 if self.input.peek() == '%' {
2463 string.push(self.scan_uri_escapes(mark)?);
2464 } else {
2465 string.push(self.input.peek());
2466 self.skip_non_blank();
2467 }
2468
2469 length += 1;
2470 }
2471
2472 if length == 0 {
2473 return Err(ScanError::new_str(
2474 *mark,
2475 "while parsing a tag, did not find expected tag URI",
2476 ));
2477 }
2478
2479 Ok(string)
2480 }
2481
2482 fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
2483 let mut width = 0usize;
2484 let mut bytes = [0u8; 4];
2485 let mut bytes_len = 0usize;
2486 loop {
2487 self.input.lookahead(3);
2488
2489 let c = self.input.peek_nth(1);
2490 let nc = self.input.peek_nth(2);
2491
2492 if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
2493 return Err(ScanError::new_str(
2494 *mark,
2495 "while parsing a tag, found an invalid escape sequence",
2496 ));
2497 }
2498
2499 let byte = u8::try_from((as_hex(c) << 4) + as_hex(nc))
2500 .expect("two hex nibbles always fit in a byte");
2501 if width == 0 {
2502 width = match byte {
2503 _ if byte & 0x80 == 0x00 => 1,
2504 _ if byte & 0xE0 == 0xC0 => 2,
2505 _ if byte & 0xF0 == 0xE0 => 3,
2506 _ if byte & 0xF8 == 0xF0 => 4,
2507 _ => {
2508 return Err(ScanError::new_str(
2509 *mark,
2510 "while parsing a tag, found an incorrect leading UTF-8 byte",
2511 ));
2512 }
2513 };
2514 } else if byte & 0xc0 != 0x80 {
2515 return Err(ScanError::new_str(
2516 *mark,
2517 "while parsing a tag, found an incorrect trailing UTF-8 byte",
2518 ));
2519 }
2520
2521 bytes[bytes_len] = byte;
2522 bytes_len += 1;
2523
2524 self.skip_n_non_blank(3);
2525
2526 width -= 1;
2527 if width == 0 {
2528 break;
2529 }
2530 }
2531
2532 let s = core::str::from_utf8(&bytes[..bytes_len]).map_err(|_| {
2533 ScanError::new_str(
2534 *mark,
2535 "while parsing a tag, found an invalid UTF-8 codepoint",
2536 )
2537 })?;
2538
2539 let mut chars = s.chars();
2540 match (chars.next(), chars.next()) {
2541 (Some(ch), None) => Ok(ch),
2542 _ => Err(ScanError::new_str(
2543 *mark,
2544 "while parsing a tag, found an invalid UTF-8 codepoint",
2545 )),
2546 }
2547 }
2548
2549 fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2550 self.save_simple_key();
2551 self.disallow_simple_key();
2552
2553 let tok = self.scan_anchor(alias)?;
2554
2555 self.tokens.push_back(tok.into());
2556
2557 Ok(())
2558 }
2559
2560 fn scan_anchor(&mut self, alias: bool) -> Result<Token<'input>, ScanError> {
2561 let start_mark = self.mark;
2562
2563 self.skip_non_blank();
2565
2566 if let Some(start) = self.input.byte_offset() {
2568 while is_anchor_char(self.input.look_ch()) {
2569 self.skip_non_blank();
2570 }
2571
2572 let end = self
2573 .input
2574 .byte_offset()
2575 .expect("byte_offset() must remain available once enabled");
2576
2577 if start == end {
2578 return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2579 }
2580
2581 let cow = if let Some(slice) = self.try_borrow_slice(start, end) {
2582 Cow::Borrowed(slice)
2583 } else if let Some(slice) = self.input.slice_bytes(start, end) {
2584 Cow::Owned(slice.to_owned())
2585 } else {
2586 return Err(ScanError::new_str(
2587 start_mark,
2588 "internal error: input advertised slicing but did not provide a slice",
2589 ));
2590 };
2591
2592 let tok = if alias {
2593 TokenType::Alias(cow)
2594 } else {
2595 TokenType::Anchor(cow)
2596 };
2597 return Ok(Token(Span::new(start_mark, self.mark), tok));
2598 }
2599
2600 let mut string = String::new();
2601 while is_anchor_char(self.input.look_ch()) {
2602 string.push(self.input.peek());
2603 self.skip_non_blank();
2604 }
2605
2606 if string.is_empty() {
2607 return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2608 }
2609
2610 let tok = if alias {
2611 TokenType::Alias(string.into())
2612 } else {
2613 TokenType::Anchor(string.into())
2614 };
2615 Ok(Token(Span::new(start_mark, self.mark), tok))
2616 }
2617
2618 fn fetch_flow_collection_start(&mut self, tok: TokenType<'input>) -> ScanResult {
2619 self.save_simple_key();
2621
2622 let start_mark = self.mark;
2623 let indicator = self.input.peek();
2624 self.flow_markers.push((start_mark, indicator));
2625
2626 self.roll_one_col_indent();
2627 self.increase_flow_level()?;
2628
2629 self.allow_simple_key();
2630
2631 self.skip_non_blank();
2632
2633 if tok == TokenType::FlowMappingStart {
2634 self.flow_mapping_started.push(true);
2635 } else {
2636 self.flow_mapping_started.push(false);
2637 self.implicit_flow_mapping_states
2638 .push(ImplicitMappingState::Possible);
2639 }
2640
2641 let token_index = self.tokens.len();
2642 self.skip_ws_to_eol(SkipTabs::Yes)?;
2643
2644 self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2645 Ok(())
2646 }
2647
2648 fn fetch_flow_collection_end(&mut self, tok: TokenType<'input>) -> ScanResult {
2649 if self.flow_level == 0 {
2651 return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2652 }
2653
2654 let Some((open_mark, open_ch)) = self.flow_markers.pop() else {
2655 return Err(ScanError::new_str(self.mark, "misplaced bracket"));
2656 };
2657
2658 let (expected_open, actual_close) = match tok {
2659 TokenType::FlowSequenceEnd => ('[', ']'),
2660 TokenType::FlowMappingEnd => ('{', '}'),
2661 _ => unreachable!("flow collection end called with non-closing token"),
2662 };
2663
2664 if open_ch != expected_open {
2665 return Err(ScanError::new(
2666 open_mark,
2667 format!("mismatched bracket '{open_ch}' closed by '{actual_close}'"),
2668 ));
2669 }
2670
2671 let flow_level = self.flow_level;
2672
2673 self.remove_simple_key()?;
2674
2675 if matches!(tok, TokenType::FlowSequenceEnd) {
2676 self.end_implicit_mapping(self.mark, flow_level);
2677 self.implicit_flow_mapping_states.pop();
2679 }
2680 self.flow_mapping_started.pop();
2681
2682 self.decrease_flow_level();
2683
2684 self.disallow_simple_key();
2685
2686 let start_mark = self.mark;
2687 self.skip_non_blank();
2688 let token_index = self.tokens.len();
2689 self.skip_ws_to_eol(SkipTabs::Yes)?;
2690
2691 if self.flow_level > 0 {
2697 self.adjacent_value_allowed_at = self.mark.index();
2698 }
2699
2700 self.insert_token(token_index, Token(Span::new(start_mark, self.mark), tok));
2701 Ok(())
2702 }
2703
2704 fn fetch_flow_entry(&mut self) -> ScanResult {
2706 self.remove_simple_key()?;
2707 self.allow_simple_key();
2708
2709 self.end_implicit_mapping(self.mark, self.flow_level);
2710 if self.current_flow_collection_is_sequence() {
2711 self.set_current_flow_mapping_started(false);
2712 }
2713
2714 let start_mark = self.mark;
2715 self.skip_non_blank();
2716 let token_index = self.tokens.len();
2717 self.skip_ws_to_eol(SkipTabs::Yes)?;
2718
2719 self.insert_token(
2720 token_index,
2721 Token(Span::new(start_mark, self.mark), TokenType::FlowEntry),
2722 );
2723 Ok(())
2724 }
2725
2726 fn increase_flow_level(&mut self) -> ScanResult {
2727 self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2728 self.flow_level = self
2729 .flow_level
2730 .checked_add(1)
2731 .ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2732 Ok(())
2733 }
2734
2735 fn decrease_flow_level(&mut self) {
2736 if self.flow_level > 0 {
2737 self.flow_level -= 1;
2738 self.simple_keys.pop().unwrap();
2739 }
2740 }
2741
2742 fn fetch_block_entry(&mut self) -> ScanResult {
2748 if self.flow_level > 0 {
2749 return Err(ScanError::new_str(
2751 self.mark,
2752 r#""-" is only valid inside a block"#,
2753 ));
2754 }
2755 if !self.simple_key_allowed {
2757 return Err(ScanError::new_str(
2758 self.mark,
2759 "block sequence entries are not allowed in this context",
2760 ));
2761 }
2762
2763 if let Some(QueuedToken(span, QueuedTokenType::Anchor(..) | QueuedTokenType::Tag(..))) =
2765 self.tokens.back()
2766 {
2767 if self.mark.col == 0 && span.start.col == 0 && self.indent > -1 {
2768 return Err(ScanError::new_str(
2769 span.start,
2770 "invalid indentation for anchor",
2771 ));
2772 }
2773 }
2774
2775 let mark = self.mark;
2777 self.skip_non_blank();
2778
2779 self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2781 let token_index = self.tokens.len();
2782 let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
2783 self.input.lookahead(2);
2784 if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
2785 {
2786 return Err(ScanError::new_str(
2787 self.mark,
2788 "'-' must be followed by a valid YAML whitespace",
2789 ));
2790 }
2791
2792 self.skip_ws_to_eol(SkipTabs::No)?;
2793 self.input.lookahead(1);
2794 if self.input.next_is_break() || self.input.next_is_flow() {
2795 self.roll_one_col_indent();
2796 }
2797
2798 self.remove_simple_key()?;
2799 self.allow_simple_key();
2800
2801 self.insert_token(
2802 token_index,
2803 Token(Span::empty(self.mark), TokenType::BlockEntry),
2804 );
2805
2806 Ok(())
2807 }
2808
2809 fn fetch_document_indicator(&mut self, t: TokenType<'input>) -> ScanResult {
2810 if let Some((mark, bracket)) = self.flow_markers.pop() {
2811 return Err(ScanError::new(
2812 mark,
2813 format!("unclosed bracket '{bracket}'"),
2814 ));
2815 }
2816
2817 self.unroll_indent(-1);
2818 self.remove_simple_key()?;
2819 self.disallow_simple_key();
2820
2821 let mark = self.mark;
2822
2823 self.skip_n_non_blank(3);
2824
2825 self.document_prefix_allowed = matches!(t, TokenType::DocumentEnd);
2826 self.tokens
2827 .push_back(Token(Span::new(mark, self.mark), t).into());
2828 Ok(())
2829 }
2830
2831 fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
2832 self.save_simple_key();
2833 self.allow_simple_key();
2834 let tok = self.scan_block_scalar(literal)?;
2835
2836 self.tokens.push_back(tok.into());
2837 Ok(())
2838 }
2839
/// Scans a block scalar and returns it as a `Scalar` token.
///
/// `literal` selects `ScalarStyle::Literal`; otherwise the scalar is
/// `ScalarStyle::Folded` and single line breaks between two non-blank-led
/// lines are folded into a space.
///
/// The header may carry a chomping indicator (`+` keeps trailing breaks,
/// `-` strips them, the default clips) and an indentation indicator (a
/// digit 1-9), in either order. Without an indentation indicator the
/// content indent is determined by `skip_block_scalar_first_line_indent`.
///
/// Note on the returned span: for a scalar that reaches end of input before
/// any content, the span starts at the header; otherwise `start_mark` is
/// re-bound to the position of the first content line before the content
/// loop, so the span starts there.
///
/// # Errors
/// * `… found an indentation indicator equal to 0`.
/// * `… did not find expected comment or line break` if the header is
///   followed by anything but whitespace/comment and a break.
/// * `a block scalar content cannot start with a tab`.
/// * `wrongly indented line in block scalar`.
/// * Errors from `skip_ws_to_eol`.
#[allow(clippy::too_many_lines)]
fn scan_block_scalar(&mut self, literal: bool) -> Result<Token<'input>, ScanError> {
    let start_mark = self.mark;
    let mut chomping = Chomping::Clip;
    // Value of the explicit indentation indicator; 0 means "none given".
    let mut increment: usize = 0;
    // Column at which content lines start; 0 means "not yet known".
    let mut indent: usize = 0;
    let mut trailing_blank: bool;
    let mut leading_blank: bool = false;
    let style = if literal {
        ScalarStyle::Literal
    } else {
        ScalarStyle::Folded
    };

    let mut string = String::new();
    // Break that ended the previous content line.
    let mut leading_break = String::new();
    // Breaks of the empty lines that followed it.
    let mut trailing_breaks = String::new();
    // Break that ended the header line.
    let mut chomping_break = String::new();

    // Step over the block scalar indicator character.
    self.skip_non_blank();
    self.unroll_non_block_indents();

    // Header form 1: chomping indicator, then an optional digit.
    if self.input.look_ch() == '+' || self.input.peek() == '-' {
        if self.input.peek() == '+' {
            chomping = Chomping::Keep;
        } else {
            chomping = Chomping::Strip;
        }
        self.skip_non_blank();
        self.input.lookahead(1);
        if self.input.next_is_digit() {
            if self.input.peek() == '0' {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }
            increment = (self.input.peek() as usize) - ('0' as usize);
            self.skip_non_blank();
        }
    // Header form 2: digit, then an optional chomping indicator.
    } else if self.input.next_is_digit() {
        if self.input.peek() == '0' {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, found an indentation indicator equal to 0",
            ));
        }

        increment = (self.input.peek() as usize) - ('0' as usize);
        self.skip_non_blank();
        self.input.lookahead(1);
        if self.input.peek() == '+' || self.input.peek() == '-' {
            if self.input.peek() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
        }
    }

    // Rest of the header line: whitespace and possibly a comment.
    self.skip_ws_to_eol(SkipTabs::Yes)?;

    self.input.lookahead(1);
    if !self.input.next_is_breakz() {
        return Err(ScanError::new_str(
            start_mark,
            "while scanning a block scalar, did not find expected comment or line break",
        ));
    }

    // Keep the header's line break; it is only used if the scalar turns out
    // to have no content (see the end-of-input case below).
    if self.input.next_is_break() {
        self.input.lookahead(2);
        self.read_break(&mut chomping_break);
    }

    if self.input.look_ch() == '\t' {
        return Err(ScanError::new_str(
            start_mark,
            "a block scalar content cannot start with a tab",
        ));
    }

    // An explicit indicator is relative to the current block indent, or
    // absolute when there is no enclosing block (`self.indent < 0`).
    if increment > 0 {
        indent = if self.indent >= 0 {
            (self.indent + increment as isize) as usize
        } else {
            increment
        }
    }

    // Skip indentation and leading empty lines; without an explicit
    // indicator the first-line helper also fills in `indent`.
    if indent == 0 {
        self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
    } else {
        self.skip_block_scalar_indent(indent, &mut trailing_breaks);
    }

    // End of input before any content line: the result depends only on the
    // chomping mode and on whether a line break was crossed at all.
    if self.input.next_is_z() {
        let contents = match chomping {
            Chomping::Strip => String::new(),
            // Still on the header line: nothing to keep.
            _ if self.mark.line == start_mark.line() => String::new(),
            Chomping::Clip => chomping_break,
            Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
            Chomping::Keep => trailing_breaks,
        };
        return Ok(Token(
            Span::new(start_mark, self.mark),
            TokenType::Scalar(style, contents.into()),
        ));
    }

    // A line that is less indented than the content but more indented than
    // the enclosing block is an error. At column 0 with no enclosing block,
    // document markers and comments are let through.
    if self.mark.col < indent && (self.mark.col as isize) > self.indent {
        if self.indent < 0 && self.mark.col == 0 {
            self.input.lookahead(4);
            if self.input.next_is_document_start()
                || self.input.next_is_document_end()
                || self.input.peek() == '#'
            {
            } else {
                return Err(ScanError::new_str(
                    self.mark,
                    "wrongly indented line in block scalar",
                ));
            }
        } else {
            return Err(ScanError::new_str(
                self.mark,
                "wrongly indented line in block scalar",
            ));
        }
    }

    // Scratch buffer handed to `scan_block_scalar_content_line`.
    let mut line_buffer = String::with_capacity(100);
    // From here on the token span starts at the first content line.
    let start_mark = self.mark;
    while self.mark.col == indent && !self.input.next_is_z() {
        // With zero indent a document end marker terminates the scalar.
        if indent == 0 {
            self.input.lookahead(4);
            if self.input.next_is_document_end() {
                break;
            }
        }

        trailing_blank = self.input.next_is_blank();
        // Folding: in a folded scalar, a single break between two lines that
        // do not start with a blank becomes a space; if empty lines
        // intervened, their breaks are emitted instead.
        if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
            string.push_str(&trailing_breaks);
            if trailing_breaks.is_empty() {
                string.push(' ');
            }
        } else {
            string.push_str(&leading_break);
            string.push_str(&trailing_breaks);
        }

        leading_break.clear();
        trailing_breaks.clear();

        leading_blank = self.input.next_is_blank();

        self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

        self.input.lookahead(2);
        if self.input.next_is_z() {
            break;
        }

        self.read_break(&mut leading_break);

        // Skip the next line's indentation, collecting breaks of empty lines.
        self.skip_block_scalar_indent(indent, &mut trailing_breaks);
    }

    // Chomping: unless stripping, keep the final line break; input that ends
    // without one still gets a '\n' if the mark is at or past the content
    // column (and past column 0).
    if chomping != Chomping::Strip {
        string.push_str(&leading_break);
        if self.input.next_is_z() && self.mark.col >= indent.max(1) {
            string.push('\n');
        }
    }

    // Keep mode also retains the breaks of trailing empty lines.
    if chomping == Chomping::Keep {
        string.push_str(&trailing_breaks);
    }

    Ok(Token(
        Span::new(start_mark, self.mark),
        TokenType::Scalar(style, string.into()),
    ))
}
3050
3051 fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
3061 while !self.input.buf_is_empty() && !self.input.next_is_breakz() {
3063 string.push(self.input.peek());
3064 self.skip_blank();
3070 }
3071
3072 if self.input.buf_is_empty() {
3075 let mut n_chars = 0;
3083 debug_assert!(line_buffer.is_empty());
3084 while let Some(c) = self.input.raw_read_non_breakz_ch() {
3085 line_buffer.push(c);
3086 n_chars += 1;
3087 }
3088
3089 self.mark.col += n_chars;
3091 self.mark.offsets.chars += n_chars;
3092 self.mark.offsets.bytes = self.input.byte_offset();
3093
3094 string.reserve(line_buffer.len());
3096 string.push_str(line_buffer);
3097 line_buffer.clear();
3099 }
3100 }
3101
3102 fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
3104 loop {
3105 if indent < self.input.bufmaxlen() - 2 {
3107 self.input.lookahead(self.input.bufmaxlen());
3108 while self.mark.col < indent && self.input.peek() == ' ' {
3109 self.skip_blank();
3110 }
3111 } else {
3112 loop {
3113 self.input.lookahead(self.input.bufmaxlen());
3114 while !self.input.buf_is_empty()
3115 && self.mark.col < indent
3116 && self.input.peek() == ' '
3117 {
3118 self.skip_blank();
3119 }
3120 if self.mark.col == indent
3124 || (!self.input.buf_is_empty() && self.input.peek() != ' ')
3125 {
3126 break;
3127 }
3128 }
3129 self.input.lookahead(2);
3130 }
3131
3132 if self.input.next_is_break() {
3134 self.read_break(breaks);
3135 } else {
3136 break;
3138 }
3139 }
3140 }
3141
3142 fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
3147 let mut max_indent = 0;
3148 loop {
3149 while self.input.look_ch() == ' ' {
3151 self.skip_blank();
3152 }
3153
3154 if self.mark.col > max_indent {
3155 max_indent = self.mark.col;
3156 }
3157
3158 if self.input.next_is_break() {
3159 self.input.lookahead(2);
3161 self.read_break(breaks);
3162 } else {
3163 break;
3165 }
3166 }
3167
3168 *indent = max_indent.max((self.indent + 1) as usize);
3177 if self.indent > 0 {
3178 *indent = (*indent).max(1);
3179 }
3180 }
3181
3182 fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
3183 self.save_simple_key();
3184 self.disallow_simple_key();
3185
3186 let token_index = self.tokens.len();
3187 let tok = self.scan_flow_scalar(single)?;
3188
3189 if self.skip_to_next_token(true)? {
3192 self.adjacent_value_allowed_at = usize::MAX;
3193 } else {
3194 self.adjacent_value_allowed_at = self.mark.index();
3195 }
3196
3197 self.insert_token(token_index, tok);
3198 Ok(())
3199 }
3200
/// Scan a single-quoted (`single == true`) or double-quoted scalar into a token.
///
/// The scalar text is tracked in a `FlowScalarBuf`: while the input reports byte offsets
/// and no rewriting is needed, only a byte range is tracked so the final contents can be
/// sliced from the input; as soon as line folding is required the buffer is promoted to an
/// owned `String`.
///
/// # Errors
///
/// Returns a `ScanError` when a document indicator appears at column 0 inside the scalar,
/// when the input ends before the closing quote, when a tab is used as indentation, when a
/// continuation line is not indented past `self.indent` (block context only), or when
/// unexpected content follows the closing quote. Errors from the helpers it calls are
/// propagated.
#[allow(clippy::too_many_lines)]
fn scan_flow_scalar(&mut self, single: bool) -> Result<Token<'input>, ScanError> {
    let start_mark = self.mark;

    // With byte offsets available, start a borrowed range just past the current character
    // (presumably the opening quote); otherwise accumulate into an owned string.
    let mut buf = match self.input.byte_offset() {
        Some(off) => FlowScalarBuf::new_borrowed(off + self.input.peek().len_utf8()),
        None => FlowScalarBuf::new_owned(),
    };

    // Scratch target for the first line break of a fold; its contents are never used.
    let mut break_scratch = String::new();

    // Step over the character the scalar starts on.
    self.skip_non_blank();

    loop {
        self.input.lookahead(4);

        if self.mark.col == 0 && self.input.next_is_document_indicator() {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a quoted scalar, found unexpected document indicator",
            ));
        }

        if self.input.next_is_z() {
            return Err(ScanError::new_str(start_mark, "unclosed quote"));
        }

        // Consume a run of non-whitespace characters. The helper may set `leading_blanks`
        // when it consumed an escaped line break.
        let mut leading_blanks = false;
        self.consume_flow_scalar_non_whitespace_chars(
            single,
            &mut buf,
            &mut leading_blanks,
            &start_mark,
        )?;

        // Stop on the closing quote matching the scalar style.
        match self.input.look_ch() {
            '\'' if single => break,
            '"' if !single => break,
            _ => {}
        }

        // Owned buffer: length of the string before the current run of blanks, so the run
        // can be truncated if a line break follows.
        let mut trailing_ws_start: Option<usize> = None;
        let mut has_leading_break = false;
        let mut has_trailing_breaks = false;

        // Borrowed buffer: byte offset where the current run of blanks started.
        let mut pending_ws_start: Option<usize> = None;

        while self.input.next_is_blank() || self.input.next_is_break() {
            if self.input.next_is_blank() {
                if leading_blanks {
                    // Blanks at the start of a continuation line are dropped; a tab left
                    // of the current indentation level is rejected.
                    if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
                        return Err(ScanError::new_str(
                            self.mark,
                            "tab cannot be used as indentation",
                        ));
                    }
                    self.skip_blank();
                } else {
                    // Blanks after content: keep them tentatively until we know whether a
                    // line break follows.
                    match buf {
                        FlowScalarBuf::Owned(ref mut string) => {
                            if trailing_ws_start.is_none() {
                                trailing_ws_start = Some(string.len());
                            }
                            string.push(self.input.peek());
                        }
                        FlowScalarBuf::Borrowed { .. } => {
                            if pending_ws_start.is_none() {
                                pending_ws_start = self.input.byte_offset();
                            }
                        }
                    }
                    self.skip_blank();

                    // Record the blank run's byte range on the borrowed buffer.
                    if let (FlowScalarBuf::Borrowed { .. }, Some(ws_start), Some(ws_end)) =
                        (&mut buf, pending_ws_start, self.input.byte_offset())
                    {
                        buf.note_pending_ws(ws_start, ws_end);
                    }
                }
            } else {
                self.input.lookahead(2);

                if leading_blanks {
                    // Second or later break in a row: it is kept in the scalar, which
                    // requires an owned buffer.
                    match buf {
                        FlowScalarBuf::Owned(ref mut string) => self.read_break(string),
                        FlowScalarBuf::Borrowed { .. } => {
                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
                            let Some(string) = buf.as_owned_mut() else {
                                unreachable!()
                            };
                            self.read_break(string);
                        }
                    }
                    has_trailing_breaks = true;
                } else {
                    // First break after content: blanks collected just before it are
                    // discarded.
                    if let Some(pos) = trailing_ws_start.take() {
                        if let FlowScalarBuf::Owned(ref mut string) = buf {
                            string.truncate(pos);
                        }
                    }

                    if pending_ws_start.take().is_some() {
                        if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
                            self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
                        }
                        buf.discard_pending_ws();
                    } else {
                        buf.commit_pending_ws();
                    }

                    // The break itself is read into the scratch string and ignored.
                    break_scratch.clear();
                    self.read_break(&mut break_scratch);
                    has_leading_break = true;
                    leading_blanks = true;
                }
            }

            self.input.lookahead(1);
        }

        // In block context, a continuation line must be indented past the current
        // indentation level unless it holds only the closing quote.
        if leading_blanks && has_leading_break && self.flow_level == 0 {
            let next_ch = self.input.peek();
            let is_closing_quote = (single && next_ch == '\'') || (!single && next_ch == '"');
            if !is_closing_quote && (self.mark.col as isize) <= self.indent {
                return Err(ScanError::new_str(
                    self.mark,
                    "invalid indentation in multiline quoted scalar",
                ));
            }
        }

        if leading_blanks {
            // A single line break with no further breaks folds into one space.
            if has_leading_break && !has_trailing_breaks {
                match buf {
                    FlowScalarBuf::Owned(ref mut string) => string.push(' '),
                    FlowScalarBuf::Borrowed { .. } => {
                        self.promote_flow_scalar_buf_to_owned(&start_mark, &mut buf)?;
                        let Some(string) = buf.as_owned_mut() else {
                            unreachable!()
                        };
                        string.push(' ');
                    }
                }
            }
        }
    } self.skip_non_blank(); // end of the scan loop; this call steps over the closing quote
    let end_mark = self.mark;

    // Only specific characters may follow the closing quote on the same line.
    self.skip_ws_to_eol(SkipTabs::Yes)?;
    match self.input.peek() {
        // Flow indicators, inside a flow collection.
        ',' | '}' | ']' if self.flow_level > 0 => {}
        // Line break or end of input.
        c if is_breakz(c) => {}
        // ':' in block context, only when the scalar started on this same line.
        ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
        // ':' inside a flow collection.
        ':' if self.flow_level > 0 => {}
        _ => {
            // NOTE(review): this message is also produced for single-quoted scalars.
            return Err(ScanError::new_str(
                self.mark,
                "invalid trailing content after double-quoted scalar",
            ));
        }
    }

    let style = if single {
        ScalarStyle::SingleQuoted
    } else {
        ScalarStyle::DoubleQuoted
    };

    let contents = match buf {
        FlowScalarBuf::Owned(string) => Cow::Owned(string),
        FlowScalarBuf::Borrowed {
            start,
            mut end,
            pending_ws_start,
            pending_ws_end,
        } => {
            // Blanks still pending at the closing quote are part of the scalar.
            if pending_ws_start.is_some() {
                end = pending_ws_end;
            }
            if let Some(slice) = self.try_borrow_slice(start, end) {
                Cow::Borrowed(slice)
            } else {
                // No borrow available: copy the slice the input provides instead.
                let slice = self.input.slice_bytes(start, end).ok_or_else(|| {
                    ScanError::new_str(
                        start_mark,
                        "internal error: input advertised offsets but did not provide a slice",
                    )
                })?;
                Cow::Owned(slice.to_owned())
            }
        }
    };

    Ok(Token(
        Span::new(start_mark, end_mark),
        TokenType::Scalar(style, contents),
    ))
}
3449
3450 fn consume_flow_scalar_non_whitespace_chars(
3459 &mut self,
3460 single: bool,
3461 buf: &mut FlowScalarBuf,
3462 leading_blanks: &mut bool,
3463 start_mark: &Marker,
3464 ) -> Result<(), ScanError> {
3465 self.input.lookahead(2);
3466 while !is_blank_or_breakz(self.input.peek()) {
3467 match self.input.peek() {
3468 '\'' if self.input.peek_nth(1) == '\'' && single => {
3470 if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3471 buf.commit_pending_ws();
3472 self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3473 }
3474 let Some(string) = buf.as_owned_mut() else {
3475 unreachable!()
3476 };
3477 string.push('\'');
3478 self.skip_n_non_blank(2);
3479 }
3480 '\'' if single => break,
3482 '"' if !single => break,
3483 '\\' if !single && is_break(self.input.peek_nth(1)) => {
3485 self.input.lookahead(3);
3486 if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3487 buf.commit_pending_ws();
3488 self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3489 }
3490 self.skip_non_blank();
3491 self.skip_linebreak();
3492 *leading_blanks = true;
3493 break;
3494 }
3495 '\\' if !single => {
3497 if matches!(buf, FlowScalarBuf::Borrowed { .. }) {
3498 buf.commit_pending_ws();
3499 self.promote_flow_scalar_buf_to_owned(start_mark, buf)?;
3500 }
3501 let Some(string) = buf.as_owned_mut() else {
3502 unreachable!()
3503 };
3504 string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
3505 }
3506 c => {
3507 match buf {
3508 FlowScalarBuf::Owned(ref mut string) => {
3509 string.push(c);
3510 }
3511 FlowScalarBuf::Borrowed { .. } => {
3512 buf.commit_pending_ws();
3513 }
3514 }
3515 self.skip_non_blank();
3516
3517 if let Some(new_end) = self.input.byte_offset() {
3518 if let FlowScalarBuf::Borrowed { end, .. } = buf {
3519 *end = new_end;
3520 }
3521 }
3522 }
3523 }
3524 self.input.lookahead(2);
3525 }
3526 Ok(())
3527 }
3528
3529 fn resolve_flow_scalar_escape_sequence(
3536 &mut self,
3537 start_mark: &Marker,
3538 ) -> Result<char, ScanError> {
3539 let mut code_length = 0usize;
3540 let mut ret = '\0';
3541
3542 match self.input.peek_nth(1) {
3543 '0' => ret = '\0',
3544 'a' => ret = '\x07',
3545 'b' => ret = '\x08',
3546 't' | '\t' => ret = '\t',
3547 'n' => ret = '\n',
3548 'v' => ret = '\x0b',
3549 'f' => ret = '\x0c',
3550 'r' => ret = '\x0d',
3551 'e' => ret = '\x1b',
3552 ' ' => ret = '\x20',
3553 '"' => ret = '"',
3554 '/' => ret = '/',
3555 '\\' => ret = '\\',
3556 'N' => ret = char::from_u32(0x85).unwrap(),
3558 '_' => ret = char::from_u32(0xA0).unwrap(),
3560 'L' => ret = char::from_u32(0x2028).unwrap(),
3562 'P' => ret = char::from_u32(0x2029).unwrap(),
3564 'x' => code_length = 2,
3565 'u' => code_length = 4,
3566 'U' => code_length = 8,
3567 _ => {
3568 return Err(ScanError::new_str(
3569 *start_mark,
3570 "while parsing a quoted scalar, found unknown escape character",
3571 ))
3572 }
3573 }
3574 self.skip_n_non_blank(2);
3575
3576 if code_length > 0 {
3578 self.input.lookahead(code_length);
3579 let mut value = 0u32;
3580 for i in 0..code_length {
3581 let c = self.input.peek_nth(i);
3582 if !is_hex(c) {
3583 return Err(ScanError::new_str(
3584 *start_mark,
3585 "while parsing a quoted scalar, did not find expected hexadecimal number",
3586 ));
3587 }
3588 value = (value << 4) + as_hex(c);
3589 }
3590
3591 self.skip_n_non_blank(code_length);
3592
3593 if code_length == 4 && (0xD800..=0xDBFF).contains(&value) {
3595 self.input.lookahead(2);
3596 if self.input.peek() == '\\' && self.input.peek_nth(1) == 'u' {
3597 self.skip_n_non_blank(2);
3598 self.input.lookahead(4);
3599 let mut low_value = 0u32;
3600 for i in 0..4 {
3601 let c = self.input.peek_nth(i);
3602 if !is_hex(c) {
3603 return Err(ScanError::new_str(
3604 *start_mark,
3605 "while parsing a quoted scalar, did not find expected hexadecimal number for low surrogate",
3606 ));
3607 }
3608 low_value = (low_value << 4) + as_hex(c);
3609 }
3610 if (0xDC00..=0xDFFF).contains(&low_value) {
3611 value = 0x10000 + (((value - 0xD800) << 10) | (low_value - 0xDC00));
3612 self.skip_n_non_blank(4);
3613 } else {
3614 return Err(ScanError::new_str(
3615 *start_mark,
3616 "while parsing a quoted scalar, found invalid low surrogate",
3617 ));
3618 }
3619 } else {
3620 return Err(ScanError::new_str(
3621 *start_mark,
3622 "while parsing a quoted scalar, found high surrogate without following low surrogate",
3623 ));
3624 }
3625 } else if code_length == 4 && (0xDC00..=0xDFFF).contains(&value) {
3626 return Err(ScanError::new_str(
3627 *start_mark,
3628 "while parsing a quoted scalar, found unpaired low surrogate",
3629 ));
3630 }
3631
3632 let Some(ch) = char::from_u32(value) else {
3633 return Err(ScanError::new_str(
3634 *start_mark,
3635 "while parsing a quoted scalar, found invalid Unicode character escape code",
3636 ));
3637 };
3638 ret = ch;
3639 }
3640 Ok(ret)
3641 }
3642
3643 fn fetch_plain_scalar(&mut self) -> ScanResult {
3644 self.save_simple_key();
3645 self.disallow_simple_key();
3646
3647 let token_index = self.tokens.len();
3648 let tok = self.scan_plain_scalar()?;
3649
3650 self.insert_token(token_index, tok);
3651 Ok(())
3652 }
3653
/// Scan a plain (unquoted) scalar, folding line breaks, and return its token.
///
/// The scalar ends at a `#`, at a document indicator in column 0, at a character that
/// cannot continue a plain scalar, or (in block context) at a line indented less than
/// `self.indent + 1`. When the accumulated text is byte-for-byte identical to the input
/// slice it was read from, the token borrows from the input instead of owning a copy.
///
/// # Errors
///
/// Returns a `ScanError` for bad indentation inside a flow construct, for `-` followed by a
/// flow indicator inside a flow collection, for a tab in the indentation of a non-empty
/// continuation line, and when no character at all could be consumed.
#[allow(clippy::too_many_lines)]
fn scan_plain_scalar(&mut self) -> Result<Token<'input>, ScanError> {
    self.unroll_non_block_indents();
    // Continuation lines must reach at least this column in block context.
    let indent = self.indent + 1;
    let start_mark = self.mark;

    if self.flow_level > 0 && (start_mark.col as isize) < indent {
        return Err(ScanError::new_str(
            start_mark,
            "invalid indentation in flow construct",
        ));
    }

    let mut string = String::with_capacity(32);
    // Reset the scratch buffers used for whitespace and line-break folding.
    self.buf_whitespaces.clear();
    self.buf_leading_break.clear();
    self.buf_trailing_breaks.clear();
    // End of the last content character; trailing whitespace is not part of the span.
    let mut end_mark = self.mark;

    loop {
        self.input.lookahead(4);
        if (self.mark.col == 0 && self.input.next_is_document_indicator())
            || self.input.peek() == '#'
        {
            // In block context, record where a `#` cut off a scalar that already has
            // content and buffered whitespace.
            if self.input.peek() == '#'
                && !string.is_empty()
                && !self.buf_whitespaces.is_empty()
                && self.flow_level == 0
            {
                self.interrupted_plain_by_comment = Some(self.mark);
            }
            break;
        }

        if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
            return Err(ScanError::new_str(
                self.mark,
                "plain scalar cannot start with '-' followed by ,[]{}",
            ));
        }

        if !self.input.next_is_blank_or_breakz()
            && self.input.next_can_be_plain_scalar(self.flow_level > 0)
        {
            // Content follows: first flush the whitespace / breaks seen since the last
            // content character.
            if self.leading_whitespace {
                if self.buf_leading_break.is_empty() {
                    // `buf_leading_break` is empty here, so the first push appends
                    // nothing; only recorded trailing breaks are added.
                    string.push_str(&self.buf_leading_break);
                    string.push_str(&self.buf_trailing_breaks);
                    self.buf_trailing_breaks.clear();
                    self.buf_leading_break.clear();
                } else {
                    // A lone break folds into a space; extra breaks are kept instead.
                    if self.buf_trailing_breaks.is_empty() {
                        string.push(' ');
                    } else {
                        string.push_str(&self.buf_trailing_breaks);
                        self.buf_trailing_breaks.clear();
                    }
                    self.buf_leading_break.clear();
                }
                self.leading_whitespace = false;
            } else if !self.buf_whitespaces.is_empty() {
                // Blanks between two words on the same line are kept verbatim.
                string.push_str(&self.buf_whitespaces);
                self.buf_whitespaces.clear();
            }

            // The first character is consumed here; the rest is consumed in chunks.
            string.push(self.input.peek());
            self.skip_non_blank();
            string.reserve(self.input.bufmaxlen());

            let mut end = false;
            while !end {
                self.input.lookahead(self.input.bufmaxlen());
                let (stop, chars_consumed) = self.input.fetch_plain_scalar_chunk(
                    &mut string,
                    self.input.bufmaxlen() - 1,
                    self.flow_level > 0,
                );
                end = stop;
                // The chunk fetch does not go through a `skip_*` helper, so the mark is
                // advanced by hand.
                self.mark.offsets.chars += chars_consumed;
                self.mark.col += chars_consumed;
                self.mark.offsets.bytes = self.input.byte_offset();
            }
            end_mark = self.mark;
        }

        // Anything other than a blank or a line break ends the scalar.
        if !(self.input.next_is_blank() || self.input.next_is_break()) {
            break;
        }

        self.input.lookahead(2);
        while self.input.next_is_blank_or_break() {
            if self.input.next_is_blank() {
                if !self.leading_whitespace {
                    // Blank after content on the same line: buffer it.
                    self.buf_whitespaces.push(self.input.peek());
                    self.skip_blank();
                } else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
                    // A tab within the indentation is accepted only if nothing but
                    // whitespace follows on that line.
                    self.skip_ws_to_eol(SkipTabs::Yes)?;
                    if !self.input.next_is_breakz() {
                        return Err(ScanError::new_str(
                            start_mark,
                            "while scanning a plain scalar, found a tab",
                        ));
                    }
                } else {
                    self.skip_blank();
                }
            } else {
                if self.leading_whitespace {
                    // Second or later break in a row.
                    self.skip_break();
                    self.buf_trailing_breaks.push('\n');
                } else {
                    // First break after content: blanks buffered before it are dropped.
                    self.buf_whitespaces.clear();
                    self.skip_break();
                    self.buf_leading_break.push('\n');
                    self.leading_whitespace = true;
                }
            }
            self.input.lookahead(2);
        }

        // In block context, a line indented less than `indent` ends the scalar.
        if self.flow_level == 0 && (self.mark.col as isize) < indent {
            break;
        }
    }

    if self.leading_whitespace {
        self.allow_simple_key();
    }

    if string.is_empty() {
        // Nothing was consumed at all; report it rather than return an empty scalar.
        Err(ScanError::new_str(
            start_mark,
            "unexpected end of plain scalar",
        ))
    } else {
        // Borrow from the input only when the raw slice equals the accumulated text.
        let contents = if let (Some(start), Some(end)) =
            (start_mark.byte_offset(), end_mark.byte_offset())
        {
            match self.try_borrow_slice(start, end) {
                Some(slice) if slice == string => Cow::Borrowed(slice),
                _ => Cow::Owned(string),
            }
        } else {
            Cow::Owned(string)
        };

        Ok(Token(
            Span::new(start_mark, end_mark),
            TokenType::Scalar(ScalarStyle::Plain, contents),
        ))
    }
}
3831
3832 fn fetch_key(&mut self) -> ScanResult {
3833 let start_mark = self.mark;
3834 if self.flow_level == 0 {
3835 if !self.simple_key_allowed {
3837 return Err(ScanError::new_str(
3838 self.mark,
3839 "mapping keys are not allowed in this context",
3840 ));
3841 }
3842 self.roll_indent(
3843 start_mark.col,
3844 None,
3845 TokenType::BlockMappingStart,
3846 start_mark,
3847 );
3848 } else {
3849 self.set_current_flow_mapping_started(true);
3851 }
3852
3853 self.remove_simple_key()?;
3854
3855 if self.flow_level == 0 {
3856 self.allow_simple_key();
3857 } else {
3858 self.disallow_simple_key();
3859 }
3860
3861 self.skip_non_blank();
3862 let token_index = self.tokens.len();
3863 self.explicit_key_tab_check_pending = false;
3864 let stopped_after_comment = self.skip_yaml_whitespace(true)?;
3865 if self.input.peek() == '\t' {
3866 return Err(ScanError::new_str(
3867 self.mark(),
3868 "tabs disallowed in this context",
3869 ));
3870 }
3871 self.explicit_key_tab_check_pending = stopped_after_comment;
3872 self.insert_token(
3873 token_index,
3874 Token(Span::new(start_mark, self.mark), TokenType::Key),
3875 );
3876 Ok(())
3877 }
3878
3879 fn fetch_flow_value(&mut self) -> ScanResult {
3887 let nc = self.input.peek_nth(1);
3888
3889 if self.mark.index() != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') {
3901 return Err(ScanError::new_str(
3902 self.mark,
3903 "':' may not precede any of `[{` in flow mapping",
3904 ));
3905 }
3906
3907 self.fetch_value()
3908 }
3909
3910 fn fetch_value(&mut self) -> ScanResult {
3912 let sk = self.simple_keys.last().unwrap().clone();
3913 let start_mark = self.mark;
3914 let is_implicit_flow_mapping = self.current_flow_collection_is_sequence()
3915 && !self.current_flow_mapping_started()
3916 && !self.implicit_flow_mapping_states.is_empty();
3917 if is_implicit_flow_mapping {
3918 *self.implicit_flow_mapping_states.last_mut().unwrap() =
3919 ImplicitMappingState::Inside(self.flow_level);
3920 }
3921
3922 self.skip_non_blank();
3924 let mut trailing_tokens = VecDeque::new();
3931 if self.input.look_ch() == '\t' {
3932 let trailing_token_index = self.tokens.len();
3933 let whitespace = self.skip_ws_to_eol(SkipTabs::Yes)?;
3934 trailing_tokens = self.tokens.split_off(trailing_token_index);
3935
3936 if !whitespace.has_valid_yaml_ws()
3937 && (self.input.peek() == '-' || self.input.next_is_alpha())
3938 {
3939 return Err(ScanError::new_str(
3940 self.mark,
3941 "':' must be followed by a valid YAML whitespace",
3942 ));
3943 }
3944 }
3945
3946 if sk.possible {
3947 let token_index = self.simple_key_token_index(&sk, start_mark)?;
3948 let tok = Token(Span::empty(sk.mark), TokenType::Key);
3950 self.insert_token(token_index, tok);
3951 if is_implicit_flow_mapping {
3952 if sk.mark.line < start_mark.line {
3953 return Err(ScanError::new_str(
3954 start_mark,
3955 "illegal placement of ':' indicator",
3956 ));
3957 }
3958 self.insert_token(
3959 token_index,
3960 Token(Span::empty(sk.mark), TokenType::FlowMappingStart),
3961 );
3962 }
3963
3964 self.roll_indent(
3966 sk.mark.col,
3967 Some(sk.token_number),
3968 TokenType::BlockMappingStart,
3969 sk.mark,
3970 );
3971 self.roll_one_col_indent();
3972
3973 self.simple_keys.last_mut().unwrap().possible = false;
3974 self.disallow_simple_key();
3975 } else {
3976 if is_implicit_flow_mapping {
3977 self.tokens
3978 .push_back(Token(Span::empty(start_mark), TokenType::FlowMappingStart).into());
3979 }
3980 if self.flow_level == 0 {
3982 if !self.simple_key_allowed {
3983 return Err(ScanError::new_str(
3984 start_mark,
3985 "mapping values are not allowed in this context",
3986 ));
3987 }
3988
3989 self.roll_indent(
3990 start_mark.col,
3991 None,
3992 TokenType::BlockMappingStart,
3993 start_mark,
3994 );
3995 }
3996 self.roll_one_col_indent();
3997
3998 if self.flow_level == 0 {
3999 self.allow_simple_key();
4000 } else {
4001 self.disallow_simple_key();
4002 }
4003 }
4004 self.tokens
4005 .push_back(Token(Span::empty(start_mark), TokenType::Value).into());
4006 self.tokens.append(&mut trailing_tokens);
4007
4008 Ok(())
4009 }
4010
4011 fn roll_indent(
4017 &mut self,
4018 col: usize,
4019 number: Option<usize>,
4020 tok: TokenType<'input>,
4021 mark: Marker,
4022 ) {
4023 if self.flow_level > 0 {
4024 return;
4025 }
4026
4027 if self.indent <= col as isize {
4031 if let Some(indent) = self.indents.last() {
4032 if !indent.needs_block_end {
4033 self.indent = indent.indent;
4034 self.indents.pop();
4035 }
4036 }
4037 }
4038
4039 if self.indent < col as isize {
4040 self.indents.push(Indent {
4041 indent: self.indent,
4042 needs_block_end: true,
4043 });
4044 self.indent = col as isize;
4045 let tokens_parsed = self.tokens_parsed;
4046 match number {
4047 Some(n) => self.insert_token(n - tokens_parsed, Token(Span::empty(mark), tok)),
4048 None => self.tokens.push_back(Token(Span::empty(mark), tok).into()),
4049 }
4050 }
4051 }
4052
4053 fn unroll_indent(&mut self, col: isize) {
4059 if self.flow_level > 0 {
4060 return;
4061 }
4062 while self.indent > col {
4063 let indent = self.indents.pop().unwrap();
4064 self.indent = indent.indent;
4065 if indent.needs_block_end {
4066 self.tokens
4067 .push_back(Token(Span::empty(self.mark), TokenType::BlockEnd).into());
4068 }
4069 }
4070 }
4071
4072 fn roll_one_col_indent(&mut self) {
4078 if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
4079 self.indents.push(Indent {
4080 indent: self.indent,
4081 needs_block_end: false,
4082 });
4083 self.indent += 1;
4084 }
4085 }
4086
4087 fn unroll_non_block_indents(&mut self) {
4089 while let Some(indent) = self.indents.last() {
4090 if indent.needs_block_end {
4091 break;
4092 }
4093 self.indent = indent.indent;
4094 self.indents.pop();
4095 }
4096 }
4097
4098 fn save_simple_key(&mut self) {
4100 if self.simple_key_allowed {
4101 let required = self.flow_level == 0
4102 && self.indent == (self.mark.col as isize)
4103 && self.indents.last().unwrap().needs_block_end;
4104
4105 if let Some(last) = self.simple_keys.last_mut() {
4106 *last = SimpleKey {
4107 mark: self.mark,
4108 possible: true,
4109 required,
4110 token_number: self.tokens_parsed + self.tokens.len(),
4111 };
4112 }
4113 }
4114 }
4115
4116 fn remove_simple_key(&mut self) -> ScanResult {
4117 let last = self.simple_keys.last_mut().unwrap();
4118 if last.possible && last.required {
4119 return Err(self.simple_key_expected());
4120 }
4121
4122 last.possible = false;
4123 Ok(())
4124 }
4125
4126 fn is_within_block(&self) -> bool {
4128 !self.indents.is_empty()
4129 }
4130
4131 fn end_implicit_mapping(&mut self, mark: Marker, flow_level: u8) {
4137 if self
4138 .implicit_flow_mapping_states
4139 .last()
4140 .is_some_and(|state| *state == ImplicitMappingState::Inside(flow_level))
4141 {
4142 *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Possible;
4143 self.set_current_flow_mapping_started(false);
4144 self.tokens
4145 .push_back(Token(Span::empty(mark), TokenType::FlowMappingEnd).into());
4146 }
4147 }
4148
4149 fn current_flow_collection_is_sequence(&self) -> bool {
4150 self.flow_markers
4151 .last()
4152 .is_some_and(|(_, bracket)| *bracket == '[')
4153 }
4154
4155 fn current_flow_mapping_started(&self) -> bool {
4156 self.flow_mapping_started.last().copied().unwrap_or(false)
4157 }
4158
4159 fn set_current_flow_mapping_started(&mut self, started: bool) {
4160 if let Some(current) = self.flow_mapping_started.last_mut() {
4161 *current = started;
4162 }
4163 }
4164}
4165
/// Chomping mode of a block scalar.
///
/// In the block-scalar scanner, `Strip` keeps no final line break, `Keep` additionally
/// keeps the trailing line breaks, and `Clip` is the remaining mode.
///
/// Derives `Debug`, `Clone` and `Copy` like the other public enums in this module, so
/// values can be printed in diagnostics and assertions and passed by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Chomping {
    /// No final line break is kept.
    Strip,
    /// Neither `Strip` nor `Keep`.
    Clip,
    /// The final line break and the trailing line breaks are kept.
    Keep,
}
4178
4179#[cfg(test)]
4180mod test {
4181 use alloc::{
4182 borrow::{Cow, ToOwned},
4183 rc::Rc,
4184 string::String,
4185 vec::Vec,
4186 };
4187 use core::cell::Cell;
4188
4189 use crate::{
4190 input::{str::StrInput, BorrowedInput, BufferedInput, Input},
4191 scanner::{
4192 Comment, Marker, Placement, QueuedToken, QueuedTokenType, ScalarStyle, Scanner, Span,
4193 TEncoding, Token, TokenType,
4194 },
4195 };
4196
/// Character iterator that counts, in a shared cell, how many characters it has yielded.
struct CountingChars {
    /// Source characters.
    chars: alloc::vec::IntoIter<char>,
    /// Shared counter, incremented once per yielded character.
    read: Rc<Cell<usize>>,
}

impl Iterator for CountingChars {
    type Item = char;

    /// Yield the next character, bumping the counter only when one is produced.
    fn next(&mut self) -> Option<Self::Item> {
        let ch = self.chars.next()?;
        self.read.set(self.read.get() + 1);
        Some(ch)
    }
}
4213
4214 struct SlicingOnlyInput<'input> {
4215 inner: StrInput<'input>,
4216 expose_slice: bool,
4217 }
4218
4219 impl<'input> SlicingOnlyInput<'input> {
4220 fn new(source: &'input str, expose_slice: bool) -> Self {
4221 Self {
4222 inner: StrInput::new(source),
4223 expose_slice,
4224 }
4225 }
4226 }
4227
4228 impl Input for SlicingOnlyInput<'_> {
4229 fn lookahead(&mut self, count: usize) {
4230 self.inner.lookahead(count);
4231 }
4232
4233 fn buflen(&self) -> usize {
4234 self.inner.buflen()
4235 }
4236
4237 fn bufmaxlen(&self) -> usize {
4238 self.inner.bufmaxlen()
4239 }
4240
4241 fn raw_read_ch(&mut self) -> char {
4242 self.inner.raw_read_ch()
4243 }
4244
4245 fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
4246 self.inner.raw_read_non_breakz_ch()
4247 }
4248
4249 fn skip(&mut self) {
4250 self.inner.skip();
4251 }
4252
4253 fn skip_n(&mut self, count: usize) {
4254 self.inner.skip_n(count);
4255 }
4256
4257 fn peek(&self) -> char {
4258 self.inner.peek()
4259 }
4260
4261 fn peek_nth(&self, n: usize) -> char {
4262 self.inner.peek_nth(n)
4263 }
4264
4265 fn byte_offset(&self) -> Option<usize> {
4266 self.inner.byte_offset()
4267 }
4268
4269 fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
4270 if self.expose_slice {
4271 self.inner.slice_bytes(start, end)
4272 } else {
4273 None
4274 }
4275 }
4276 }
4277
4278 impl<'input> BorrowedInput<'input> for SlicingOnlyInput<'input> {
4279 fn slice_borrowed(&self, _start: usize, _end: usize) -> Option<&'input str> {
4280 None
4281 }
4282 }
4283
/// `x` is accepted as an anchor character.
#[test]
fn test_is_anchor_char() {
    assert!(super::is_anchor_char('x'));
}
4289
/// The scanner must yield the first flow scalar without reading the whole input: the number
/// of characters read is bounded by `SIMPLE_KEY_MAX_LOOKAHEAD` plus a small margin.
#[test]
fn flow_simple_key_length_limit_bounds_buffering() {
    // A flow sequence opening followed by 601 quoted scalars, one per line.
    let mut yaml = String::from("[\n\"start\"\n");
    yaml.push_str(&"\"x\"\n".repeat(600));
    let total_chars = yaml.chars().count();

    let read = Rc::new(Cell::new(0));
    let source = CountingChars {
        chars: yaml.chars().collect::<Vec<_>>().into_iter(),
        read: Rc::clone(&read),
    };
    let mut scanner = Scanner::new(BufferedInput::new(source));

    let stream_start = scanner.next_token().unwrap().unwrap();
    assert!(matches!(stream_start.1, TokenType::StreamStart(_)));

    let sequence_start = scanner.next_token().unwrap().unwrap();
    assert!(matches!(sequence_start.1, TokenType::FlowSequenceStart));

    let first_scalar = scanner.next_token().unwrap().unwrap();
    assert!(matches!(
        first_scalar.1,
        TokenType::Scalar(_, ref value) if value == "start"
    ));

    let chars_read = read.get();
    assert!(
        chars_read < total_chars,
        "scanner consumed all {total_chars} chars before yielding the first flow scalar"
    );
    assert!(
        chars_read <= super::SIMPLE_KEY_MAX_LOOKAHEAD + 128,
        "scanner read {} chars before yielding the first flow scalar",
        read.get()
    );
}
4327
/// Scanning a comment token leaves `leading_whitespace` set, for both a `StrInput` and a
/// `BufferedInput`.
#[test]
fn comment_capture_does_not_change_leading_whitespace() {
    // Borrowing input.
    let mut str_scanner = Scanner::new(StrInput::new("# comment\n"));
    let str_token = str_scanner.scan_comment_token().unwrap();
    assert!(str_scanner.leading_whitespace);
    assert!(matches!(str_token.1, TokenType::Comment(ref comment) if comment.text == " comment"));

    // Buffered (streaming) input.
    let mut stream_scanner = Scanner::new(BufferedInput::new("# streaming\n".chars()));
    stream_scanner.input.lookahead(1);
    let stream_token = stream_scanner.scan_comment_token().unwrap();
    assert!(stream_scanner.leading_whitespace);
    assert!(matches!(stream_token.1, TokenType::Comment(ref comment) if comment.text == " streaming"));
}
4345
/// When the input provides `slice_bytes` but no borrowed slice, the comment text is an
/// owned copy.
#[test]
fn comment_capture_falls_back_to_owned_slice_when_borrow_unavailable() {
    let input = SlicingOnlyInput::new("# sliced\n", true);
    let mut scanner = Scanner::new(input);
    scanner.input.lookahead(2);
    assert_eq!(scanner.input.peek_nth(1), ' ');

    let comment_token = scanner.scan_comment_token().unwrap();

    assert!(matches!(comment_token.1, TokenType::Comment(ref comment)
        if matches!(comment.text, Cow::Owned(ref text) if text == " sliced")));
}
4357
/// An input that reports byte offsets but returns no slice makes comment scanning fail with
/// an internal error.
#[test]
fn comment_capture_errors_when_offsets_have_no_slice() {
    let input = SlicingOnlyInput::new("# broken\n", false);
    let mut scanner = Scanner::new(input);

    let failure = scanner.scan_comment_token().unwrap_err();

    assert_eq!(
        failure.info(),
        "internal error: input advertised offsets but did not provide a slice"
    );
}
4369
/// Converting each listed public token into a `QueuedToken` and back yields the original
/// token.
#[test]
fn queued_token_roundtrips_public_token_variants() {
    let sample_span = Span::new(Marker::new(0, 1, 0), Marker::new(7, 1, 7));
    let with_span = |kind| Token(sample_span, kind);

    let variants = [
        with_span(TokenType::StreamStart(TEncoding::Utf8)),
        with_span(TokenType::StreamEnd),
        with_span(TokenType::VersionDirective(1, 2)),
        with_span(TokenType::TagDirective(
            Cow::Borrowed("!app!"),
            Cow::Borrowed("tag:app.example,"),
        )),
        with_span(TokenType::DocumentStart),
        with_span(TokenType::DocumentEnd),
        with_span(TokenType::BlockSequenceStart),
        with_span(TokenType::BlockMappingStart),
        with_span(TokenType::BlockEnd),
        with_span(TokenType::FlowSequenceStart),
        with_span(TokenType::FlowSequenceEnd),
        with_span(TokenType::FlowMappingStart),
        with_span(TokenType::FlowMappingEnd),
        with_span(TokenType::BlockEntry),
        with_span(TokenType::FlowEntry),
        with_span(TokenType::Key),
        with_span(TokenType::Value),
        with_span(TokenType::Alias(Cow::Borrowed("alias"))),
        with_span(TokenType::Anchor(Cow::Borrowed("anchor"))),
        with_span(TokenType::Tag(Cow::Borrowed("!"), Cow::Borrowed("tag"))),
        with_span(TokenType::Scalar(
            ScalarStyle::Literal,
            Cow::Borrowed("scalar"),
        )),
        with_span(TokenType::Comment(
            Comment::new(sample_span, Cow::Borrowed(" comment"))
                .with_placement(Placement::Right),
        )),
        with_span(TokenType::ReservedDirective(
            "reserved".to_owned(),
            vec!["one".to_owned(), "two".to_owned()],
        )),
    ];

    for expected in variants {
        let queued: QueuedToken = expected.clone().into();
        let roundtripped = queued.into_public();

        assert_eq!(roundtripped, expected);
    }
}
4425
/// With the flag set to `false`, `skip_yaml_whitespace` must move past the leading
/// comment line without queuing any token.
#[test]
fn comment_skipping_path_consumes_comment_without_tokenizing_it() {
    let source = StrInput::new("# skipped\nnext: value\n");
    let mut scanner = Scanner::new(source);

    scanner.skip_yaml_whitespace(false).unwrap();

    assert!(scanner.tokens.is_empty());
    // The mark is expected at the start of the second line.
    let position = (scanner.mark.line(), scanner.mark.col());
    assert_eq!(position, (2, 0));
}
4436
/// With the flag set to `true`, `skip_yaml_whitespace` must return `true`, queue exactly
/// one comment token (`" queued"`), and leave the mark on line 1, column 9.
#[test]
fn yaml_whitespace_can_stop_after_queued_comment() {
    let source = StrInput::new(" # queued\n# later\n");
    let mut scanner = Scanner::new(source);

    let stopped = scanner.skip_yaml_whitespace(true).unwrap();
    assert!(stopped);

    assert_eq!(scanner.tokens.len(), 1);
    let queued = scanner.tokens.front().unwrap();
    assert!(matches!(
        queued.1,
        QueuedTokenType::Comment(ref comment) if comment.text == " queued"
    ));

    let position = (scanner.mark.line(), scanner.mark.col());
    assert_eq!(position, (1, 9));
}
4451
/// With the flag set to `true`, `skip_to_next_token` must return `true` after queuing
/// only the first comment, leaving the mark at the start of line 2.
#[test]
fn token_skip_can_stop_after_queued_comment() {
    let source = StrInput::new("# first\n# second\n");
    let mut scanner = Scanner::new(source);

    let stopped = scanner.skip_to_next_token(true).unwrap();
    assert!(stopped);

    assert_eq!(scanner.tokens.len(), 1);
    let queued = scanner.tokens.front().unwrap();
    assert!(matches!(
        queued.1,
        QueuedTokenType::Comment(ref comment) if comment.text == " first"
    ));

    let position = (scanner.mark.line(), scanner.mark.col());
    assert_eq!(position, (2, 0));
}
4466
/// Two leading comments must be emitted one at a time: after the first comment token is
/// returned the queue is empty, and the second comment only appears on the next call.
#[test]
fn scanner_emits_first_leading_comment_before_scanning_next_comment() {
    let source = StrInput::new("# first\n# second\nkey: value\n");
    let mut scanner = Scanner::new(source);

    let stream_start = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(stream_start, TokenType::StreamStart(_)));

    let first = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(
        first,
        TokenType::Comment(ref comment) if comment.text == " first"
    ));

    // Nothing may have been scanned ahead of the comment that was just returned.
    assert!(scanner.tokens.is_empty());

    let second = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(
        second,
        TokenType::Comment(ref comment) if comment.text == " second"
    ));
}
4485
/// For a double-quoted scalar followed by a trailing comment, the token order must be
/// stream start, the `key` scalar, then the ` quoted` comment.
#[test]
fn scanner_emits_quoted_scalar_comment_before_scanning_following_value() {
    let source = StrInput::new("\"key\" # quoted\n: value\n");
    let mut scanner = Scanner::new(source);

    let stream_start = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(stream_start, TokenType::StreamStart(_)));

    let scalar = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(
        scalar,
        TokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
    ));

    let comment_token = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(
        comment_token,
        TokenType::Comment(ref comment) if comment.text == " quoted"
    ));
}
4503
/// After `fetch_flow_scalar` handles a quoted scalar that is followed by a comment line,
/// `adjacent_value_allowed_at` must be `usize::MAX`, the scalar must be first in the
/// queue, and the comment must be queued as well.
#[test]
fn flow_scalar_comment_disables_adjacent_value_lookahead() {
    let source = StrInput::new("\"key\"\n# quoted\n: value\n");
    let mut scanner = Scanner::new(source);

    scanner.fetch_flow_scalar(false).unwrap();

    assert_eq!(scanner.adjacent_value_allowed_at, usize::MAX);

    let head = scanner.tokens.front().unwrap();
    assert!(matches!(
        head.1,
        QueuedTokenType::Scalar(ScalarStyle::DoubleQuoted, ref value) if value == "key"
    ));

    let comment_was_queued = scanner.tokens.iter().any(|queued| {
        matches!(
            &queued.1,
            QueuedTokenType::Comment(comment) if comment.text == " quoted"
        )
    });
    assert!(comment_was_queued);
}
4520
/// When two comments precede an invalid `@`, both comment tokens must be delivered
/// before the "unexpected character" error is returned.
#[test]
fn deferred_error_waits_for_all_comment_tokens() {
    let source = StrInput::new("# first\n# second\n@\n");
    let mut scanner = Scanner::new(source);

    let stream_start = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(stream_start, TokenType::StreamStart(_)));

    let first = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(
        first,
        TokenType::Comment(ref comment) if comment.text == " first"
    ));

    let second = scanner.next_token().unwrap().unwrap().1;
    assert!(matches!(
        second,
        TokenType::Comment(ref comment) if comment.text == " second"
    ));

    let failure = scanner.next_token().unwrap_err();
    let mentions_character = failure.info().contains("unexpected character");

    assert!(mentions_character);
}
4542
/// An anchor name scanned from a `StrInput` must be returned as `Cow::Borrowed`.
#[test]
fn anchor_name_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("&anch\n"));

    // Pull tokens until the anchor appears; running out of tokens fails the test.
    let name = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Anchor(name) = token.1 {
            break name;
        }
    };

    assert!(matches!(name, Cow::Borrowed("anch")));
}
4559
/// An anchor name must stop before a U+0001 control character: the anchor is `foo`.
///
/// The follow-up check on the next token only runs when that token is a scalar; any
/// other outcome is accepted silently.
#[test]
fn anchor_name_rejects_non_printable_control_chars() {
    let mut scanner = Scanner::new(StrInput::new("&foo\u{0001}\n"));

    let name = loop {
        let token = scanner
            .next_token()
            .expect("scanning should not fail")
            .expect("scanner must eventually produce a token");
        if let TokenType::Anchor(name) = token.1 {
            break name;
        }
    };
    assert!(matches!(name, Cow::Borrowed("foo")));

    let following = scanner.next_token().expect("scanning should not fail");
    if let Some(Token(_, TokenType::Scalar(_, remainder))) = following {
        assert!(remainder.starts_with('\u{0001}'));
    }
}
4580
/// An alias name must stop before a U+0001 control character: the alias is `foo`.
///
/// The follow-up check on the next token only runs when that token is a scalar; any
/// other outcome is accepted silently.
#[test]
fn alias_name_rejects_non_printable_control_chars() {
    let mut scanner = Scanner::new(StrInput::new("*foo\u{0001}\n"));

    let name = loop {
        let token = scanner
            .next_token()
            .expect("scanning should not fail")
            .expect("scanner must eventually produce a token");
        if let TokenType::Alias(name) = token.1 {
            break name;
        }
    };
    assert!(matches!(name, Cow::Borrowed("foo")));

    let following = scanner.next_token().expect("scanning should not fail");
    if let Some(Token(_, TokenType::Scalar(_, remainder))) = following {
        assert!(remainder.starts_with('\u{0001}'));
    }
}
4600
/// An alias name scanned from a `StrInput` must be returned as `Cow::Borrowed`.
#[test]
fn alias_name_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("*anch\n"));

    // Pull tokens until the alias appears; running out of tokens fails the test.
    let name = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Alias(name) = token.1 {
            break name;
        }
    };

    assert!(matches!(name, Cow::Borrowed("anch")));
}
4616
/// Both the handle and the prefix of a `%TAG` directive scanned from a `StrInput` must
/// be returned as `Cow::Borrowed`.
#[test]
fn tag_directive_parts_are_borrowed_for_str_input() {
    let source = StrInput::new("%TAG !e! tag:example.com,2000:app/\n");
    let mut scanner = Scanner::new(source);

    let (handle, prefix) = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::TagDirective(handle, prefix) = token.1 {
            break (handle, prefix);
        }
    };

    assert!(matches!(handle, Cow::Borrowed("!e!")));
    assert!(matches!(prefix, Cow::Borrowed("tag:example.com,2000:app/")));
}
4634
/// A plain scalar without whitespace scanned from a `StrInput` must be `Cow::Borrowed`.
#[test]
fn plain_scalar_is_borrowed_when_whitespace_free_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("foo\n"));

    // The first scalar token is the one under test.
    let scalar = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = token.1 {
            break value;
        }
    };

    assert!(matches!(scalar, Cow::Borrowed("foo")));
}
4650
/// A single-line plain scalar containing an inner space must still be `Cow::Borrowed`
/// when scanned from a `StrInput`.
#[test]
fn plain_scalar_is_borrowed_when_whitespace_present_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("foo bar\n"));

    // The first scalar token is the one under test.
    let scalar = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = token.1 {
            break value;
        }
    };

    assert!(matches!(scalar, Cow::Borrowed("foo bar")));
}
4666
/// A single-quoted scalar with no escaped quote must be `Cow::Borrowed` when scanned
/// from a `StrInput`.
#[test]
fn single_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("'foo bar'\n"));

    // The first scalar token is the one under test.
    let scalar = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = token.1 {
            break value;
        }
    };

    assert!(matches!(scalar, Cow::Borrowed("foo bar")));
}
4682
/// A single-quoted scalar containing a doubled quote must be `Cow::Owned`, with the
/// doubled quote collapsed to one.
#[test]
fn single_quoted_scalar_is_owned_when_quote_is_escaped_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("'foo''bar'\n"));

    // The first scalar token is the one under test.
    let scalar = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = token.1 {
            break value;
        }
    };

    assert!(matches!(scalar, Cow::Owned(_)));
    assert_eq!(&*scalar, "foo'bar");
}
4699
/// A double-quoted scalar with no escape sequence must be `Cow::Borrowed` when scanned
/// from a `StrInput`.
#[test]
fn double_quoted_scalar_is_borrowed_when_verbatim_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("\"foo bar\"\n"));

    // The first scalar token is the one under test.
    let scalar = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = token.1 {
            break value;
        }
    };

    assert!(matches!(scalar, Cow::Borrowed("foo bar")));
}
4715
/// A double-quoted scalar containing a `\n` escape must be `Cow::Owned`, with the
/// escape decoded to a real line feed.
#[test]
fn double_quoted_scalar_is_owned_when_escape_sequence_present_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("\"foo\\nbar\"\n"));

    // The first scalar token is the one under test.
    let scalar = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Scalar(_, value) = token.1 {
            break value;
        }
    };

    assert!(matches!(scalar, Cow::Owned(_)));
    assert_eq!(&*scalar, "foo\nbar");
}
4732
/// The first scalar that follows a `Key` token for a plain mapping key must be
/// `Cow::Borrowed` when scanned from a `StrInput`.
#[test]
fn plain_key_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("mykey: value\n"));

    let mut seen_key = false;
    let mut scalar_after_key: Option<Cow<'_, str>> = None;

    // Walk the stream until it ends or the first scalar after a `Key` token is found.
    while let Some(token) = scanner
        .next_token()
        .expect("valid YAML must scan without errors")
    {
        match token.1 {
            TokenType::Key => seen_key = true,
            TokenType::Scalar(_, value) if seen_key => {
                scalar_after_key = Some(value);
                break;
            }
            _ => {}
        }
    }

    assert!(seen_key, "expected to find a Key token");
    let key_value = scalar_after_key.expect("expected to find a scalar after Key token");
    assert!(
        matches!(key_value, Cow::Borrowed("mykey")),
        "key should be borrowed, got: {key_value:?}"
    );
}
4764
/// The first scalar that follows a `Key` token for a double-quoted key without escapes
/// must be `Cow::Borrowed` when scanned from a `StrInput`.
#[test]
fn quoted_key_is_borrowed_when_verbatim_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("\"mykey\": value\n"));

    let mut seen_key = false;
    let mut scalar_after_key: Option<Cow<'_, str>> = None;

    // Walk the stream until it ends or the first scalar after a `Key` token is found.
    while let Some(token) = scanner
        .next_token()
        .expect("valid YAML must scan without errors")
    {
        match token.1 {
            TokenType::Key => seen_key = true,
            TokenType::Scalar(_, value) if seen_key => {
                scalar_after_key = Some(value);
                break;
            }
            _ => {}
        }
    }

    assert!(seen_key, "expected to find a Key token");
    let key_value = scalar_after_key.expect("expected to find a scalar after Key token");
    assert!(
        matches!(key_value, Cow::Borrowed("mykey")),
        "quoted key should be borrowed when verbatim, got: {key_value:?}"
    );
}
4795
/// For the tag `!!str`, both the `!!` handle and the `str` suffix must be
/// `Cow::Borrowed` when scanned from a `StrInput`.
#[test]
fn tag_handle_and_suffix_are_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("!!str foo\n"));

    let (handle, suffix) = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Tag(handle, suffix) = token.1 {
            break (handle, suffix);
        }
    };

    assert!(
        matches!(handle, Cow::Borrowed("!!")),
        "tag handle should be borrowed, got: {handle:?}"
    );
    assert!(
        matches!(suffix, Cow::Borrowed("str")),
        "tag suffix should be borrowed, got: {suffix:?}"
    );
}
4819
/// For the local tag `!mytag`, the handle must be a borrowed `!` and the suffix a
/// borrowed `mytag` when scanned from a `StrInput`.
#[test]
fn local_tag_suffix_is_borrowed_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("!mytag foo\n"));

    let (handle, suffix) = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Tag(handle, suffix) = token.1 {
            break (handle, suffix);
        }
    };

    assert!(
        matches!(handle, Cow::Borrowed("!")),
        "local tag handle should be '!', got: {handle:?}"
    );
    assert!(
        matches!(suffix, Cow::Borrowed("mytag")),
        "local tag suffix should be borrowed, got: {suffix:?}"
    );
}
4843
/// A tag suffix containing a `%20` escape must be `Cow::Owned` with the escape decoded
/// to a space, while the `!!` handle stays borrowed.
#[test]
fn tag_with_uri_escape_is_owned_for_str_input() {
    let mut scanner = Scanner::new(StrInput::new("!!my%20tag foo\n"));

    let (handle, suffix) = loop {
        let token = scanner
            .next_token()
            .expect("valid YAML must scan without errors")
            .expect("scanner must eventually produce a token");
        if let TokenType::Tag(handle, suffix) = token.1 {
            break (handle, suffix);
        }
    };

    assert!(
        matches!(handle, Cow::Borrowed("!!")),
        "tag handle should still be borrowed, got: {handle:?}"
    );
    assert!(
        matches!(suffix, Cow::Owned(_)),
        "tag suffix with URI escape should be owned, got: {suffix:?}"
    );
    assert_eq!(&*suffix, "my tag");
}
4868
/// Exercises the pending-whitespace bookkeeping of `FlowScalarBuf`.
///
/// For a borrowed buffer, committing a pending range (5, 8) must leave `end` and
/// `pending_ws_end` at 8 with no pending start; discarding a later range (9, 11) must
/// leave that state unchanged. A borrowed buffer has no owned string, while an owned
/// buffer exposes one that can be appended to.
#[test]
fn flow_scalar_buffer_tracks_pending_whitespace() {
    let mut buf = super::FlowScalarBuf::new_borrowed(2);

    buf.note_pending_ws(5, 8);
    buf.commit_pending_ws();
    let committed = matches!(
        buf,
        super::FlowScalarBuf::Borrowed {
            end: 8,
            pending_ws_start: None,
            pending_ws_end: 8,
            ..
        }
    );
    assert!(committed);

    buf.note_pending_ws(9, 11);
    buf.discard_pending_ws();
    let unchanged_after_discard = matches!(
        buf,
        super::FlowScalarBuf::Borrowed {
            end: 8,
            pending_ws_start: None,
            pending_ws_end: 8,
            ..
        }
    );
    assert!(unchanged_after_discard);
    assert!(buf.as_owned_mut().is_none());

    let mut owned_buf = super::FlowScalarBuf::new_owned();
    let text = owned_buf.as_owned_mut().unwrap();
    text.push_str("owned");
    assert!(matches!(owned_buf, super::FlowScalarBuf::Owned(ref s) if s == "owned"));
}
4902
4903 fn first_scanner_error_info(input: &str) -> String {
4904 let mut scanner = Scanner::new(StrInput::new(input));
4905 loop {
4906 match scanner.next_token() {
4907 Ok(Some(_)) => {}
4908 Ok(None) => panic!("expected scanner error"),
4909 Err(error) => return error.info().to_owned(),
4910 }
4911 }
4912 }
4913
4914 fn first_scalar_value(input: &str) -> String {
4915 let mut scanner = Scanner::new(StrInput::new(input));
4916 loop {
4917 match scanner.next_token().expect("scanner should not error") {
4918 Some(Token(_, TokenType::Scalar(_, value))) => return value.into_owned(),
4919 Some(_) => {}
4920 None => panic!("expected scalar token"),
4921 }
4922 }
4923 }
4924
/// Draining `next()` over an unterminated double-quoted scalar must leave an
/// "unclosed quote" error retrievable via `get_error`, and `next()` must keep
/// returning `None` afterwards.
#[test]
fn iterator_next_records_error_and_then_stays_empty() {
    let mut scanner = Scanner::new(StrInput::new("\"unterminated"));

    // Drain every item until `next()` first yields `None`.
    loop {
        if scanner.next().is_none() {
            break;
        }
    }

    let error = scanner
        .get_error()
        .expect("scanner should retain the error");
    assert_eq!(error.info(), "unclosed quote");

    let exhausted = scanner.next().is_none();
    assert!(exhausted);
}
4937
/// Once `StreamEnd` has been produced for an empty input, the scanner must report the
/// stream as started and ended, and `next_token` must return `Ok(None)`.
#[test]
fn next_token_returns_none_after_stream_end() {
    let mut scanner = Scanner::new(StrInput::new(""));

    // Consume tokens up to and including `StreamEnd`, or until none are left.
    loop {
        match scanner.next_token().unwrap() {
            Some(Token(_, TokenType::StreamEnd)) | None => break,
            Some(_) => {}
        }
    }

    assert!(scanner.stream_started());
    assert!(scanner.stream_ended());

    let after_end = scanner.next_token().unwrap();
    assert!(after_end.is_none());
}
4952
/// A lone `%` with no directive name must produce the missing-directive-name error.
#[test]
fn directive_name_must_be_present() {
    let message = first_scanner_error_info("%\n");

    assert_eq!(
        message,
        "while scanning a directive, could not find expected directive name"
    );
}
4960
/// `%YAML 1` (no `.` after the major number) must produce the "expected digit or '.'"
/// error.
#[test]
fn yaml_directive_requires_dot_between_version_numbers() {
    let message = first_scanner_error_info("%YAML 1\n");

    assert_eq!(
        message,
        "while scanning a YAML directive, did not find expected digit or '.' character"
    );
}
4968
/// `%YAML .2` (no major number) must produce the "expected version number" error.
#[test]
fn yaml_directive_requires_major_version_number() {
    let message = first_scanner_error_info("%YAML .2\n");

    assert_eq!(
        message,
        "while scanning a YAML directive, did not find expected version number"
    );
}
4976
/// A ten-digit major version in `%YAML` must produce the "extremely long version
/// number" error.
#[test]
fn yaml_directive_rejects_extremely_long_version_number() {
    let message = first_scanner_error_info("%YAML 1234567890.2\n");

    assert_eq!(
        message,
        "while scanning a YAML directive, found extremely long version number"
    );
}
4984
/// A `%TAG` handle that starts with `!` but does not end with one (`!bad`) must produce
/// the tag-directive "did not find expected '!'" error.
#[test]
fn tag_directive_handle_must_end_with_bang() {
    let message = first_scanner_error_info("%TAG !bad tag:example.com,2024:\n");

    assert_eq!(
        message,
        "while parsing a tag directive, did not find expected '!'"
    );
}
4992
/// A `%TAG` handle that does not start with `!` (`bad!`) must produce the tag-scanning
/// "did not find expected '!'" error.
#[test]
fn tag_directive_handle_must_start_with_bang() {
    let message = first_scanner_error_info("%TAG bad! tag:example.com,2024:\n");

    assert_eq!(message, "while scanning a tag, did not find expected '!'");
}
5000
/// A `%TAG` prefix starting with a backtick must produce the "invalid global tag
/// character" error.
#[test]
fn tag_directive_prefix_must_start_with_tag_character() {
    let message = first_scanner_error_info("%TAG !e! `bad\n");

    assert_eq!(message, "invalid global tag character");
}
5008
/// A `%TAG` prefix followed directly by `^suffix` must produce the "expected whitespace
/// or line break" error.
#[test]
fn tag_directive_prefix_must_end_before_invalid_content() {
    let message = first_scanner_error_info("%TAG !e! tag:example.com^suffix\n");

    assert_eq!(
        message,
        "while scanning TAG, did not find expected whitespace or line break"
    );
}
5016
/// A `%TAG` prefix containing `%20` must be `Cow::Owned` with the escape decoded to a
/// space, while the handle stays borrowed.
#[test]
fn tag_directive_prefix_with_uri_escape_is_owned_and_decoded() {
    let source = StrInput::new("%TAG !e! tag:example.com,2024:some%20app/\n");
    let mut scanner = Scanner::new(source);

    let (handle, prefix) = loop {
        let token = scanner
            .next_token()
            .expect("valid directive should scan")
            .expect("scanner must produce a directive token");
        if let TokenType::TagDirective(handle, prefix) = token.1 {
            break (handle, prefix);
        }
    };

    assert!(matches!(handle, Cow::Borrowed("!e!")));
    assert!(matches!(prefix, Cow::Owned(_)));
    assert_eq!(&*prefix, "tag:example.com,2024:some app/");
}
5035
/// A bare `!` followed by a space must scan as a tag with an empty handle and the
/// suffix `!`.
#[test]
fn bare_bang_tag_scans_as_non_specific_tag() {
    let mut scanner = Scanner::new(StrInput::new("! foo\n"));

    let (handle, suffix) = loop {
        let token = scanner
            .next_token()
            .expect("valid tag should scan")
            .expect("scanner must produce a tag token");
        if let TokenType::Tag(handle, suffix) = token.1 {
            break (handle, suffix);
        }
    };

    assert_eq!(&*handle, "");
    assert_eq!(&*suffix, "!");
}
5052
/// The input `!foo,bar` must produce the tag "expected whitespace or line break" error.
#[test]
fn tag_requires_separation_after_suffix() {
    let message = first_scanner_error_info("!foo,bar\n");

    assert_eq!(
        message,
        "while scanning a tag, did not find expected whitespace or line break"
    );
}
5060
/// An empty verbatim tag `!<>` must produce the "expected tag URI" error.
#[test]
fn verbatim_tag_requires_uri() {
    let message = first_scanner_error_info("!<> foo\n");

    assert_eq!(
        message,
        "while parsing a tag, did not find expected tag URI"
    );
}
5068
/// A verbatim tag opened with `!<` and never closed must produce the "did not find the
/// expected '>'" error.
#[test]
fn verbatim_tag_requires_closing_angle_bracket() {
    let message = first_scanner_error_info("!<tag:yaml.org,2002:str foo\n");

    assert_eq!(
        message,
        "while scanning a verbatim tag, did not find the expected '>'"
    );
}
5076
/// A tag escape `%zz` (non-hex digits) must produce the "invalid escape sequence" error.
#[test]
fn tag_uri_escape_requires_hex_digits() {
    let message = first_scanner_error_info("!!bad%zz foo\n");

    assert_eq!(
        message,
        "while parsing a tag, found an invalid escape sequence"
    );
}
5084
/// A tag escape `%80` must produce the "incorrect leading UTF-8 byte" error.
#[test]
fn tag_uri_escape_rejects_bad_leading_utf8_byte() {
    let message = first_scanner_error_info("!!bad%80 foo\n");

    assert_eq!(
        message,
        "while parsing a tag, found an incorrect leading UTF-8 byte"
    );
}
5092
/// A tag escape sequence `%C2%41` must produce the "incorrect trailing UTF-8 byte"
/// error.
#[test]
fn tag_uri_escape_rejects_bad_trailing_utf8_byte() {
    let message = first_scanner_error_info("!!bad%C2%41 foo\n");

    assert_eq!(
        message,
        "while parsing a tag, found an incorrect trailing UTF-8 byte"
    );
}
5100
/// A tag escape sequence `%F4%90%80%80` must produce the "invalid UTF-8 codepoint"
/// error.
#[test]
fn tag_uri_escape_rejects_invalid_utf8_codepoint() {
    let message = first_scanner_error_info("!!bad%F4%90%80%80 foo\n");

    assert_eq!(
        message,
        "while parsing a tag, found an invalid UTF-8 codepoint"
    );
}
5108
/// A `&` or `*` followed directly by a space must produce the same "expected alphabetic
/// or numeric character" error.
#[test]
fn anchors_and_aliases_require_names() {
    let expected =
        "while scanning an anchor or alias, did not find expected alphabetic or numeric character";

    // Anchor indicator first, then alias indicator.
    for input in ["& \n", "* \n"] {
        assert_eq!(first_scanner_error_info(input), expected);
    }
}
5117
/// Text on the same line after `...` must produce the "invalid content after document
/// end marker" error.
#[test]
fn document_end_marker_rejects_trailing_content() {
    let message = first_scanner_error_info("... trailing\n");

    assert_eq!(message, "invalid content after document end marker");
}
5125
/// An `@` after a leading space must produce the "unexpected character" error naming
/// that character.
#[test]
fn reserved_indicators_are_rejected_outside_directives() {
    let message = first_scanner_error_info(" @\n");

    assert_eq!(message, "unexpected character: `@'");
}
5133
/// A `- ` entry indicator inside `[...]` must produce the "only valid inside a block"
/// error.
#[test]
fn flow_block_entry_indicator_is_rejected() {
    let message = first_scanner_error_info("[- ]\n");

    assert_eq!(message, r#""-" is only valid inside a block"#);
}
5141
/// A `-` followed by a tab and another `-` must produce the "'-' must be followed by a
/// valid YAML whitespace" error.
#[test]
fn block_entry_after_tabbed_separator_reports_specific_error() {
    let message = first_scanner_error_info("-\t- value\n");

    assert_eq!(message, "'-' must be followed by a valid YAML whitespace");
}
5149
/// A `---` line reached while a `[` is still open must produce the "unclosed bracket"
/// error.
#[test]
fn document_indicator_reports_unclosed_flow_collection() {
    let message = first_scanner_error_info("[\n---\n");

    assert_eq!(message, "unclosed bracket '['");
}
5154
/// Plain text after a `|+` block scalar header must produce the "expected comment or
/// line break" error.
#[test]
fn block_scalar_header_rejects_trailing_content() {
    let message = first_scanner_error_info("|+ trailing\n");

    assert_eq!(
        message,
        "while scanning a block scalar, did not find expected comment or line break"
    );
}
5162
/// An indentation indicator of `0` must be rejected with the same error whether it
/// directly follows `|` or follows a chomping indicator (`|+0`).
#[test]
fn block_scalar_rejects_zero_indent_indicator() {
    let expected = "while scanning a block scalar, found an indentation indicator equal to 0";

    for input in ["|0\n", "|+0\n"] {
        assert_eq!(first_scanner_error_info(input), expected);
    }
}
5170
/// A block scalar with no content must yield an empty value for `|-` and a single line
/// feed for `|+`.
#[test]
fn empty_block_scalar_at_eof_honors_chomping() {
    // (input, expected scalar value)
    let cases = [("|-\n", ""), ("|+\n", "\n")];

    for (input, expected) in cases {
        assert_eq!(first_scalar_value(input), expected);
    }
}
5176
/// A `|1` block scalar immediately followed by a `...` line must yield an empty value.
#[test]
fn explicit_indent_block_scalar_can_end_at_document_marker() {
    let value = first_scalar_value("|1\n...\n");

    assert_eq!(value, "");
}
5181
/// A `|2` block scalar whose content line has no indentation must produce the "wrongly
/// indented line in block scalar" error.
#[test]
fn root_explicit_indent_block_scalar_rejects_underindented_content() {
    let message = first_scanner_error_info("|2\nx\n");

    assert_eq!(message, "wrongly indented line in block scalar");
}
5189
/// A `---` line inside an open double-quoted scalar must produce the "unexpected
/// document indicator" error.
#[test]
fn quoted_scalar_rejects_document_indicator_at_line_start() {
    let message = first_scanner_error_info("\"one\n---\ntwo\"\n");

    assert_eq!(
        message,
        "while scanning a quoted scalar, found unexpected document indicator"
    );
}
5197
/// A continuation line of a quoted mapping value that starts with a tab must produce
/// the "tab cannot be used as indentation" error.
#[test]
fn quoted_scalar_rejects_tab_indentation_after_line_break() {
    let message = first_scanner_error_info("a: \"one\n\tbad\"\n");

    assert_eq!(message, "tab cannot be used as indentation");
}
5205
/// A continuation line of a quoted mapping value that has no indentation must produce
/// the "invalid indentation in multiline quoted scalar" error.
#[test]
fn quoted_scalar_rejects_underindented_continuation() {
    let message = first_scanner_error_info("a: \"one\nbad\"\n");

    assert_eq!(message, "invalid indentation in multiline quoted scalar");
}
5213
/// An unindented scalar inside a flow sequence that was opened on an indented line
/// must produce the "invalid indentation" error.
#[test]
fn indented_flow_scalar_reports_invalid_indentation() {
    let message = first_scanner_error_info("a:\n [\nfoo]\n");

    assert_eq!(message, "invalid indentation");
}
5221
/// The input below (an anchor line followed by a block entry under `a:`) must produce
/// the "simple key expect ':'" error.
#[test]
fn required_simple_key_requires_value_at_stream_end() {
    let message = first_scanner_error_info("a:\n&b\n- c\n");

    assert_eq!(message, "simple key expect ':'");
}
5229
/// A `-` immediately followed by `]` inside a flow sequence must produce the "plain
/// scalar cannot start with '-'" error.
#[test]
fn plain_scalar_rejects_dash_before_flow_indicator() {
    let message = first_scanner_error_info("[-]\n");

    assert_eq!(
        message,
        "plain scalar cannot start with '-' followed by ,[]{}"
    );
}
5237
/// A tab in the separation after the `?` indicator must produce the "tabs disallowed in
/// this context" error.
#[test]
fn explicit_key_rejects_tab_after_indicator() {
    let message = first_scanner_error_info("? \tfoo\n");

    assert_eq!(message, "tabs disallowed in this context");
}
5245
/// In a flow sequence, `a:` immediately followed by `[` must produce the "':' may not
/// precede any of `[{`" error.
#[test]
fn flow_mapping_rejects_adjacent_collection_value_after_plain_key() {
    let message = first_scanner_error_info("[a:[]]\n");

    assert_eq!(message, "':' may not precede any of `[{` in flow mapping");
}
5253
/// In a flow sequence, a `:` that starts the line after its key must produce the
/// "illegal placement of ':' indicator" error.
#[test]
fn implicit_flow_mapping_colon_cannot_move_to_next_line() {
    let message = first_scanner_error_info("[foo\n: bar]\n");

    assert_eq!(message, "illegal placement of ':' indicator");
}
5261
/// `fetch_value` must return the "simple key is no longer valid" error when the last
/// simple key is marked possible but its `token_number` (0) is behind `tokens_parsed`
/// (1) and the token queue has been cleared.
#[test]
fn stale_simple_key_token_position_is_a_scan_error() {
    let mut scanner = Scanner::new(StrInput::new(": value\n"));

    // Put the scanner into the stale state by hand.
    scanner.fetch_stream_start();
    scanner.tokens.clear();
    scanner.tokens_parsed = 1;
    {
        let slot = scanner
            .simple_keys
            .last_mut()
            .expect("stream start should create a simple key slot");
        slot.possible = true;
        slot.token_number = 0;
    }

    let failure = scanner
        .fetch_value()
        .expect_err("stale simple key should be reported as a scan error");

    assert_eq!(failure.info(), "simple key is no longer valid");
}
5281}