1use crate::{error::ErrorContext, Error, Limits, Position, ResourceTracker, Result};
4
5pub mod indentation;
6pub mod scalar_scanner;
7pub mod state;
8pub mod token_processor;
9pub mod tokens;
10pub use scalar_scanner::ScalarScanner;
12pub use tokens::*;
13pub trait Scanner {
17 fn check_token(&self) -> bool;
19
20 fn peek_token(&self) -> Result<Option<&Token>>;
22
23 fn get_token(&mut self) -> Result<Option<Token>>;
25
26 fn reset(&mut self);
28
29 fn position(&self) -> Position;
31
32 fn input(&self) -> &str;
34}
35
36#[derive(Debug)]
38#[allow(dead_code)]
39pub struct BasicScanner {
40 input: String,
41 position: Position,
42 current_char: Option<char>,
43 tokens: Vec<Token>,
44 token_index: usize,
45 done: bool,
46 indent_stack: Vec<usize>,
47 current_indent: usize,
48 allow_simple_key: bool,
49 simple_key_allowed: bool,
50 flow_level: usize,
51 preserve_comments: bool,
52 detected_indent_style: Option<crate::value::IndentStyle>,
54 indent_samples: Vec<(usize, bool)>, previous_indent_level: usize, buffer: String, char_cache: Vec<char>, char_indices: Vec<(usize, char)>, current_char_index: usize, profiler: Option<crate::profiling::YamlProfiler>, scanning_error: Option<Error>, limits: Limits,
66 resource_tracker: ResourceTracker,
67 inline_sequence_depth: usize,
69}
70
71impl BasicScanner {
72 pub fn new(input: String) -> Self {
74 Self::with_limits(input, Limits::default())
75 }
76
77 pub fn with_limits(input: String, limits: Limits) -> Self {
79 let char_cache: Vec<char> = input.chars().collect();
80 let char_indices: Vec<(usize, char)> = input.char_indices().collect();
81 let current_char = char_cache.first().copied();
82
83 let mut resource_tracker = ResourceTracker::new();
85 if let Err(e) = resource_tracker.add_bytes(&limits, input.len()) {
86 return Self {
88 current_char: None,
89 input,
90 position: Position::start(),
91 tokens: Vec::new(),
92 token_index: 0,
93 done: true,
94 indent_stack: vec![0],
95 current_indent: 0,
96 allow_simple_key: false,
97 simple_key_allowed: false,
98 flow_level: 0,
99 preserve_comments: false,
100 detected_indent_style: None,
101 indent_samples: Vec::new(),
102 previous_indent_level: 0,
103 buffer: String::new(),
104 char_cache: Vec::new(),
105 char_indices: Vec::new(),
106 current_char_index: 0,
107 profiler: None,
108 scanning_error: Some(e),
109 limits,
110 resource_tracker,
111 inline_sequence_depth: 0,
112 };
113 }
114
115 Self {
116 current_char,
117 input,
118 position: Position::start(),
119 tokens: Vec::new(),
120 token_index: 0,
121 done: false,
122 indent_stack: vec![0], current_indent: 0,
124 allow_simple_key: true,
125 simple_key_allowed: true,
126 flow_level: 0,
127 preserve_comments: false,
128 detected_indent_style: None,
129 indent_samples: Vec::new(),
130 previous_indent_level: 0,
131 buffer: String::with_capacity(64), char_cache,
133 char_indices,
134 current_char_index: 0,
135 profiler: std::env::var("RUST_YAML_PROFILE")
136 .ok()
137 .map(|_| crate::profiling::YamlProfiler::new()),
138 scanning_error: None,
139 limits,
140 resource_tracker,
141 inline_sequence_depth: 0,
142 }
143 }
144
145 pub fn new_eager(input: String) -> Self {
147 Self::new_eager_with_limits(input, Limits::default())
148 }
149
150 pub fn new_eager_with_limits(input: String, limits: Limits) -> Self {
152 let mut scanner = Self::with_limits(input, limits);
153 if let Err(error) = scanner.scan_all_tokens() {
155 scanner.scanning_error = Some(error);
156 }
157 scanner
158 }
159
160 pub fn new_with_comments(input: String) -> Self {
162 let mut scanner = Self::new(input);
163 scanner.preserve_comments = true;
164 scanner
165 }
166
167 pub fn new_with_comments_and_limits(input: String, limits: Limits) -> Self {
169 let mut scanner = Self::with_limits(input, limits);
170 scanner.preserve_comments = true;
171 scanner
172 }
173
174 pub fn new_eager_with_comments(input: String) -> Self {
176 let mut scanner = Self::new_with_comments(input);
177 scanner.scan_all_tokens().unwrap_or(());
178 scanner
179 }
180
181 pub const fn detected_indent_style(&self) -> Option<&crate::value::IndentStyle> {
183 self.detected_indent_style.as_ref()
184 }
185
186 pub const fn has_scanning_error(&self) -> bool {
188 self.scanning_error.is_some()
189 }
190
191 #[allow(clippy::missing_const_for_fn)]
193 pub fn take_scanning_error(&mut self) -> Option<Error> {
194 self.scanning_error.take()
195 }
196
197 fn advance(&mut self) -> Option<char> {
199 if let Some(ch) = self.current_char {
200 self.position = self.position.advance(ch);
201 self.current_char_index += 1;
202
203 if self.current_char_index < self.char_cache.len() {
204 self.current_char = Some(self.char_cache[self.current_char_index]);
205 } else {
206 self.current_char = None;
207 }
208 }
209
210 self.current_char
211 }
212
213 fn skip_whitespace(&mut self) {
215 while let Some(ch) = self.current_char {
216 if ch == ' ' || ch == '\t' {
217 self.advance();
218 } else {
219 break;
220 }
221 }
222 }
223
224 fn handle_indentation(&mut self) -> Result<()> {
226 if self.flow_level > 0 {
228 return Ok(());
229 }
230
231 let line_start_pos = self.position;
232 let mut indent = 0;
233 let mut has_tabs = false;
234 let mut has_spaces = false;
235 let _indent_start_pos = self.position;
236
237 while let Some(ch) = self.current_char {
239 if ch == ' ' {
240 indent += 1;
241 has_spaces = true;
242 self.advance();
243 } else if ch == '\t' {
244 indent += 8; has_tabs = true;
246 self.advance();
247 } else {
248 break;
249 }
250 }
251
252 if indent > 0
255 && self.current_char.is_some()
256 && !matches!(self.current_char, Some('\n' | '\r'))
257 {
258 self.analyze_indentation_pattern(indent, has_tabs, has_spaces)?;
259 }
260
261 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
263 if indent > 0 && indent % width != 0 {
264 let is_valid_nesting = self.is_valid_indentation_level(indent);
266 if !is_valid_nesting {
267 let lower_level = (indent / width) * width;
268 let higher_level = lower_level + width;
269 let suggestion = format!(
270 "Inconsistent indentation detected. Expected multiples of {} spaces. Use {} or {} spaces instead of {}",
271 width, lower_level, higher_level, indent
272 );
273 let context =
274 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
275 .with_suggestion(suggestion);
276 return Err(Error::indentation_with_context(
277 self.position,
278 lower_level,
279 indent,
280 context,
281 ));
282 }
283 }
284 }
285
286 if indent > 0 {
288 self.previous_indent_level = indent;
289 }
290
291 self.current_indent = indent;
293
294 while let Some(&last_indent) = self.indent_stack.last() {
296 if indent < last_indent && last_indent > 0 {
297 self.indent_stack.pop();
298 self.tokens
299 .push(Token::simple(TokenType::BlockEnd, line_start_pos));
300 } else {
301 break;
302 }
303 }
304
305 Ok(())
306 }
307
308 fn analyze_indentation_pattern(
310 &mut self,
311 current_indent: usize,
312 has_tabs: bool,
313 has_spaces: bool,
314 ) -> Result<()> {
315 if has_tabs && has_spaces {
317 let context = crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
318 .with_suggestion("Use either tabs OR spaces for indentation, not both".to_string());
319 return Err(Error::invalid_character_with_context(
320 self.position,
321 '\t',
322 "mixed indentation",
323 context,
324 ));
325 }
326
327 if has_tabs {
329 match self.detected_indent_style {
330 None => {
331 self.detected_indent_style = Some(crate::value::IndentStyle::Tabs);
333 }
334 Some(crate::value::IndentStyle::Spaces(_)) => {
335 let context =
337 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
338 .with_suggestion(
339 "Use consistent indentation style throughout the document"
340 .to_string(),
341 );
342 return Err(Error::invalid_character_with_context(
343 self.position,
344 '\t',
345 "mixed indentation",
346 context,
347 ));
348 }
349 Some(crate::value::IndentStyle::Tabs) => {
350 }
352 }
353 return Ok(());
354 }
355
356 if has_spaces {
358 if matches!(
360 self.detected_indent_style,
361 Some(crate::value::IndentStyle::Tabs)
362 ) {
363 let context =
364 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
365 .with_suggestion(
366 "Use consistent indentation style throughout the document".to_string(),
367 );
368 return Err(Error::invalid_character_with_context(
369 self.position,
370 ' ',
371 "mixed indentation",
372 context,
373 ));
374 }
375
376 if current_indent > self.previous_indent_level {
378 let indent_diff = current_indent - self.previous_indent_level;
379
380 if indent_diff > 0 && indent_diff <= 8 {
382 self.indent_samples.push((indent_diff, false));
384
385 if self.detected_indent_style.is_none() {
387 self.detect_space_indentation_width();
388 }
389 }
390 }
391
392 self.validate_indentation_consistency(current_indent)?;
394 }
395
396 Ok(())
397 }
398
399 fn detect_space_indentation_width(&mut self) {
401 if self.indent_samples.is_empty() {
402 return; }
404
405 let mut width_counts = std::collections::HashMap::new();
407
408 for &(width, is_tabs) in &self.indent_samples {
409 if !is_tabs && width > 0 {
410 *width_counts.entry(width).or_insert(0) += 1;
411 }
412 }
413
414 if let Some((&most_common_width, &_count)) =
416 width_counts.iter().max_by_key(|&(_, count)| count)
417 {
418 self.detected_indent_style = Some(crate::value::IndentStyle::Spaces(most_common_width));
420 }
421 }
422
423 #[allow(clippy::missing_const_for_fn)] fn is_valid_indentation_level(&self, indent: usize) -> bool {
426 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
430 indent % width == 0
432 } else {
433 true
435 }
436 }
437
438 fn validate_indentation_consistency(&self, current_indent: usize) -> Result<()> {
440 if let Some(crate::value::IndentStyle::Spaces(width)) = self.detected_indent_style {
441 if current_indent > 0 && current_indent % width != 0 {
443 let lower_level = (current_indent / width) * width;
444 let higher_level = lower_level + width;
445 let suggestion = format!(
446 "Expected indentation to be a multiple of {} spaces. Use {} or {} spaces instead of {}",
447 width, lower_level, higher_level, current_indent
448 );
449 let context =
450 crate::error::ErrorContext::from_input(&self.input, &self.position, 4)
451 .with_suggestion(suggestion);
452 return Err(Error::indentation_with_context(
453 self.position,
454 (current_indent / width) * width, current_indent, context,
457 ));
458 }
459 }
460 Ok(())
461 }
462
463 fn is_plain_scalar_start(&self) -> bool {
465 self.current_char.map_or(false, |ch| match ch {
466 '-' | '?' | ':' | ',' | '[' | ']' | '{' | '}' | '#' | '&' | '*' | '!' | '|' | '>'
467 | '\'' | '"' | '%' | '@' | '`' => false,
468 _ => !ch.is_whitespace(),
469 })
470 }
471
/// Returns whether `value` is one of the recognised boolean spellings:
/// `true/false`, `yes/no`, `on/off` in lower, Capitalised or UPPER case.
/// Other mixed-case spellings are not recognised.
fn is_yaml_bool(value: &str) -> bool {
    const BOOL_LITERALS: [&str; 18] = [
        "true", "false", "True", "False", "TRUE", "FALSE", "yes", "no", "Yes", "No", "YES",
        "NO", "on", "off", "On", "Off", "ON", "OFF",
    ];

    BOOL_LITERALS.contains(&value)
}
496
/// Returns whether `value` is a null spelling: `null`, `Null`, `NULL`, `~`
/// or the empty string.
fn is_yaml_null(value: &str) -> bool {
    const NULL_LITERALS: [&str; 5] = ["null", "Null", "NULL", "~", ""];

    NULL_LITERALS.contains(&value)
}
501
502 fn normalize_scalar(value: String) -> String {
504 if Self::is_yaml_bool(&value) {
505 match value.to_lowercase().as_str() {
507 "true" | "yes" | "on" => "true".to_string(),
508 "false" | "no" | "off" => "false".to_string(),
509 _ => value,
510 }
511 } else if Self::is_yaml_null(&value) {
512 "null".to_string()
514 } else {
515 value
516 }
517 }
518
519 fn scan_number(&mut self) -> Result<Token> {
521 let start_pos = self.position;
522 let mut value = String::new();
523
524 if self.current_char == Some('-') {
526 value.push('-');
527 self.advance();
528 }
529
530 while let Some(ch) = self.current_char {
532 if ch.is_ascii_digit() {
533 value.push(ch);
534 self.advance();
535 } else if ch == '.' {
536 value.push(ch);
537 self.advance();
538 while let Some(ch) = self.current_char {
540 if ch.is_ascii_digit() {
541 value.push(ch);
542 self.advance();
543 } else {
544 break;
545 }
546 }
547 break;
548 } else {
549 break;
550 }
551 }
552
553 Ok(Token::new(
554 TokenType::Scalar(value, tokens::QuoteStyle::Plain),
555 start_pos,
556 self.position,
557 ))
558 }
559
560 fn scan_plain_scalar(&mut self) -> Result<Token> {
562 let start_pos = self.position;
563 let mut value = String::new();
564
565 while let Some(ch) = self.current_char {
566 if self.flow_level == 0 {
568 match ch {
569 '\n' | '\r' => break,
570 ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
571 '#' if value.is_empty()
572 || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
573 {
574 break;
575 }
576 _ => {}
577 }
578 } else {
579 match ch {
581 ',' | '[' | ']' | '{' | '}' => break,
582 ':' if self
583 .peek_char(1)
584 .map_or(true, |c| c.is_whitespace() || "]}".contains(c)) =>
585 {
586 break;
587 }
588 '#' if value.is_empty()
589 || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
590 {
591 break;
592 }
593 _ => {}
594 }
595 }
596
597 value.push(ch);
598 self.advance();
599 }
600
601 self.resource_tracker
603 .check_string_length(&self.limits, value.len())?;
604
605 let value = value.trim_end().to_string();
607 let normalized_value = Self::normalize_scalar(value);
608
609 Ok(Token::new(
610 TokenType::Scalar(normalized_value, tokens::QuoteStyle::Plain),
611 start_pos,
612 self.position,
613 ))
614 }
615
616 fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
618 let start_pos = self.position;
619 let mut value = String::new();
620
621 let quote_style = match quote_char {
623 '\'' => tokens::QuoteStyle::Single,
624 '"' => tokens::QuoteStyle::Double,
625 _ => tokens::QuoteStyle::Plain,
626 };
627
628 self.advance(); while let Some(ch) = self.current_char {
631 if ch == quote_char {
632 self.advance(); break;
634 } else if ch == '\\' {
635 self.advance();
636 if let Some(escaped) = self.current_char {
637 match escaped {
638 'n' => value.push('\n'), 't' => value.push('\t'), 'r' => value.push('\r'), '\\' => value.push('\\'), '\'' => value.push('\''), '"' => value.push('"'), '0' => value.push('\0'), 'a' => value.push('\x07'), 'b' => value.push('\x08'), 'f' => value.push('\x0C'), 'v' => value.push('\x0B'), 'e' => value.push('\x1B'), ' ' => value.push(' '), '/' => value.push('/'), _ => {
658 value.push('\\');
659 value.push(escaped);
660 }
661 }
662 self.advance();
663 }
664 } else {
665 value.push(ch);
666 self.advance();
667
668 if value.len() > self.limits.max_string_length {
670 return Err(Error::limit_exceeded(format!(
671 "String length {} exceeds maximum {}",
672 value.len(),
673 self.limits.max_string_length
674 )));
675 }
676 }
677 }
678
679 self.resource_tracker
681 .check_string_length(&self.limits, value.len())?;
682
683 Ok(Token::new(
684 TokenType::Scalar(value, quote_style),
685 start_pos,
686 self.position,
687 ))
688 }
689
690 fn scan_document_start(&mut self) -> Result<Option<Token>> {
692 if self.current_char == Some('-')
693 && self.peek_char(1) == Some('-')
694 && self.peek_char(2) == Some('-')
695 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
696 {
697 let start_pos = self.position;
698 self.advance(); self.advance(); self.advance(); Ok(Some(Token::new(
703 TokenType::DocumentStart,
704 start_pos,
705 self.position,
706 )))
707 } else {
708 Ok(None)
709 }
710 }
711
712 fn scan_yaml_directive(&mut self) -> Result<Option<Token>> {
714 if self.current_char != Some('%') {
715 return Ok(None);
716 }
717
718 let start_pos = self.position;
719 let saved_position = self.position;
720 self.advance(); if self.current_char == Some('Y')
724 && self.peek_char(1) == Some('A')
725 && self.peek_char(2) == Some('M')
726 && self.peek_char(3) == Some('L')
727 && self.peek_char(4).map_or(false, |c| c.is_whitespace())
728 {
729 self.advance(); self.advance(); self.advance(); self.advance(); self.skip_whitespace();
736
737 let major = if let Some(ch) = self.current_char {
739 if ch.is_ascii_digit() {
740 let digit = ch.to_digit(10).unwrap() as u8;
741 self.advance();
742 digit
743 } else {
744 return Err(Error::scan(
745 self.position,
746 "Expected major version number after %YAML".to_string(),
747 ));
748 }
749 } else {
750 return Err(Error::scan(
751 self.position,
752 "Expected version after %YAML directive".to_string(),
753 ));
754 };
755
756 if self.current_char != Some('.') {
758 return Err(Error::scan(
759 self.position,
760 "Expected '.' in YAML version".to_string(),
761 ));
762 }
763 self.advance();
764
765 let minor = if let Some(ch) = self.current_char {
767 if ch.is_ascii_digit() {
768 let digit = ch.to_digit(10).unwrap() as u8;
769 self.advance();
770 digit
771 } else {
772 return Err(Error::scan(
773 self.position,
774 "Expected minor version number after '.'".to_string(),
775 ));
776 }
777 } else {
778 return Err(Error::scan(
779 self.position,
780 "Expected minor version number".to_string(),
781 ));
782 };
783
784 Ok(Some(Token::new(
785 TokenType::YamlDirective(major, minor),
786 start_pos,
787 self.position,
788 )))
789 } else {
790 self.position = saved_position;
792 self.current_char = self
794 .char_indices
795 .iter()
796 .find(|(i, _)| *i == saved_position.index)
797 .map(|(_, ch)| *ch);
798 self.current_char_index = self
800 .char_indices
801 .iter()
802 .position(|(i, _)| *i == saved_position.index)
803 .unwrap_or(0);
804 Ok(None)
805 }
806 }
807
808 fn scan_tag_directive(&mut self) -> Result<Option<Token>> {
810 if self.current_char != Some('%') {
811 return Ok(None);
812 }
813
814 let start_pos = self.position;
815 let saved_position = self.position;
816 self.advance(); if self.current_char == Some('T')
820 && self.peek_char(1) == Some('A')
821 && self.peek_char(2) == Some('G')
822 && self.peek_char(3).map_or(false, |c| c.is_whitespace())
823 {
824 self.advance(); self.advance(); self.advance(); self.skip_whitespace();
830
831 let handle = self.scan_tag_handle()?;
833
834 self.skip_whitespace();
836
837 let prefix = self.scan_tag_prefix()?;
839
840 Ok(Some(Token::new(
841 TokenType::TagDirective(handle, prefix),
842 start_pos,
843 self.position,
844 )))
845 } else {
846 self.position = saved_position;
848 self.current_char = self
850 .char_indices
851 .iter()
852 .find(|(i, _)| *i == saved_position.index)
853 .map(|(_, ch)| *ch);
854 self.current_char_index = self
856 .char_indices
857 .iter()
858 .position(|(i, _)| *i == saved_position.index)
859 .unwrap_or(0);
860 Ok(None)
861 }
862 }
863
864 fn scan_tag_handle(&mut self) -> Result<String> {
866 let mut handle = String::new();
867
868 if self.current_char != Some('!') {
869 return Err(Error::scan(
870 self.position,
871 "Expected '!' at start of tag handle".to_string(),
872 ));
873 }
874
875 handle.push('!');
876 self.advance();
877
878 if self.current_char == Some('!') {
880 handle.push('!');
882 self.advance();
883 } else if self.current_char.map_or(false, |c| c.is_alphanumeric()) {
884 while let Some(ch) = self.current_char {
886 if ch.is_alphanumeric() || ch == '-' || ch == '_' {
887 handle.push(ch);
888 self.advance();
889 } else if ch == '!' {
890 handle.push(ch);
891 self.advance();
892 break;
893 } else {
894 break;
895 }
896 }
897 }
898 Ok(handle)
901 }
902
903 fn scan_tag_prefix(&mut self) -> Result<String> {
905 let mut prefix = String::new();
906
907 while let Some(ch) = self.current_char {
909 if ch == '\n' || ch == '\r' || ch == '#' {
910 break;
911 }
912 if ch.is_whitespace() && prefix.is_empty() {
913 self.advance();
914 continue;
915 }
916 if ch.is_whitespace() && !prefix.is_empty() {
917 break;
919 }
920 prefix.push(ch);
921 self.advance();
922 }
923
924 if prefix.is_empty() {
925 return Err(Error::scan(
926 self.position,
927 "Expected tag prefix after tag handle".to_string(),
928 ));
929 }
930
931 Ok(prefix.trim().to_string())
932 }
933
934 fn is_directive(&self) -> bool {
936 self.current_char == Some('%') && self.position.column == 1
937 }
938
939 fn scan_document_end(&mut self) -> Result<Option<Token>> {
941 if self.current_char == Some('.')
942 && self.peek_char(1) == Some('.')
943 && self.peek_char(2) == Some('.')
944 && self.peek_char(3).map_or(true, |c| c.is_whitespace())
945 {
946 let start_pos = self.position;
947 self.advance(); self.advance(); self.advance(); Ok(Some(Token::new(
952 TokenType::DocumentEnd,
953 start_pos,
954 self.position,
955 )))
956 } else {
957 Ok(None)
958 }
959 }
960
961 fn scan_comment(&mut self) -> Result<Token> {
963 let start_pos = self.position;
964 let mut comment_text = String::new();
965
966 if self.current_char == Some('#') {
968 self.advance();
969 }
970
971 while let Some(ch) = self.current_char {
973 if ch == '\n' || ch == '\r' {
974 break;
975 }
976 comment_text.push(ch);
977 self.advance();
978 }
979
980 let comment_text = comment_text.trim_start().to_string();
982
983 Ok(Token::new(
984 TokenType::Comment(comment_text),
985 start_pos,
986 self.position,
987 ))
988 }
989
/// Scans tokens from the cursor up to the end of the current line and
/// appends them to `self.tokens`.
///
/// Order of work:
/// 1. In column 1, a `%` must start a `%YAML` or `%TAG` directive.
/// 2. In column 1, `---` / `...` document markers are recognised.
/// 3. In column 1, the line's indentation is measured and validated.
/// 4. A line that is only a comment or only a line break is finished early.
/// 5. Otherwise characters are dispatched to the individual token scanners
///    until a line break or the end of input is reached.
/// 6. Sequences that were opened inline on this line are closed again.
///
/// The line break itself is left for the caller, except for the early
/// return in step 4.
///
/// # Errors
/// Propagates scanner, indentation and resource-limit errors, and rejects
/// unknown directives and characters that cannot start any token.
990 #[allow(clippy::cognitive_complexity)]
992 fn process_line(&mut self) -> Result<()> {
// Step 1: directives. Each directive scanner returns `None` when the text
// is a different directive; if '%' is still current afterwards, the
// directive is unknown.
993 if self.position.column == 1 && self.current_char == Some('%') {
995 if let Some(token) = self.scan_yaml_directive()? {
997 self.tokens.push(token);
998 return Ok(());
999 }
1000
1001 if let Some(token) = self.scan_tag_directive()? {
1003 self.tokens.push(token);
1004 return Ok(());
1005 }
1006
1007 if self.current_char == Some('%') {
1009 return Err(Error::scan(self.position, "Unknown directive".to_string()));
1010 }
1011 }
1012
// Step 2: document markers at the start of the line.
1013 if self.position.column == 1 {
1015 if let Some(token) = self.scan_document_start()? {
1017 self.tokens.push(token);
1018 return Ok(());
1019 }
1020
1021 if let Some(token) = self.scan_document_end()? {
1023 self.tokens.push(token);
1024 return Ok(());
1025 }
1026 }
1027
// Step 3: indentation (may emit BlockEnd tokens for dedents).
1028 if self.position.column == 1 {
1030 self.handle_indentation()?;
1031 }
1032
1033 self.skip_whitespace();
1035
// Step 4: lines that hold nothing but a comment or a line break.
1036 match self.current_char {
1037 None => return Ok(()),
1038 Some('#') => {
1039 if self.preserve_comments {
1040 let comment_token = self.scan_comment()?;
1042 self.tokens.push(comment_token);
1043 } else {
// Comments are not preserved: skip to the end of the line.
1044 while let Some(ch) = self.current_char {
1046 if ch == '\n' || ch == '\r' {
1047 break;
1048 }
1049 self.advance();
1050 }
1051 }
1052 return Ok(());
1053 }
1054 Some('\n' | '\r') => {
1055 self.advance();
1056 return Ok(());
1057 }
1058 _ => {}
1059 }
1060
// Step 5: dispatch on the current character until the line ends.
1061 while let Some(ch) = self.current_char {
1063 match ch {
1064 '\n' | '\r' => break,
1065 ' ' | '\t' => {
1066 self.skip_whitespace();
1067 }
// A trailing comment ends the line's content.
1068 '#' => {
1069 if self.preserve_comments {
1070 let comment_token = self.scan_comment()?;
1072 self.tokens.push(comment_token);
1073 } else {
1074 while let Some(ch) = self.current_char {
1076 if ch == '\n' || ch == '\r' {
1077 break;
1078 }
1079 self.advance();
1080 }
1081 }
1082 break;
1083 }
1084
// Flow collection delimiters. Opening one raises `flow_level` and checks
// the combined flow + block nesting depth against the limits; closing one
// lowers `flow_level` but never below zero.
1085 '[' => {
1087 let pos = self.position;
1088 self.advance();
1089 self.flow_level += 1;
1090 self.resource_tracker
1092 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1093 self.tokens
1094 .push(Token::new(TokenType::FlowSequenceStart, pos, self.position));
1095 }
1096 ']' => {
1097 let pos = self.position;
1098 self.advance();
1099 if self.flow_level > 0 {
1100 self.flow_level -= 1;
1101 }
1102 self.tokens
1103 .push(Token::new(TokenType::FlowSequenceEnd, pos, self.position));
1104 }
1105 '{' => {
1106 let pos = self.position;
1107 self.advance();
1108 self.flow_level += 1;
1109 self.resource_tracker
1111 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1112 self.tokens
1113 .push(Token::new(TokenType::FlowMappingStart, pos, self.position));
1114 }
1115 '}' => {
1116 let pos = self.position;
1117 self.advance();
1118 if self.flow_level > 0 {
1119 self.flow_level -= 1;
1120 }
1121 self.tokens
1122 .push(Token::new(TokenType::FlowMappingEnd, pos, self.position));
1123 }
1124 ',' => {
1125 let pos = self.position;
1126 self.advance();
1127 self.tokens
1128 .push(Token::new(TokenType::FlowEntry, pos, self.position));
1129 }
1130
// Any ':' that reaches this point becomes a Value token.
1131 ':' => {
1133 let pos = self.position;
1134 self.advance();
1135 self.tokens
1136 .push(Token::new(TokenType::Value, pos, self.position));
1137 }
1138
// Explicit key indicator: '?' followed by whitespace or end of input
// (in flow context also by one of ",:]}").
1139 '?' if self.flow_level == 0
1141 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1142 || self.peek_char(1).is_none()) =>
1143 {
1144 let pos = self.position;
1145 self.advance();
1146 self.tokens
1147 .push(Token::new(TokenType::Key, pos, self.position));
1148 }
1149 '?' if self.flow_level > 0
1150 && (self
1151 .peek_char(1)
1152 .map_or(true, |c| c.is_whitespace() || ",:]}".contains(c))
1153 || self.peek_char(1).is_none()) =>
1154 {
1155 let pos = self.position;
1156 self.advance();
1157 self.tokens
1158 .push(Token::new(TokenType::Key, pos, self.position));
1159 }
1160
// Block sequence entry: '-' followed by whitespace or end of input,
// outside flow context.
1161 '-' if self.flow_level == 0
1163 && (self.peek_char(1).map_or(true, |c| c.is_whitespace())
1164 || self.peek_char(1).is_none()) =>
1165 {
1166 let pos = self.position;
1167 self.advance();
1168
1169 let last_indent = *self.indent_stack.last().unwrap();
1171
// Deeper than the innermost open block: this entry opens a new sequence.
1172 if self.current_indent > last_indent {
1173 self.indent_stack.push(self.current_indent);
1175 self.resource_tracker
1177 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1178 self.tokens
1179 .push(Token::simple(TokenType::BlockSequenceStart, pos));
1180 } else if self.current_indent >= last_indent {
// Same level: open a sequence only if no BlockSequenceStart has been
// emitted since the last stream/document boundary token.
1181 let has_active_sequence = self
1184 .tokens
1185 .iter()
1186 .rev()
1187 .take_while(|t| {
1188 !matches!(
1189 t.token_type,
1190 TokenType::StreamStart
1191 | TokenType::DocumentStart
1192 | TokenType::DocumentEnd
1193 )
1194 })
1195 .any(|t| matches!(t.token_type, TokenType::BlockSequenceStart));
1196
1197 if !has_active_sequence {
1198 self.resource_tracker.check_depth(
1200 &self.limits,
1201 self.flow_level + self.indent_stack.len(),
1202 )?;
1203 self.tokens
1204 .push(Token::simple(TokenType::BlockSequenceStart, pos));
1205 }
1206 }
1207
1208 self.tokens
1209 .push(Token::new(TokenType::BlockEntry, pos, self.position));
1210
// `- - item`: another entry indicator directly after this one opens a
// nested sequence on the same line. It is tracked in
// `inline_sequence_depth` and closed again at the end of this function.
1211 self.skip_whitespace();
1213 if self.current_char == Some('-')
1214 && self.peek_char(1).map_or(true, |c| c.is_whitespace())
1215 {
1216 self.inline_sequence_depth += 1;
1219 self.indent_stack.push(self.position.column);
1221 self.resource_tracker
1223 .check_depth(&self.limits, self.flow_level + self.indent_stack.len())?;
1224 self.tokens
1225 .push(Token::simple(TokenType::BlockSequenceStart, self.position));
1226 }
1228 }
1229
1230 '"' => {
1232 let token = self.scan_quoted_string('"')?;
1233 self.tokens.push(token);
1234 }
1235 '\'' => {
1236 let token = self.scan_quoted_string('\'')?;
1237 self.tokens.push(token);
1238 }
1239
// '-' / '.' as the first character after the indentation and not a block
// entry: try a document marker, otherwise scan a plain scalar if the
// character is allowed to start one.
1240 '-' if self.position.column == self.current_indent + 1
1242 && !self.peek_char(1).map_or(true, |c| c.is_whitespace()) =>
1243 {
1244 if let Some(token) = self.scan_document_start()? {
1245 self.tokens.push(token);
1246 } else if self.is_plain_scalar_start() {
1247 let token = self.scan_plain_scalar()?;
1248 self.tokens.push(token);
1249 }
1250 }
1251 '.' if self.position.column == self.current_indent + 1 => {
1252 if let Some(token) = self.scan_document_end()? {
1253 self.tokens.push(token);
1254 } else if self.is_plain_scalar_start() {
1255 let token = self.scan_plain_scalar()?;
1256 self.tokens.push(token);
1257 }
1258 }
1259
// A digit, or '-' immediately followed by a digit, starts a number.
1260 _ if ch.is_ascii_digit()
1262 || (ch == '-' && self.peek_char(1).map_or(false, |c| c.is_ascii_digit())) =>
1263 {
1264 let token = self.scan_number()?;
1265 self.tokens.push(token);
1266 }
1267
1268 '&' => {
1270 let token = self.scan_anchor()?;
1271 self.tokens.push(token);
1272 }
1273 '*' => {
1274 let token = self.scan_alias()?;
1275 self.tokens.push(token);
1276 }
1277
1278 '|' => {
1280 let token = self.scan_literal_block_scalar()?;
1281 self.tokens.push(token);
1282 }
1283 '>' => {
1284 let token = self.scan_folded_block_scalar()?;
1285 self.tokens.push(token);
1286 }
1287
1288 '!' => {
1290 let token = self.scan_tag()?;
1291 self.tokens.push(token);
1292 }
1293
// Plain scalar. In block context a BlockMappingStart may be emitted first,
// when `check_for_mapping_ahead` (defined outside this view) reports a
// mapping and either the line is indented deeper than the innermost open
// block, or it is at the same level and `check_active_mapping_at_level`
// (also outside this view) reports no active mapping there.
1294 _ if self.is_plain_scalar_start() => {
1296 if self.flow_level == 0 {
1298 let should_start_mapping = self.check_for_mapping_ahead();
1299 if should_start_mapping {
1300 let last_indent = *self.indent_stack.last().unwrap();
1301
1302 let should_start_new_mapping = if self.current_indent > last_indent {
1307 true
1309 } else if self.current_indent == last_indent {
1310 let has_active_mapping_at_this_level =
1313 self.check_active_mapping_at_level(self.current_indent);
1314 !has_active_mapping_at_this_level
1315 } else {
1316 false
1318 };
1319
1320 if should_start_new_mapping {
1321 self.indent_stack.push(self.current_indent);
1323 self.resource_tracker.check_depth(
1325 &self.limits,
1326 self.flow_level + self.indent_stack.len(),
1327 )?;
1328 self.tokens.push(Token::simple(
1329 TokenType::BlockMappingStart,
1330 self.position,
1331 ));
1332 }
1333 }
1334 }
1335
1336 let token = self.scan_plain_scalar()?;
1337 self.tokens.push(token);
1338 }
1339
// Nothing can start with this character.
1340 _ => {
1341 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1342 .with_suggestion("Check for valid YAML syntax characters".to_string());
1343 return Err(Error::invalid_character_with_context(
1344 self.position,
1345 ch,
1346 "YAML document",
1347 context,
1348 ));
1349 }
1350 }
1351 }
1352
// Step 6: close every sequence that was opened inline on this line. The
// base entry of the indent stack is never popped.
1353 while self.inline_sequence_depth > 0 {
1355 self.inline_sequence_depth -= 1;
1356 if self.indent_stack.len() > 1 {
1358 self.indent_stack.pop();
1359 }
1360 self.tokens
1361 .push(Token::simple(TokenType::BlockEnd, self.position));
1362 }
1363
1364 Ok(())
1365 }
1366
1367 fn scan_next_token(&mut self) -> Result<()> {
1369 if self.done {
1370 return Ok(());
1371 }
1372
1373 if self.tokens.is_empty() {
1375 self.tokens
1376 .push(Token::simple(TokenType::StreamStart, self.position));
1377 return Ok(());
1378 }
1379
1380 if self.current_char.is_none() {
1382 if !self
1383 .tokens
1384 .iter()
1385 .any(|t| matches!(t.token_type, TokenType::StreamEnd))
1386 {
1387 self.tokens
1388 .push(Token::simple(TokenType::StreamEnd, self.position));
1389 }
1390 self.done = true;
1391 return Ok(());
1392 }
1393
1394 let tokens_before = self.tokens.len();
1398 self.scan_all_tokens()?;
1399
1400 if self.tokens.len() == tokens_before {
1402 self.done = true;
1403 }
1404
1405 Ok(())
1406 }
1407
1408 fn scan_all_tokens(&mut self) -> Result<()> {
1410 if !self
1412 .tokens
1413 .iter()
1414 .any(|t| matches!(t.token_type, TokenType::StreamStart))
1415 {
1416 self.tokens
1417 .push(Token::simple(TokenType::StreamStart, self.position));
1418 }
1419
1420 while self.current_char.is_some() {
1421 self.process_line()?;
1422
1423 while let Some(ch) = self.current_char {
1425 if ch == '\n' || ch == '\r' {
1426 self.advance();
1427 } else {
1428 break;
1429 }
1430 }
1431 }
1432
1433 while self.indent_stack.len() > 1 {
1435 self.indent_stack.pop();
1436 self.tokens
1437 .push(Token::simple(TokenType::BlockEnd, self.position));
1438 }
1439
1440 self.tokens
1441 .push(Token::simple(TokenType::StreamEnd, self.position));
1442 self.done = true;
1443 Ok(())
1444 }
1445
1446 fn peek_char(&self, offset: isize) -> Option<char> {
1448 if offset >= 0 {
1449 let target_index = self.current_char_index + offset as usize;
1450 if target_index < self.char_cache.len() {
1451 Some(self.char_cache[target_index])
1452 } else {
1453 None
1454 }
1455 } else {
1456 let offset_magnitude = (-offset) as usize;
1457 if self.current_char_index >= offset_magnitude {
1458 Some(self.char_cache[self.current_char_index - offset_magnitude])
1459 } else {
1460 None
1461 }
1462 }
1463 }
1464
1465 fn scan_anchor(&mut self) -> Result<Token> {
1467 let start_pos = self.position;
1468 self.advance(); let name = self.scan_identifier()?;
1471 if name.is_empty() {
1472 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
1473 "Provide a valid anchor name after &, e.g., &anchor_name".to_string(),
1474 );
1475 return Err(Error::scan_with_context(
1476 self.position,
1477 "Anchor name cannot be empty",
1478 context,
1479 ));
1480 }
1481
1482 self.resource_tracker.add_anchor(&self.limits)?;
1484
1485 Ok(Token::new(
1486 TokenType::Anchor(name),
1487 start_pos,
1488 self.position,
1489 ))
1490 }
1491
1492 fn scan_alias(&mut self) -> Result<Token> {
1494 let start_pos = self.position;
1495 self.advance(); let name = self.scan_identifier()?;
1498 if name.is_empty() {
1499 let context = ErrorContext::from_input(&self.input, &self.position, 2).with_suggestion(
1500 "Provide a valid alias name after *, e.g., *alias_name".to_string(),
1501 );
1502 return Err(Error::scan_with_context(
1503 self.position,
1504 "Alias name cannot be empty",
1505 context,
1506 ));
1507 }
1508
1509 Ok(Token::new(TokenType::Alias(name), start_pos, self.position))
1510 }
1511
1512 fn scan_identifier(&mut self) -> Result<String> {
1514 let mut identifier = String::new();
1515
1516 while let Some(ch) = self.current_char {
1517 if ch.is_alphanumeric() || ch == '_' || ch == '-' {
1518 identifier.push(ch);
1519 self.advance();
1520 } else {
1521 break;
1522 }
1523 }
1524
1525 Ok(identifier)
1526 }
1527
1528 fn scan_tag(&mut self) -> Result<Token> {
1530 let start_pos = self.position;
1531 self.advance(); let mut tag = String::from("!");
1534
1535 if self.current_char == Some('<') {
1537 tag.push('<');
1538 self.advance(); while let Some(ch) = self.current_char {
1542 if ch == '>' {
1543 tag.push(ch);
1544 self.advance();
1545 break;
1546 } else if ch.is_control() || ch.is_whitespace() {
1547 return Err(Error::scan(
1548 self.position,
1549 "Invalid character in verbatim tag".to_string(),
1550 ));
1551 }
1552 tag.push(ch);
1553 self.advance();
1554 }
1555 } else {
1556 if self.current_char == Some('!') {
1558 tag.push('!');
1559 self.advance(); }
1561
1562 while let Some(ch) = self.current_char {
1564 if ch.is_alphanumeric() || "-./_:".contains(ch) {
1565 tag.push(ch);
1566 self.advance();
1567 } else {
1568 break;
1569 }
1570 }
1571 }
1572
1573 Ok(Token::new(TokenType::Tag(tag), start_pos, self.position))
1574 }
1575
1576 fn scan_literal_block_scalar(&mut self) -> Result<Token> {
1578 let start_pos = self.position;
1579 self.advance(); let (keep_trailing, explicit_indent) = self.scan_block_scalar_header()?;
1583
1584 self.skip_to_next_line()?;
1586
1587 let base_indent = self.current_indent;
1589 let content_indent = if let Some(explicit) = explicit_indent {
1590 base_indent + explicit
1591 } else {
1592 self.find_block_scalar_indent(base_indent)?
1594 };
1595
1596 let content = self.collect_literal_block_content(content_indent, keep_trailing)?;
1598
1599 Ok(Token::new(
1600 TokenType::BlockScalarLiteral(content),
1601 start_pos,
1602 self.position,
1603 ))
1604 }
1605
1606 fn scan_folded_block_scalar(&mut self) -> Result<Token> {
1608 let start_pos = self.position;
1609 self.advance(); let (keep_trailing, explicit_indent) = self.scan_block_scalar_header()?;
1613
1614 self.skip_to_next_line()?;
1616
1617 let base_indent = self.current_indent;
1619 let content_indent = if let Some(explicit) = explicit_indent {
1620 base_indent + explicit
1621 } else {
1622 self.find_block_scalar_indent(base_indent)?
1624 };
1625
1626 let content = self.collect_folded_block_content(content_indent, keep_trailing)?;
1628
1629 Ok(Token::new(
1630 TokenType::BlockScalarFolded(content),
1631 start_pos,
1632 self.position,
1633 ))
1634 }
1635
1636 fn scan_block_scalar_header(&mut self) -> Result<(bool, Option<usize>)> {
1638 let mut keep_trailing = false;
1639 let mut explicit_indent: Option<usize> = None;
1640
1641 while let Some(ch) = self.current_char {
1643 match ch {
1644 '+' => {
1645 keep_trailing = true;
1646 self.advance();
1647 }
1648 '-' => {
1649 keep_trailing = false; self.advance();
1651 }
1652 '0'..='9' => {
1653 let digit = ch.to_digit(10).unwrap() as usize;
1654 if explicit_indent.is_some() {
1655 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1656 .with_suggestion(
1657 "Use only one indent indicator digit in block scalar".to_string(),
1658 );
1659 return Err(Error::scan_with_context(
1660 self.position,
1661 "Multiple indent indicators in block scalar",
1662 context,
1663 ));
1664 }
1665 explicit_indent = Some(digit);
1666 self.advance();
1667 }
1668 ' ' | '\t' => {
1669 self.advance(); }
1671 '#' => {
1672 while let Some(ch) = self.current_char {
1674 self.advance();
1675 if ch == '\n' || ch == '\r' {
1676 break;
1677 }
1678 }
1679 break;
1680 }
1681 '\n' | '\r' => break,
1682 _ => {
1683 let context = ErrorContext::from_input(&self.input, &self.position, 2)
1684 .with_suggestion("Use valid block scalar indicators: | (literal), > (folded), + (keep), - (strip), or digit (indent)".to_string());
1685 return Err(Error::invalid_character_with_context(
1686 self.position,
1687 ch,
1688 "block scalar header",
1689 context,
1690 ));
1691 }
1692 }
1693 }
1694
1695 Ok((keep_trailing, explicit_indent))
1696 }
1697
1698 fn skip_to_next_line(&mut self) -> Result<()> {
1700 while let Some(ch) = self.current_char {
1701 match ch {
1702 '\n' | '\r' => {
1703 self.advance();
1704 break;
1705 }
1706 ' ' | '\t' => {
1707 self.advance();
1708 }
1709 _ => break,
1710 }
1711 }
1712 Ok(())
1713 }
1714
1715 fn find_block_scalar_indent(&mut self, base_indent: usize) -> Result<usize> {
1717 let saved_position = self.position;
1718 let saved_char = self.current_char;
1719 let saved_char_index = self.current_char_index;
1720
1721 let mut content_indent = base_indent + 1; while let Some(ch) = self.current_char {
1725 self.advance();
1726 if ch == '\n' || ch == '\r' {
1727 let line_indent = self.count_line_indent();
1728
1729 if let Some(line_ch) = self.current_char {
1731 if line_ch != '\n' && line_ch != '\r' {
1732 if line_indent > base_indent {
1733 content_indent = line_indent;
1734 break;
1735 }
1736 content_indent = base_indent + 1;
1738 break;
1739 }
1740 }
1741 }
1742 }
1743
1744 self.position = saved_position;
1746 self.current_char = saved_char;
1747 self.current_char_index = saved_char_index;
1748
1749 Ok(content_indent)
1750 }
1751
1752 fn count_line_indent(&mut self) -> usize {
1754 let mut indent = 0;
1755 let saved_position = self.position;
1756 let saved_char = self.current_char;
1757 let saved_char_index = self.current_char_index;
1758
1759 while let Some(ch) = self.current_char {
1760 if ch == ' ' {
1761 indent += 1;
1762 self.advance();
1763 } else if ch == '\t' {
1764 indent += 8; self.advance();
1766 } else {
1767 break;
1768 }
1769 }
1770
1771 self.position = saved_position;
1773 self.current_char = saved_char;
1774 self.current_char_index = saved_char_index;
1775
1776 indent
1777 }
1778
1779 fn collect_literal_block_content(
1781 &mut self,
1782 content_indent: usize,
1783 _keep_trailing: bool,
1784 ) -> Result<String> {
1785 let mut content = String::new();
1786
1787 while let Some(_) = self.current_char {
1788 let line_indent = self.count_line_indent();
1789
1790 for _ in 0..content_indent.min(line_indent) {
1792 if let Some(' ' | '\t') = self.current_char {
1793 self.advance();
1794 }
1795 }
1796
1797 let mut line = String::new();
1799 while let Some(ch) = self.current_char {
1800 if ch == '\n' || ch == '\r' {
1801 self.advance();
1802 break;
1803 }
1804 line.push(ch);
1805 self.advance();
1806 }
1807
1808 if line_indent < content_indent && !line.trim().is_empty() {
1810 break;
1812 }
1813
1814 content.push_str(&line);
1816 if self.current_char.is_some() {
1817 content.push('\n');
1818 }
1819
1820 if self.current_char.is_none() {
1822 break;
1823 }
1824 }
1825
1826 Ok(content)
1827 }
1828
1829 fn collect_folded_block_content(
1831 &mut self,
1832 content_indent: usize,
1833 _keep_trailing: bool,
1834 ) -> Result<String> {
1835 let mut content = String::new();
1836 let mut prev_was_empty = false;
1837 let mut first_line = true;
1838
1839 while let Some(_) = self.current_char {
1840 let line_indent = self.count_line_indent();
1841
1842 for _ in 0..content_indent.min(line_indent) {
1844 if let Some(' ' | '\t') = self.current_char {
1845 self.advance();
1846 }
1847 }
1848
1849 let mut line = String::new();
1851 while let Some(ch) = self.current_char {
1852 if ch == '\n' || ch == '\r' {
1853 self.advance();
1854 break;
1855 }
1856 line.push(ch);
1857 self.advance();
1858 }
1859
1860 if line_indent < content_indent && !line.trim().is_empty() {
1862 break;
1863 }
1864
1865 let line_is_empty = line.trim().is_empty();
1866
1867 if line_is_empty {
1868 if !first_line && !prev_was_empty {
1870 content.push('\n');
1871 }
1872 prev_was_empty = true;
1873 } else {
1874 if !first_line && !prev_was_empty {
1876 content.push(' '); }
1878 content.push_str(line.trim());
1879 prev_was_empty = false;
1880 }
1881
1882 first_line = false;
1883
1884 if self.current_char.is_none() {
1885 break;
1886 }
1887 }
1888
1889 Ok(content)
1890 }
1891
1892 fn check_for_mapping_ahead(&self) -> bool {
1894 for i in self.current_char_index..self.char_cache.len() {
1896 let ch = self.char_cache[i];
1897 match ch {
1898 ':' => {
1899 let next_char = self.char_cache.get(i + 1).copied();
1901 return next_char.map_or(true, |c| c.is_whitespace());
1902 }
1903 '\n' | '\r' => break, _ => {}
1905 }
1906 }
1907 false
1908 }
1909
1910 fn check_active_mapping_at_level(&self, _target_indent: usize) -> bool {
1913 let mut mapping_depth = 0;
1914 let _current_mapping_indent: Option<usize> = None;
1915
1916 for token in self.tokens.iter().rev() {
1918 match &token.token_type {
1919 TokenType::BlockMappingStart => {
1920 if mapping_depth == 0 {
1921 return true; }
1926 mapping_depth -= 1;
1927 }
1928 TokenType::BlockEnd => {
1929 mapping_depth += 1;
1930 }
1931 TokenType::StreamStart | TokenType::DocumentStart | TokenType::DocumentEnd => {
1932 break;
1934 }
1935 _ => {}
1936 }
1937 }
1938
1939 false
1940 }
1941}
1942
1943impl Scanner for BasicScanner {
1944 fn check_token(&self) -> bool {
1945 self.token_index < self.tokens.len() || !self.done
1947 }
1948
1949 fn peek_token(&self) -> Result<Option<&Token>> {
1950 Ok(self.tokens.get(self.token_index))
1953 }
1954
1955 fn get_token(&mut self) -> Result<Option<Token>> {
1956 if self.token_index >= self.tokens.len() && !self.done {
1958 self.scan_next_token()?;
1959 }
1960
1961 if self.token_index < self.tokens.len() {
1962 let token = self.tokens[self.token_index].clone();
1963 self.token_index += 1;
1964 Ok(Some(token))
1965 } else {
1966 Ok(None)
1967 }
1968 }
1969
1970 fn reset(&mut self) {
1971 self.token_index = 0;
1972 self.position = Position::start();
1973 self.tokens.clear();
1974 self.done = false;
1975 self.current_char = self.input.chars().next();
1976 self.indent_stack = vec![0];
1977 self.current_indent = 0;
1978 self.flow_level = 0;
1979 self.detected_indent_style = None;
1980 self.indent_samples.clear();
1981 self.previous_indent_level = 0;
1982 self.current_char_index = 0;
1983 self.current_char = self.char_cache.first().copied();
1984 }
1985
1986 fn position(&self) -> Position {
1987 self.position
1988 }
1989
1990 fn input(&self) -> &str {
1991 &self.input
1992 }
1993}
1994
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `input`, skips the leading `StreamStart` token and returns the
    /// value of the scalar token that follows it.
    fn scan_single_scalar(input: &str) -> String {
        let mut scanner = BasicScanner::new(input.to_string());
        scanner.get_token().unwrap(); // StreamStart

        match scanner.get_token() {
            Ok(Some(token)) => match token.token_type {
                TokenType::Scalar(value, _) => value,
                _ => panic!("Expected scalar token for input: {}", input),
            },
            _ => panic!("Failed to get token for input: {}", input),
        }
    }

    /// A bare scalar is wrapped in `StreamStart` / `StreamEnd`.
    #[test]
    fn test_basic_tokenization() {
        let mut scanner = BasicScanner::new("42".to_string());

        assert!(scanner.check_token());

        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamStart));

        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "42");
        } else {
            panic!("Expected scalar token");
        }

        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::StreamEnd));
    }

    /// A flow sequence starts with `FlowSequenceStart`, then its first
    /// entry, then a `FlowEntry` separator.
    #[test]
    fn test_flow_sequence() {
        let mut scanner = BasicScanner::new("[1, 2, 3]".to_string());

        scanner.get_token().unwrap(); // StreamStart

        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowSequenceStart));

        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "1");
        } else {
            // Without this branch a non-scalar token would pass silently.
            panic!("Expected scalar token");
        }

        let token = scanner.get_token().unwrap().unwrap();
        assert!(matches!(token.token_type, TokenType::FlowEntry));
    }

    /// The quotes of a double-quoted scalar are not part of its value.
    #[test]
    fn test_quoted_strings() {
        let mut scanner = BasicScanner::new(r#""hello world""#.to_string());

        scanner.get_token().unwrap(); // StreamStart

        let token = scanner.get_token().unwrap().unwrap();
        if let TokenType::Scalar(value, _) = token.token_type {
            assert_eq!(value, "hello world");
        } else {
            panic!("Expected scalar token");
        }
    }

    /// Comments produce neither scalar content nor `Comment` tokens with
    /// the default scanner.
    #[test]
    fn test_comment_handling() {
        let input = r"
# Full line comment
key: value # End of line comment
# Another comment
data: test
";
        let mut scanner = BasicScanner::new(input.to_string());

        let mut tokens = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            tokens.push(token);
        }

        let scalar_values: Vec<String> = tokens
            .iter()
            .filter_map(|t| match &t.token_type {
                TokenType::Scalar(s, _) => Some(s.clone()),
                _ => None,
            })
            .collect();

        assert_eq!(scalar_values, vec!["key", "value", "data", "test"]);

        assert!(!tokens
            .iter()
            .any(|t| matches!(t.token_type, TokenType::Comment(_))));
    }

    /// A `#` inside a quoted scalar is content, not the start of a comment.
    #[test]
    fn test_hash_in_strings() {
        let input = r#"
string1: "This has a # character"
string2: 'Also has # character'
normal: value # This is a comment
"#;
        let mut scanner = BasicScanner::new(input.to_string());

        let mut scalar_values = Vec::new();
        while let Ok(Some(token)) = scanner.get_token() {
            if let TokenType::Scalar(value, _) = token.token_type {
                scalar_values.push(value);
            }
        }

        assert!(scalar_values.contains(&"This has a # character".to_string()));
        assert!(scalar_values.contains(&"Also has # character".to_string()));
        assert!(scalar_values.contains(&"value".to_string()));
        assert!(!scalar_values
            .iter()
            .any(|s| s.contains("This is a comment")));
    }

    /// Common escape sequences are decoded inside quoted scalars.
    #[test]
    fn test_escape_sequences() {
        let test_cases = [
            (r#""Line 1\nLine 2""#, "Line 1\nLine 2"),
            (r#""Col1\tCol2""#, "Col1\tCol2"),
            (r#""First\rSecond""#, "First\rSecond"),
            (r#""Path\\to\\file""#, "Path\\to\\file"),
            (r#""He said \"Hello\"""#, "He said \"Hello\""),
            (r"'Don\'t do that'", "Don't do that"),
        ];

        for (input, expected) in test_cases {
            assert_eq!(
                scan_single_scalar(input),
                expected,
                "Failed for input: {}",
                input
            );
        }
    }

    /// The less common YAML escapes are decoded inside double quotes.
    #[test]
    fn test_extended_yaml_escapes() {
        let test_cases = [
            (r#""\0""#, "\0"),
            (r#""\a""#, "\x07"),
            (r#""\b""#, "\x08"),
            (r#""\f""#, "\x0C"),
            (r#""\v""#, "\x0B"),
            (r#""\e""#, "\x1B"),
            (r#""\ ""#, " "),
            // The closing quote was missing here, which made this case
            // exercise an unterminated scalar instead of the `\/` escape.
            (r#""\/""#, "/"),
        ];

        for (input, expected) in test_cases {
            assert_eq!(
                scan_single_scalar(input),
                expected,
                "Failed for input: {}",
                input
            );
        }
    }

    /// Unknown escape sequences are kept verbatim, backslash included.
    #[test]
    fn test_unknown_escape_sequences() {
        let input = r#""\z\q\8""#;
        let expected = "\\z\\q\\8";

        assert_eq!(scan_single_scalar(input), expected);
    }
}