yyaml/parser/loader.rs
1// Parser removed - using StateMachine directly
2use crate::error::{Marker, ScanError};
3
4use crate::events::{Event, EventReceiver, TScalarStyle, TokenType};
5use crate::linked_hash_map::LinkedHashMap;
6use crate::semantic::tags::schema::SchemaProcessor;
7use crate::semantic::tags::types::SchemaType;
8use crate::yaml::Yaml;
9use log::{debug, trace, warn};
10use std::char::decode_utf16;
11use std::collections::HashMap;
12
/// Character encodings recognised for YAML byte streams.
///
/// The value is produced by BOM sniffing in `YamlLoader::detect_bom` and
/// selects the decoder used by `YamlLoader::load_from_bytes`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Encoding {
    /// UTF-8 (also the fallback when no byte-order mark is present).
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
}
20
/// Byte order used when combining byte pairs into UTF-16 code units.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Endian {
    /// Least-significant byte first.
    Little,
    /// Most-significant byte first.
    Big,
}
27
/// Entry point of the loading API: turns YAML text or bytes into a `Vec<Yaml>`.
///
/// The type carries no state; every operation is an associated function.
#[derive(Debug, Clone, Copy)]
pub struct YamlLoader;
30
31impl YamlLoader {
32 /// Load YAML from string using Failsafe schema by default (all scalars as strings)
33 pub fn load_from_str(s: &str) -> Result<Vec<Yaml>, ScanError> {
34 Self::load_from_str_with_schema(s, SchemaType::Failsafe)
35 }
36
37 /// Load YAML from string with explicit schema selection
38 ///
39 /// # Schemas
40 /// - `Core`: Full YAML 1.2 with all standard types (null, bool, int, float, str, binary, timestamp, etc.)
41 /// - `Failsafe`: Minimal types only (all scalars treated as strings)
42 /// - `Json`: JSON-compatible subset
43 /// - `Custom`: User-defined types
44 pub fn load_from_str_with_schema(s: &str, schema: SchemaType) -> Result<Vec<Yaml>, ScanError> {
45 debug!(
46 "=== YamlLoader::load_from_str_with_schema ENTRY with: '{}', schema: {:?} ===",
47 s, schema
48 );
49
50 let mut schema_processor = SchemaProcessor::<'static>::new();
51 schema_processor.set_schema(schema);
52
53 // Fast path for simple cases - zero allocation, blazing fast
54 debug!("YamlLoader: trying fast parse");
55 match Self::try_fast_parse(s, &mut schema_processor) {
56 Ok(Some(result)) => {
57 debug!("Fast parser succeeded with: {result:?}");
58 return Ok(vec![result]);
59 }
60 Ok(None) => {
61 debug!("Fast parser detected complex syntax, falling back to full parser");
62 debug!("YamlLoader: fast parser returned None, falling back to StateMachine");
63 } // Fall through to full parser
64 Err(error) => {
65 debug!("Fast parser failed: {error:?}");
66 return Err(error);
67 } // Propagate parsing errors
68 }
69
70 // Handle multi-document streams
71 let mut documents = Vec::new();
72 debug!(
73 "YamlLoader: creating StateMachine with schema: {:?}",
74 schema
75 );
76 let mut state_machine = crate::parser::state_machine::StateMachine::new_with_processor(
77 s.chars(),
78 schema,
79 schema_processor,
80 );
81 debug!("YamlLoader: StateMachine created, starting document parsing loop");
82
83 // Process all documents in stream
84 while !state_machine.at_stream_end() {
85 debug!("YamlLoader: parsing next document...");
86 match state_machine.parse_next_document() {
87 Ok(Some(doc)) => {
88 debug!("Parsed document: {doc:?}");
89 documents.push(doc);
90 }
91 Ok(None) => break, // End of stream
92 Err(e) => {
93 debug!("State machine failed: {e:?}");
94 return Err(e);
95 }
96 }
97 }
98
99 // Handle empty streams (return empty vec, not error)
100 if documents.is_empty() {
101 debug!("No documents found in stream");
102 documents.push(Yaml::Null);
103 }
104
105 Ok(documents)
106 }
107
108 pub fn load_from_bytes(input: Vec<u8>) -> Result<Vec<Yaml>, ScanError> {
109 if input.is_empty() {
110 return Ok(vec![Yaml::Null]);
111 }
112
113 let mut bytes = input.as_slice();
114 let encoding = Self::detect_bom(&mut bytes)?;
115
116 let decoded = match encoding {
117 Encoding::Utf8 => {
118 // Already checked BOM, decode remaining
119 std::str::from_utf8(bytes)
120 .map_err(|e| ScanError::EncodingError(format!("Invalid UTF-8: {}", e)))?
121 .to_string()
122 }
123 Encoding::Utf16Le => Self::decode_utf16_bytes(bytes, Endian::Little)?,
124 Encoding::Utf16Be => Self::decode_utf16_bytes(bytes, Endian::Big)?,
125 };
126
127 // Now use existing parser
128 Self::load_from_str(&decoded)
129 }
130
131 fn detect_bom(bytes: &mut &[u8]) -> Result<Encoding, ScanError> {
132 if bytes.len() < 2 {
133 return Ok(Encoding::Utf8);
134 }
135
136 match bytes.get(0..3) {
137 Some(&[239, 187, 191]) => {
138 *bytes = &bytes[3..];
139 Ok(Encoding::Utf8)
140 }
141 _ if bytes[0] == 255 && bytes[1] == 254 => {
142 *bytes = &bytes[2..];
143 Ok(Encoding::Utf16Le)
144 }
145 _ if bytes[0] == 254 && bytes[1] == 255 => {
146 *bytes = &bytes[2..];
147 Ok(Encoding::Utf16Be)
148 }
149 _ => Ok(Encoding::Utf8), // Fallback
150 }
151 }
152
153 fn decode_utf16_bytes(bytes: &[u8], endian: Endian) -> Result<String, ScanError> {
154 if !bytes.len().is_multiple_of(2) {
155 return Err(ScanError::EncodingError(
156 "Invalid UTF-16: odd byte length".to_string(),
157 ));
158 }
159 let u16_iter = bytes.chunks_exact(2).map(|chunk| match endian {
160 Endian::Little => u16::from_le_bytes([chunk[0], chunk[1]]),
161 Endian::Big => u16::from_be_bytes([chunk[0], chunk[1]]),
162 });
163 decode_utf16(u16_iter)
164 .collect::<Result<String, _>>()
165 .map_err(|e| ScanError::EncodingError(format!("Invalid UTF-16: {}", e)))
166 }
167
168 /// Blazing-fast zero-allocation parser for common simple cases with production-grade error handling
169 /// Handles: "key: value", "- item", "[1, 2, 3]", "{key: value}", multi-line mappings, and simple scalars
170 fn try_fast_parse(
171 s: &str,
172 processor: &mut SchemaProcessor<'static>,
173 ) -> Result<Option<Yaml>, ScanError> {
174 debug!(
175 "try_fast_parse called with: '{}' (schema: {:?})",
176 s,
177 processor.current_schema()
178 );
179 let mut trimmed = s.trim();
180 debug!("try_fast_parse: trimmed = '{}'", trimmed);
181
182 // Strip BOM if present for accurate parsing decisions per YAML 1.2
183 if trimmed.starts_with('\u{feff}') {
184 trimmed = &trimmed[3..]; // BOM is 3 bytes in UTF-8
185 }
186
187 // Empty document
188 if trimmed.is_empty() {
189 return Ok(Some(Yaml::Null));
190 }
191
192 // CRITICAL FIX: If content starts with "- ", it's a sequence - ALWAYS use full parser
193 // The fast parser incorrectly handles complex sequences, so force full parser
194 if trimmed.starts_with("- ") {
195 return Ok(None);
196 }
197
198 // Simple scalar cases (no structure indicators)
199 if !trimmed.contains(':')
200 && !trimmed.contains('-')
201 && !trimmed.contains('[')
202 && !trimmed.contains('{')
203 && !trimmed.contains('|')
204 && !trimmed.contains('>')
205 {
206 return Self::resolve_plain_scalar(processor, trimmed).map(Some);
207 }
208
209 // YAML 1.2 Complete Feature Detection - Zero allocation, optimal performance
210 // Comprehensive spec compliance check using iterator chains for maximum efficiency
211
212 // Chapter 6.8: All directive detection (YAML, TAG, reserved)
213 let has_directives = trimmed.lines().any(|line| {
214 let trimmed_line = line.trim_start();
215 trimmed_line.starts_with("%YAML ")
216 || trimmed_line.starts_with("%TAG ")
217 || (trimmed_line.starts_with('%')
218 && trimmed_line
219 .chars()
220 .nth(1)
221 .is_some_and(|c| c.is_ascii_uppercase()))
222 });
223 if has_directives {
224 return Ok(None);
225 }
226
227 // Chapter 9.2: Multi-document stream detection - optimized counting
228 let mut doc_markers = 0u8;
229 let mut line_start = true;
230 for (i, &byte) in trimmed.as_bytes().iter().enumerate() {
231 match byte {
232 b'\n' => line_start = true,
233 b'-' if line_start => {
234 if trimmed.as_bytes().get(i + 1) == Some(&b'-')
235 && trimmed.as_bytes().get(i + 2) == Some(&b'-')
236 && trimmed
237 .as_bytes()
238 .get(i + 3)
239 .is_none_or(|&b| b == b' ' || b == b'\t' || b == b'\n')
240 {
241 doc_markers += 1;
242 if doc_markers > 1 {
243 return Ok(None);
244 }
245 }
246 line_start = false;
247 }
248 b'.' if line_start => {
249 if trimmed.as_bytes().get(i + 1) == Some(&b'.')
250 && trimmed.as_bytes().get(i + 2) == Some(&b'.')
251 && trimmed
252 .as_bytes()
253 .get(i + 3)
254 .is_none_or(|&b| b == b' ' || b == b'\t' || b == b'\n')
255 {
256 return Ok(None); // Any document end marker requires full parser
257 }
258 line_start = false;
259 }
260 b' ' | b'\t' => {}
261 _ => line_start = false,
262 }
263 }
264
265 // Chapter 6.9: Node properties in mapping contexts - comprehensive detection
266 if trimmed.contains(':') {
267 let has_node_properties = trimmed.lines().any(|line| {
268 let trimmed_line = line.trim();
269 // Tag detection: ! not at start of line or after whitespace indicating tagged values
270 if let Some(exclaim_pos) = trimmed_line.find('!') {
271 // Not a comment (!= case) and not negation (!something without space)
272 let is_tag = exclaim_pos == 0
273 || trimmed_line
274 .chars()
275 .nth(exclaim_pos.saturating_sub(1))
276 .is_some_and(|c| c.is_whitespace())
277 || trimmed_line[exclaim_pos..].starts_with("!!")
278 || trimmed_line[exclaim_pos..]
279 .chars()
280 .nth(1)
281 .is_some_and(|c| c.is_ascii_lowercase() || c == '<');
282 if is_tag {
283 return true;
284 }
285 }
286 // Anchor detection: & followed by valid anchor characters
287 if let Some(amp_pos) = trimmed_line.find('&') {
288 let is_anchor = trimmed_line[amp_pos + 1..]
289 .chars()
290 .next()
291 .is_some_and(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-');
292 if is_anchor {
293 return true;
294 }
295 }
296 // Alias detection: * followed by valid anchor characters
297 if let Some(star_pos) = trimmed_line.find('*') {
298 let is_alias = trimmed_line[star_pos + 1..]
299 .chars()
300 .next()
301 .is_some_and(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-');
302 if is_alias {
303 return true;
304 }
305 }
306 false
307 });
308 if has_node_properties {
309 return Ok(None);
310 }
311 }
312
313 // Chapter 8.2: Complex block mapping structures that exceed fast parser capabilities
314 if trimmed.contains(':') && trimmed.lines().count() > 1 {
315 // Detect explicit mapping indicators (?) requiring full parser
316 if trimmed.lines().any(|line| {
317 let trimmed_line = line.trim_start();
318 trimmed_line.starts_with("? ") || trimmed_line == "?"
319 }) {
320 return Ok(None);
321 }
322
323 // Detect flow collections embedded in block mappings
324 if trimmed.chars().any(|c| matches!(c, '[' | ']' | '{' | '}')) {
325 return Ok(None);
326 }
327
328 // Detect complex indentation patterns that require full parser
329 let mut prev_indent = None;
330 for line in trimmed.lines() {
331 if !line.trim().is_empty() && line.contains(':') {
332 let indent = line.len() - line.trim_start().len();
333 if let Some(prev) = prev_indent
334 && indent != prev
335 && indent != 0
336 {
337 return Ok(None); // Variable indentation requires full parser
338 }
339 prev_indent = Some(indent);
340 }
341 }
342 }
343
344 // Block sequence: handle lists with "- item" syntax (CHECK FIRST!)
345 // If it starts with "- ", it's likely a sequence - don't let block mapping claim it
346 if trimmed.starts_with("- ") {
347 // Try parsing as block sequence - let try_parse_block_sequence handle complexity
348 if Self::is_valid_block_sequence(trimmed) {
349 return Self::try_parse_block_sequence(trimmed, processor);
350 } else {
351 // Invalid structure - fall back to full parser instead of erroring
352 return Ok(None);
353 }
354 }
355
356 // Multi-line mapping: handle simple block mappings (ONLY if not a sequence)
357 // CRITICAL: Don't claim sequences that start with "- " as mappings!
358 if trimmed.contains(':') && trimmed.lines().count() > 1 && !trimmed.starts_with("- ") {
359 if let Some(result) = Self::try_parse_block_mapping(trimmed, processor) {
360 return Ok(Some(result));
361 } else {
362 // Complex mapping detected (anchors/aliases), fall back to full parser
363 return Ok(None);
364 }
365 }
366
367 // Single-line mapping: "key: value"
368 if trimmed.contains(':')
369 && trimmed.lines().count() == 1
370 && let Some(colon_pos) = trimmed.find(':')
371 {
372 let key_part = trimmed[..colon_pos].trim();
373 let value_part = trimmed[colon_pos + 1..].trim();
374
375 if !key_part.is_empty()
376 && !key_part.contains('[')
377 && !key_part.contains('{')
378 && !value_part.contains('[')
379 && !value_part.contains('{')
380 && !value_part.contains(':')
381 && !key_part.contains('&')
382 && !key_part.contains('*')
383 && !value_part.contains('&')
384 && !value_part.contains('*')
385 {
386 let mut hash = crate::linked_hash_map::LinkedHashMap::new();
387 let key = Yaml::String(key_part.to_string());
388 let value = if value_part.is_empty() {
389 Yaml::Null
390 } else {
391 Self::resolve_plain_scalar(processor, value_part)?
392 };
393 hash.insert(key, value);
394 return Ok(Some(Yaml::Hash(hash)));
395 }
396 }
397
398 // Simple array case: "[1, 2, 3]"
399 if trimmed.starts_with('[') && trimmed.ends_with(']') && trimmed.lines().count() == 1 {
400 return Self::try_parse_flow_sequence(trimmed, processor);
401 }
402
403 Ok(None)
404 }
405
/// Checks whether `s` has the line structure of a single block sequence.
///
/// Blank lines and lines starting with `#` are ignored. Of the remaining
/// lines:
/// - the first must start with `"- "`;
/// - every `"- "` line must use the same indentation as the first one;
/// - every other line must be indented at least two columns deeper than the
///   `"- "` markers.
///
/// Returns `false` when no `"- "` line is present at all.
#[inline]
fn is_valid_block_sequence(s: &str) -> bool {
    // Indentation of the sequence markers; `None` until the first item is seen.
    let mut marker_indent: Option<usize> = None;

    for line in s.lines() {
        let content = line.trim();
        if content.is_empty() || content.starts_with('#') {
            continue;
        }

        let indent = line.len() - line.trim_start().len();
        match (content.starts_with("- "), marker_indent) {
            // First item fixes the marker column.
            (true, None) => marker_indent = Some(indent),
            // Later items must line up with it.
            (true, Some(expected)) => {
                if expected != indent {
                    return false;
                }
            }
            // Item content must sit past the "- " marker.
            (false, Some(expected)) => {
                if indent < expected + 2 {
                    return false;
                }
            }
            // Content before any item: not a sequence.
            (false, None) => return false,
        }
    }

    marker_indent.is_some()
}
457
458 /// Parse simple block mapping format: key: value on separate lines
459 fn try_parse_block_mapping(s: &str, processor: &mut SchemaProcessor<'static>) -> Option<Yaml> {
460 let mut map = crate::linked_hash_map::LinkedHashMap::new();
461
462 // First pass: check for nested indented content - if found, fall back to full parser
463 let lines: Vec<&str> = s.lines().collect();
464 for (i, line) in lines.iter().enumerate() {
465 let line = line.trim();
466 if line.is_empty() || line.starts_with('#') {
467 continue;
468 }
469
470 // If this line has a colon with empty value, check if next non-empty line is indented
471 if let Some(colon_pos) = line.find(':') {
472 let value_part = line[colon_pos + 1..].trim();
473 if value_part.is_empty() && i + 1 < lines.len() {
474 // Check if next non-empty line is indented (nested content)
475 for next_line in &lines[i + 1..] {
476 if next_line.trim().is_empty() || next_line.trim().starts_with('#') {
477 continue;
478 }
479 let next_indent = next_line.len() - next_line.trim_start().len();
480 let current_indent = lines[i].len() - lines[i].trim_start().len();
481 if next_indent > current_indent {
482 return None;
483 }
484 break;
485 }
486 }
487 }
488 }
489
490 for line in s.lines() {
491 let trimmed = line.trim();
492 if trimmed.is_empty() || trimmed.starts_with('#') {
493 continue;
494 }
495 if let Some(colon_pos) = trimmed.find(':') {
496 let key = trimmed[..colon_pos].trim();
497 let value = trimmed[colon_pos + 1..].trim();
498 if key.is_empty() {
499 return None;
500 }
501 let yaml_value = if value.is_empty() {
502 Yaml::Null
503 } else {
504 Self::resolve_plain_scalar(processor, value).ok()?
505 };
506 map.insert(Yaml::String(key.to_string()), yaml_value);
507 } else {
508 return None;
509 }
510 }
511
512 Some(Yaml::Hash(map))
513 }
514
515 fn try_parse_block_sequence(
516 s: &str,
517 processor: &mut SchemaProcessor<'static>,
518 ) -> Result<Option<Yaml>, ScanError> {
519 let mut items = Vec::new();
520 let mut lines_iter = s.lines().enumerate();
521
522 // Pre-allocate with estimated capacity for better performance
523 if s.len() > 100 {
524 items.reserve(s.len() / 50); // Rough estimate: 50 chars per item
525 }
526
527 while let Some((line_num, line)) = lines_iter.next() {
528 let trimmed = line.trim();
529
530 // Skip empty lines and comments - zero allocation fast path
531 if trimmed.is_empty() || trimmed.starts_with('#') {
532 continue;
533 }
534
535 if let Some(first_line_content) = trimmed.strip_prefix("- ") {
536 // Found a sequence item - collect all lines that belong to this item
537 let base_indent = line.len() - line.trim_start().len();
538 let item_content_indent = base_indent + 2; // Content after "- " should be more indented
539
540 // Zero-allocation parsing: work with string slices directly
541 // Remove "- " prefix
542 let first_content_trimmed = first_line_content.trim();
543
544 // Determine item boundaries without collecting into Vec
545 let _item_start_pos = if first_content_trimmed.is_empty() {
546 None
547 } else {
548 Some((first_content_trimmed, line_num, base_indent))
549 };
550
551 let mut item_end_line = line_num;
552 let mut has_multiline_content = false;
553
554 // Peek ahead to find item boundaries - zero allocation approach
555 let mut line_offset = 1;
556 let mut next_item_start = None;
557
558 for next_line in s.lines().skip(line_num + 1) {
559 let actual_line_num = line_num + line_offset;
560 let next_trimmed = next_line.trim();
561 let next_indent = next_line.len() - next_line.trim_start().len();
562
563 // Skip empty lines and comments
564 if next_trimmed.is_empty() || next_trimmed.starts_with('#') {
565 line_offset += 1;
566 continue;
567 }
568
569 // If this line starts a new sequence item, stop collecting
570 if next_trimmed.starts_with("- ") && next_indent == base_indent {
571 next_item_start = Some(actual_line_num);
572 break;
573 }
574
575 // If this line is at or less indented than expected content, stop collecting
576 if next_indent < item_content_indent {
577 break;
578 }
579
580 // This line belongs to the current sequence item
581 item_end_line = actual_line_num;
582 has_multiline_content = true;
583 line_offset += 1;
584 }
585
586 // Parse item content with zero-allocation approach
587 let item = if !has_multiline_content {
588 // Single line item - parse directly without allocation
589 if first_content_trimmed.is_empty() {
590 Ok(Yaml::Null)
591 } else {
592 Self::parse_item_content(first_content_trimmed, processor)
593 }
594 } else {
595 // Multi-line item - extract slice and parse
596 let item_lines: Vec<&str> = s
597 .lines()
598 .skip(line_num)
599 .take(item_end_line - line_num + 1)
600 .collect();
601
602 let mut content_parts = Vec::new();
603
604 // Add first line content if not empty
605 if !first_content_trimmed.is_empty() {
606 content_parts.push(first_content_trimmed);
607 }
608
609 // Add subsequent lines with normalized indentation
610 for item_line in item_lines.iter().skip(1) {
611 let item_trimmed = item_line.trim();
612 if item_trimmed.is_empty() || item_trimmed.starts_with('#') {
613 continue;
614 }
615
616 let item_indent = item_line.len() - item_line.trim_start().len();
617 let normalized_line = if item_indent >= item_content_indent {
618 &item_line[item_content_indent.min(item_line.len())..]
619 } else {
620 item_line
621 };
622 content_parts.push(normalized_line);
623 }
624
625 if content_parts.is_empty() {
626 Ok(Yaml::Null)
627 } else if content_parts.len() == 1 {
628 Self::parse_item_content(content_parts[0], processor)
629 } else {
630 // Only allocate string when absolutely necessary
631 let joined_content = content_parts.join("\n");
632 Self::parse_item_content(&joined_content, processor)
633 }
634 };
635
636 // Handle parsing errors
637 let parsed_item = item?;
638
639 items.push(parsed_item);
640
641 // Skip lines we've already processed
642 if let Some(next_start) = next_item_start {
643 // Fast-forward iterator to next item
644 for (current_line_num, _) in lines_iter.by_ref() {
645 if current_line_num + 1 >= next_start {
646 break;
647 }
648 }
649 } else {
650 // Skip to end of current item
651 for _ in line_num..item_end_line {
652 lines_iter.next();
653 }
654 }
655 } else {
656 // Unexpected line that doesn't start with "- " at the expected level
657 return Err(ScanError::new(
658 Marker {
659 index: 0,
660 line: line_num + 1,
661 col: 0,
662 },
663 &format!(
664 "invalid block sequence: expected '- ' at line {}, found '{}'",
665 line_num + 1,
666 trimmed
667 ),
668 ));
669 }
670 }
671
672 if items.is_empty() {
673 Ok(None)
674 } else {
675 Ok(Some(Yaml::Array(items)))
676 }
677 }
678
679 /// Parse content within a sequence item - handles scalars, mappings, and nested sequences
680 /// Returns errors for malformed nested content
681 #[inline]
682 fn parse_item_content(
683 content: &str,
684 processor: &mut SchemaProcessor<'static>,
685 ) -> Result<Yaml, ScanError> {
686 let trimmed = content.trim();
687 if trimmed.is_empty() {
688 return Ok(Yaml::Null);
689 }
690
691 // Use direct scalar parsing to avoid infinite recursion
692 // (parse_item_content is called from try_fast_parse, so we can't call try_fast_parse again)
693
694 // For complex content, use scalar parsing as fallback
695 // This maintains compatibility while allowing nested structures
696 Self::resolve_plain_scalar(processor, trimmed)
697 }
698
699 fn try_parse_flow_sequence(
700 s: &str,
701 processor: &mut SchemaProcessor<'static>,
702 ) -> Result<Option<Yaml>, ScanError> {
703 // Strip brackets
704 let inner = &s[1..s.len() - 1].trim();
705 if inner.is_empty() {
706 return Ok(Some(Yaml::Array(Vec::new())));
707 }
708
709 let mut items = Vec::new();
710 for item in inner.split(',') {
711 items.push(Self::resolve_plain_scalar(processor, item.trim())?);
712 }
713 Ok(Some(Yaml::Array(items)))
714 }
715
716 /// Direct scalar parsing with schema-aware type inference
717 /// Uses SchemaProcessor for proper YAML 1.2 schema compliance
718 fn resolve_plain_scalar(
719 processor: &mut SchemaProcessor<'static>,
720 raw: &str,
721 ) -> Result<Yaml, ScanError> {
722 let trimmed = raw.trim();
723 let marker = Marker {
724 index: 0,
725 line: 1,
726 col: 1,
727 };
728
729 Self::convert_plain_scalar(processor, trimmed, marker)
730 }
731
732 fn convert_plain_scalar(
733 processor: &mut SchemaProcessor<'static>,
734 trimmed: &str,
735 marker: Marker,
736 ) -> Result<Yaml, ScanError> {
737 use crate::semantic::tags::types::YamlType;
738
739 if trimmed.len() >= 2
740 && ((trimmed.starts_with('"') && trimmed.ends_with('"'))
741 || (trimmed.starts_with('\'') && trimmed.ends_with('\'')))
742 {
743 return Ok(Yaml::String(trimmed[1..trimmed.len() - 1].to_string()));
744 }
745
746 match processor.infer_scalar_type(trimmed) {
747 YamlType::Null => Ok(Yaml::Null),
748 YamlType::Bool => match processor.current_schema() {
749 SchemaType::Json => match trimmed {
750 "true" => Ok(Yaml::Boolean(true)),
751 "false" => Ok(Yaml::Boolean(false)),
752 _ => Err(ScanError::new(
753 marker,
754 &format!(
755 "Scalar '{trimmed}' is not a canonical JSON boolean (expected 'true' or 'false')"
756 ),
757 )),
758 },
759 _ => match trimmed.to_ascii_lowercase().as_str() {
760 "true" | "yes" | "on" => Ok(Yaml::Boolean(true)),
761 "false" | "no" | "off" => Ok(Yaml::Boolean(false)),
762 _ => Ok(Yaml::String(trimmed.to_string())),
763 },
764 },
765 YamlType::Int => {
766 if processor.is_integer_pattern(trimmed) {
767 trimmed
768 .parse::<i64>()
769 .map(Yaml::Integer)
770 .map_err(|_| {
771 ScanError::new(
772 marker,
773 &format!("Scalar '{trimmed}' is not a canonical JSON integer"),
774 )
775 })
776 } else {
777 Err(ScanError::new(
778 marker,
779 &format!("Scalar '{trimmed}' is not a canonical JSON integer"),
780 ))
781 }
782 }
783 YamlType::Float => match processor.current_schema() {
784 SchemaType::Json => {
785 if processor.is_float_pattern(trimmed) {
786 trimmed
787 .parse::<f64>()
788 .map(|f| Yaml::Real(f.to_string()))
789 .map_err(|_| {
790 ScanError::new(
791 marker,
792 &format!(
793 "Scalar '{trimmed}' is not a canonical JSON number"
794 ),
795 )
796 })
797 } else {
798 Err(ScanError::new(
799 marker,
800 &format!("Scalar '{trimmed}' is not a canonical JSON number"),
801 ))
802 }
803 }
804 _ => match trimmed.to_ascii_lowercase().as_str() {
805 ".inf" | "+.inf" => Ok(Yaml::Real("+.inf".to_string())),
806 "-.inf" => Ok(Yaml::Real("-.inf".to_string())),
807 ".nan" => Ok(Yaml::Real(".nan".to_string())),
808 _ => trimmed
809 .parse::<f64>()
810 .map(|f| Yaml::Real(f.to_string()))
811 .map_err(|_| {
812 ScanError::new(
813 marker,
814 &format!("Scalar '{trimmed}' is not a valid YAML float"),
815 )
816 }),
817 },
818 },
819 YamlType::Str
820 | YamlType::Unknown
821 | YamlType::Custom(_)
822 | YamlType::Binary
823 | YamlType::Timestamp
824 | YamlType::Seq
825 | YamlType::Map
826 | YamlType::Pairs
827 | YamlType::Set
828 | YamlType::Omap
829 | YamlType::Merge
830 | YamlType::Value => Ok(Yaml::String(trimmed.to_string())),
831 }
832 }
833}
834
835/// The data structure that builds `Yaml` AST from parser events
836pub struct YamlReceiver {
837 pub docs: Vec<Yaml>,
838 doc_stack: Vec<(Yaml, usize)>,
839 key_stack: Vec<Yaml>,
840 anchors: HashMap<usize, Yaml>,
841 // Simple circular reference detection
842 resolution_stack: Vec<usize>,
843 // Billion laughs protection
844 alias_count: usize,
845 // Schema processor for scalar type inference
846 schema_processor: SchemaProcessor<'static>,
847 error: Option<ScanError>,
848}
849
850impl Default for YamlReceiver {
851 fn default() -> Self {
852 Self::new()
853 }
854}
855
856impl YamlReceiver {
857 #[must_use]
858 pub fn new() -> Self {
859 Self::new_with_schema(SchemaType::Core)
860 }
861
862 #[must_use]
863 pub fn new_with_schema(schema: SchemaType) -> Self {
864 let mut processor = SchemaProcessor::<'static>::new();
865 processor.set_schema(schema);
866 Self {
867 docs: Vec::with_capacity(1), // Most YAML files have 1 document
868 doc_stack: Vec::with_capacity(8), // Typical nesting depth
869 key_stack: Vec::with_capacity(8), // Typical mapping depth
870 anchors: HashMap::with_capacity(16), // Reasonable anchor count
871 resolution_stack: Vec::with_capacity(8), // Rare deep circular refs
872 alias_count: 0, // Start with no aliases processed
873 schema_processor: processor,
874 error: None,
875 }
876 }
877
878 #[inline]
879 fn insert_new_node(&mut self, (node, aid): (Yaml, usize)) {
880 if self.error.is_some() {
881 return;
882 }
883 // store anchor if needed - blazing-fast HashMap operations
884 if aid > 0 {
885 self.anchors.insert(aid, node.clone());
886 }
887 if self.doc_stack.is_empty() {
888 self.doc_stack.push((node, 0));
889 } else if let Some(top) = self.doc_stack.last_mut() {
890 match top.0 {
891 Yaml::Array(ref mut arr) => arr.push(node),
892 Yaml::Hash(ref mut h) => {
893 if let Some(cur_key) = self.key_stack.last_mut() {
894 if cur_key.is_badvalue() {
895 *cur_key = node;
896 } else {
897 let mut swap_key = Yaml::BadValue;
898 std::mem::swap(&mut swap_key, cur_key);
899 h.insert(swap_key, node);
900 }
901 }
902 }
903 _ => {}
904 }
905 }
906 }
907
908 /// Blazing-fast alias resolution with circular reference protection
909 #[inline]
910 fn resolve_alias(&mut self, id: usize) -> Yaml {
911 // Billion laughs protection - limit total alias resolutions
912 self.alias_count += 1;
913 if self.alias_count > 1000 {
914 warn!(
915 "Alias count exceeded limit ({}), potential billion laughs attack",
916 self.alias_count
917 );
918 return Yaml::Null;
919 }
920
921 // Fast circular reference check - O(n) but n is typically very small (< 10 deep)
922 if self.resolution_stack.contains(&id) {
923 warn!(
924 "Circular reference detected for alias ID {}, breaking cycle",
925 id
926 );
927 return Yaml::Null;
928 }
929
930 // Look up the anchored value and return it immediately
931 if let Some(anchored_node) = self.anchors.get(&id).cloned() {
932 anchored_node
933 } else {
934 warn!("Anchor ID {} not found, returning null", id);
935 Yaml::Null
936 }
937 }
938
939 /// Reset alias tracking state (called between documents)
940 #[inline]
941 fn reset_alias_tracking(&mut self) {
942 self.resolution_stack.clear();
943 self.alias_count = 0;
944 }
945}
946
947impl EventReceiver for YamlReceiver {
948 fn on_event(&mut self, ev: Event) {
949 if self.error.is_some() {
950 return;
951 }
952 trace!(
953 "YAML EVENT: {:?} (doc_stack len: {}, docs len: {})",
954 ev,
955 self.doc_stack.len(),
956 self.docs.len()
957 );
958 match ev {
959 Event::DocumentStart => {
960 // Reset alias tracking for each new document
961 self.reset_alias_tracking();
962 }
963 Event::DocumentEnd => match self.doc_stack.len() {
964 0 => self.docs.push(Yaml::BadValue),
965 1 => {
966 if let Some((doc, _)) = self.doc_stack.pop() {
967 self.docs.push(doc);
968 }
969 }
970 _ => {}
971 },
972 Event::StreamStart => {}
973 Event::StreamEnd => {}
974 Event::Alias(id) => {
975 let node = self.resolve_alias(id);
976 self.insert_new_node((node, 0));
977 }
978 Event::Scalar(s, style, aid, tag) => {
979 let node = if style != TScalarStyle::Plain {
980 Yaml::String(s)
981 } else if let Some(TokenType::Tag(ref handle, ref suffix)) = tag {
982 // handle tag
983 if handle == "!!" {
984 match suffix.as_str() {
985 "bool" => match s.parse::<bool>() {
986 Ok(b) => Yaml::Boolean(b),
987 Err(_) => Yaml::BadValue,
988 },
989 "int" => match s.parse::<i64>() {
990 Ok(i) => Yaml::Integer(i),
991 Err(_) => Yaml::BadValue,
992 },
993 "float" => match s.parse::<f64>() {
994 Ok(_) => Yaml::Real(s),
995 Err(_) => Yaml::BadValue,
996 },
997 "null" => {
998 if s == "~" || s == "null" {
999 Yaml::Null
1000 } else {
1001 Yaml::BadValue
1002 }
1003 }
1004 _ => Yaml::String(s),
1005 }
1006 } else {
1007 // Preserve custom tag by creating a Tagged variant
1008 let tag_name = if handle.is_empty() {
1009 suffix.clone()
1010 } else {
1011 format!("{}{}", handle, suffix)
1012 };
1013 match YamlLoader::resolve_plain_scalar(&mut self.schema_processor, &s) {
1014 Ok(value) => Yaml::Tagged(tag_name, Box::new(value)),
1015 Err(err) => {
1016 self.error = Some(err);
1017 Yaml::Null
1018 }
1019 }
1020 }
1021 } else {
1022 // autodetect
1023 match YamlLoader::resolve_plain_scalar(&mut self.schema_processor, &s) {
1024 Ok(value) => value,
1025 Err(err) => {
1026 self.error = Some(err);
1027 Yaml::Null
1028 }
1029 }
1030 };
1031 self.insert_new_node((node, aid));
1032 }
1033 Event::SequenceStart(aid) => {
1034 self.doc_stack.push((Yaml::Array(Vec::new()), aid));
1035 }
1036 Event::SequenceEnd => {
1037 if let Some(top) = self.doc_stack.pop() {
1038 self.insert_new_node(top);
1039 }
1040 }
1041 Event::MappingStart(aid) => {
1042 let h = LinkedHashMap::new();
1043 self.doc_stack.push((Yaml::Hash(h), aid));
1044 self.key_stack.push(Yaml::BadValue);
1045 }
1046 Event::MappingEnd => {
1047 self.key_stack.pop();
1048 if let Some(top) = self.doc_stack.pop() {
1049 self.insert_new_node(top);
1050 }
1051 }
1052 Event::YamlDirective(_major, _minor) => {
1053 // Store YAML version directive for document processing
1054 // This is handled at the state machine level, no action needed here
1055 }
1056 Event::TagDirective(_handle, _prefix) => {
1057 // Store TAG directive for document processing
1058 // This is handled at the state machine level, no action needed here
1059 }
1060 Event::Nothing => {}
1061 }
1062 }
1063}
1064
1065impl YamlReceiver {
1066 pub fn into_result(self) -> Result<Vec<Yaml>, ScanError> {
1067 if let Some(err) = self.error {
1068 Err(err)
1069 } else {
1070 Ok(self.docs)
1071 }
1072 }
1073}
1074
1075// Old load function removed - StateMachine::parse() handles loading directly
1076/*
1077pub fn load<T: Iterator<Item = char>, R: MarkedEventReceiver>(
1078 parser: &mut Parser<T>,
1079 recv: &mut R,
1080 multi: bool,
1081) -> Result<(), ScanError> {
1082 // ZERO-ALLOCATION, NON-RECURSIVE LOADER USING EXPLICIT STACK
1083 // Uses Vec<ContainerType> to track nesting instead of recursion
1084 #[derive(Debug, Clone, Copy)]
1085 enum ContainerType {
1086 Sequence,
1087 Mapping,
1088 }
1089
1090 let mut nesting_stack: Vec<ContainerType> = Vec::with_capacity(32); // Pre-allocate for performance
1091 let mut documents_processed = 0;
1092 let mut in_document = false;
1093
1094 // Ensure stream has started
1095 if !parser.scanner.stream_started() {
1096 let (ev, mark) = parser.next()?;
1097 if ev != Event::StreamStart {
1098 return Err(ScanError::new(
1099 mark,
1100 &format!("Expected StreamStart event, got {ev:?}")
1101 ));
1102 }
1103 recv.on_event(ev, mark);
1104 }
1105
1106 if parser.scanner.stream_ended() {
1107 recv.on_event(Event::StreamEnd, parser.scanner.mark());
1108 return Ok(());
1109 }
1110
1111 // FLAT EVENT PROCESSING LOOP - ZERO RECURSION
1112 loop {
1113 let (ev, mark) = parser.next()?;
1114
1115 match ev {
1116 Event::StreamEnd => {
1117 if in_document {
1118 recv.on_event(Event::DocumentEnd, mark);
1119 }
1120 recv.on_event(ev, mark);
1121 break;
1122 }
1123
1124 Event::DocumentStart => {
1125 if in_document && multi {
1126 recv.on_event(Event::DocumentEnd, mark);
1127 }
1128 parser.anchors.clear();
1129 recv.on_event(ev, mark);
1130 in_document = true;
1131 documents_processed += 1;
1132 if !multi && documents_processed > 1 {
1133 // Single document mode: ignore additional documents
1134 continue;
1135 }
1136 }
1137
1138 Event::DocumentEnd => {
1139 recv.on_event(ev, mark);
1140 in_document = false;
1141 }
1142
1143 Event::SequenceStart(_) => {
1144 if !in_document {
1145 // Implicit document start
1146 parser.anchors.clear();
1147 recv.on_event(Event::DocumentStart, mark);
1148 in_document = true;
1149 documents_processed += 1;
1150 }
1151 recv.on_event(ev, mark);
1152 nesting_stack.push(ContainerType::Sequence);
1153 }
1154
1155 Event::SequenceEnd => {
1156 recv.on_event(ev, mark);
1157 if let Some(ContainerType::Sequence) = nesting_stack.pop() {
1158 // Correct nesting
1159 } else {
1160 return Err(ScanError::new(
1161 mark,
1162 "Unexpected SequenceEnd: not inside sequence"
1163 ));
1164 }
1165 }
1166
1167 Event::MappingStart(_) => {
1168 if !in_document {
1169 // Implicit document start
1170 parser.anchors.clear();
1171 recv.on_event(Event::DocumentStart, mark);
1172 in_document = true;
1173 documents_processed += 1;
1174 }
1175 recv.on_event(ev, mark);
1176 nesting_stack.push(ContainerType::Mapping);
1177 }
1178
1179 Event::MappingEnd => {
1180 recv.on_event(ev, mark);
1181 if let Some(ContainerType::Mapping) = nesting_stack.pop() {
1182 // Correct nesting
1183 } else {
1184 return Err(ScanError::new(
1185 mark,
1186 "Unexpected MappingEnd: not inside mapping"
1187 ));
1188 }
1189 }
1190
1191 Event::Scalar(..) | Event::Alias(..) => {
1192 if !in_document {
1193 // Implicit document start
1194 parser.anchors.clear();
1195 recv.on_event(Event::DocumentStart, mark);
1196 in_document = true;
1197 documents_processed += 1;
1198 }
1199 recv.on_event(ev, mark);
1200 }
1201
1202 _ => {
1203 // Handle any other events directly
1204 if !in_document {
1205 // Implicit document start
1206 parser.anchors.clear();
1207 recv.on_event(Event::DocumentStart, mark);
1208 in_document = true;
1209 documents_processed += 1;
1210 }
1211 recv.on_event(ev, mark);
1212 }
1213 }
1214
1215 // Single document mode: break after processing first document
1216 if !multi && documents_processed >= 1 && nesting_stack.is_empty() && in_document {
1217 // Continue to find StreamEnd
1218 loop {
1219 let (next_ev, next_mark) = parser.next()?;
1220 if matches!(next_ev, Event::StreamEnd) {
1221 recv.on_event(Event::DocumentEnd, next_mark);
1222 recv.on_event(next_ev, next_mark);
1223 break;
1224 }
1225 // Skip other events in single document mode
1226 }
1227 break;
1228 }
1229 }
1230
1231 // Verify all containers were properly closed
1232 if !nesting_stack.is_empty() {
1233 return Err(ScanError::new(
1234 parser.scanner.mark(),
1235 &format!("Unclosed containers at end of stream: {} remaining", nesting_stack.len())
1236 ));
1237 }
1238
1239 Ok(())
1240}
1241*/
1242
1243// REMOVED: load_document function - replaced with flat, non-recursive loader
1244// This function was causing stack overflow via recursive calls to load_node
1245
1246// REMOVED: load_node function - replaced with flat, non-recursive loader
1247// This function was causing infinite recursion via load_sequence/load_mapping calls
1248
1249// REMOVED: load_sequence function - replaced with flat, non-recursive loader
1250// This function was causing infinite recursion via load_node calls
1251
1252// REMOVED: load_mapping function - replaced with flat, non-recursive loader
1253// This function was causing infinite recursion via load_node calls