Skip to main content

rust_yaml/scanner/
scalar_scanner.rs

1//! Scalar scanning functionality for YAML scanner
2
3use super::{QuoteStyle, Token, TokenType};
4use crate::{Error, Position, Result};
5
6/// Trait for scanning scalar values
7pub trait ScalarScanner {
8    /// Scan a plain scalar (unquoted string)
9    fn scan_plain_scalar(&mut self) -> Result<Token>;
10
11    /// Scan a quoted string (single or double quotes)
12    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token>;
13
14    /// Scan a number (integer or float)
15    fn scan_number(&mut self) -> Result<Token>;
16
17    /// Scan a literal block scalar (|)
18    fn scan_literal_block_scalar(&mut self) -> Result<Token>;
19
20    /// Scan a folded block scalar (>)
21    fn scan_folded_block_scalar(&mut self) -> Result<Token>;
22
23    /// Scan block scalar header for chomping and indentation
24    fn scan_block_scalar_header(&mut self) -> Result<(bool, Option<usize>)>;
25
26    /// Helper: get current position
27    fn current_position(&self) -> Position;
28
29    /// Helper: peek at current character
30    fn current_char(&self) -> Option<char>;
31
32    /// Helper: advance to next character
33    fn advance_char(&mut self) -> Option<char>;
34
35    /// Helper: peek at next character
36    fn peek_char(&self, offset: usize) -> Option<char>;
37
38    /// Helper: check if at line start
39    fn at_line_start(&self) -> bool;
40}
41
42/// Helper functions for scalar processing
43pub(super) fn is_plain_scalar_char(ch: char) -> bool {
44    !matches!(
45        ch,
46        ':' | ','
47            | '['
48            | ']'
49            | '{'
50            | '}'
51            | '#'
52            | '&'
53            | '*'
54            | '!'
55            | '|'
56            | '>'
57            | '\''
58            | '"'
59            | '%'
60            | '@'
61            | '`'
62    )
63}
64
65pub(super) fn process_escape_sequence(ch: char) -> Result<String> {
66    match ch {
67        'n' => Ok("\n".to_string()),
68        'r' => Ok("\r".to_string()),
69        't' => Ok("\t".to_string()),
70        '\\' => Ok("\\".to_string()),
71        '"' => Ok("\"".to_string()),
72        '\'' => Ok("'".to_string()),
73        '0' => Ok("\0".to_string()),
74        'a' => Ok("\x07".to_string()), // Bell
75        'b' => Ok("\x08".to_string()), // Backspace
76        'f' => Ok("\x0C".to_string()), // Form feed
77        'v' => Ok("\x0B".to_string()), // Vertical tab
78        'e' => Ok("\x1B".to_string()), // Escape
79        ' ' => Ok(" ".to_string()),
80        'N' => Ok("\u{85}".to_string()),   // Next line (NEL)
81        '_' => Ok("\u{A0}".to_string()),   // Non-breaking space
82        'L' => Ok("\u{2028}".to_string()), // Line separator
83        'P' => Ok("\u{2029}".to_string()), // Paragraph separator
84        _ => Err(Error::scan(
85            Position::new(),
86            format!("Invalid escape sequence: \\{}", ch),
87        )),
88    }
89}
90
91/// Implementation of ScalarScanner for BasicScanner
92impl ScalarScanner for super::BasicScanner {
93    fn scan_plain_scalar(&mut self) -> Result<Token> {
94        let start_pos = self.position;
95        let mut value = String::new();
96
97        while let Some(ch) = self.current_char {
98            // Stop at structural characters in block context
99            if self.flow_level == 0 {
100                match ch {
101                    '\n' | '\r' => break,
102                    ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
103                    '#' if value.is_empty()
104                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
105                    {
106                        break;
107                    }
108                    _ => {}
109                }
110            } else {
111                // In flow context, stop at flow indicators
112                match ch {
113                    ',' | '[' | ']' | '{' | '}' => break,
114                    ':' if self
115                        .peek_char(1)
116                        .map_or(true, |c| c.is_whitespace() || "]}".contains(c)) =>
117                    {
118                        break;
119                    }
120                    '#' if value.is_empty()
121                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
122                    {
123                        break;
124                    }
125                    _ => {}
126                }
127            }
128
129            value.push(ch);
130            self.advance();
131        }
132
133        // Check string length limit
134        self.resource_tracker
135            .check_string_length(&self.limits, value.len())?;
136
137        // Trim trailing whitespace from plain scalars
138        let value = value.trim_end().to_string();
139        let normalized_value = Self::normalize_scalar(value);
140
141        Ok(Token::new(
142            TokenType::Scalar(normalized_value, QuoteStyle::Plain),
143            start_pos,
144            self.position,
145        ))
146    }
147
148    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
149        let start_pos = self.position;
150        let mut value = String::new();
151
152        // Skip opening quote
153        self.advance();
154
155        while let Some(ch) = self.current_char {
156            if ch == quote_char {
157                // End quote found
158                self.advance();
159                break;
160            } else if ch == '\\' && quote_char == '"' {
161                // Handle escape sequences in double quotes
162                self.advance();
163                if let Some(escaped_char) = self.current_char {
164                    match escaped_char {
165                        'n' => value.push('\n'),
166                        'r' => value.push('\r'),
167                        't' => value.push('\t'),
168                        '\\' => value.push('\\'),
169                        '"' => value.push('"'),
170                        '\'' => value.push('\''),
171                        '0' => value.push('\0'),
172                        'a' => value.push('\x07'), // Bell
173                        'b' => value.push('\x08'), // Backspace
174                        'f' => value.push('\x0C'), // Form feed
175                        'v' => value.push('\x0B'), // Vertical tab
176                        'e' => value.push('\x1B'), // Escape
177                        ' ' => value.push(' '),
178                        'N' => value.push('\u{85}'),   // Next line (NEL)
179                        '_' => value.push('\u{A0}'),   // Non-breaking space
180                        'L' => value.push('\u{2028}'), // Line separator
181                        'P' => value.push('\u{2029}'), // Paragraph separator
182                        _ => {
183                            // Invalid escape sequence
184                            return Err(Error::scan(
185                                self.position,
186                                format!("Invalid escape sequence: \\{}", escaped_char),
187                            ));
188                        }
189                    }
190                    self.advance();
191                } else {
192                    return Err(Error::scan(
193                        self.position,
194                        "Unterminated escape sequence".to_string(),
195                    ));
196                }
197            } else {
198                value.push(ch);
199                self.advance();
200            }
201        }
202
203        // Check string length limit
204        self.resource_tracker
205            .check_string_length(&self.limits, value.len())?;
206
207        let quote_style = match quote_char {
208            '\'' => QuoteStyle::Single,
209            '"' => QuoteStyle::Double,
210            _ => QuoteStyle::Plain,
211        };
212
213        Ok(Token::new(
214            TokenType::Scalar(value, quote_style),
215            start_pos,
216            self.position,
217        ))
218    }
219
220    fn scan_number(&mut self) -> Result<Token> {
221        let start_pos = self.position;
222        let mut value = String::new();
223
224        // Handle negative numbers
225        if self.current_char == Some('-') {
226            value.push('-');
227            self.advance();
228        }
229
230        // Scan digits
231        while let Some(ch) = self.current_char {
232            if ch.is_ascii_digit() {
233                value.push(ch);
234                self.advance();
235            } else if ch == '.' {
236                value.push(ch);
237                self.advance();
238                // Scan fractional part
239                while let Some(ch) = self.current_char {
240                    if ch.is_ascii_digit() {
241                        value.push(ch);
242                        self.advance();
243                    } else {
244                        break;
245                    }
246                }
247                break;
248            } else {
249                break;
250            }
251        }
252
253        Ok(Token::new(
254            TokenType::Scalar(value, QuoteStyle::Plain),
255            start_pos,
256            self.position,
257        ))
258    }
259
260    fn scan_literal_block_scalar(&mut self) -> Result<Token> {
261        let start_pos = self.position;
262
263        // Skip the '|' character
264        self.advance();
265
266        // Scan block scalar header for chomping and indentation
267        let (keep_chomping, explicit_indent) = self.scan_block_scalar_header()?;
268
269        // Find the base indentation level
270        let mut base_indent = None;
271        let mut lines = Vec::new();
272        let mut current_line = String::new();
273
274        // Skip to end of header line
275        while let Some(ch) = self.current_char {
276            if ch == '\n' || ch == '\r' {
277                self.advance();
278                break;
279            }
280            self.advance();
281        }
282
283        // Collect lines
284        while let Some(ch) = self.current_char {
285            if ch == '\n' || ch == '\r' {
286                lines.push(current_line.clone());
287                current_line.clear();
288                self.advance();
289
290                // Check if next line has content to determine if we should continue
291                let mut temp_indent = 0usize;
292                let mut has_content = false;
293
294                while let Some(next_ch) = self.peek_char(temp_indent as isize) {
295                    if next_ch == ' ' || next_ch == '\t' {
296                        temp_indent += 1;
297                    } else if next_ch == '\n' || next_ch == '\r' {
298                        // Empty line, continue collecting
299                        break;
300                    } else {
301                        has_content = true;
302                        break;
303                    }
304                }
305
306                if !has_content {
307                    // No more content lines
308                    break;
309                }
310
311                // Set base indentation from first content line
312                if base_indent.is_none() && has_content {
313                    base_indent = Some(explicit_indent.unwrap_or(temp_indent));
314                }
315            } else {
316                current_line.push(ch);
317                self.advance();
318            }
319        }
320
321        // Add final line if not empty
322        if !current_line.is_empty() {
323            lines.push(current_line);
324        }
325
326        // Join lines with newlines (literal style preserves line breaks)
327        let mut value = lines.join("\n");
328
329        // `scan_block_scalar_header` resolves to the inherent impl
330        // (returning ChompingMode); this trait impl is dead code, but
331        // we keep it compilable.
332        if matches!(keep_chomping, super::ChompingMode::Strip) {
333            value = value.trim_end_matches('\n').to_string();
334        }
335
336        // Check string length limit
337        self.resource_tracker
338            .check_string_length(&self.limits, value.len())?;
339
340        Ok(Token::new(
341            TokenType::BlockScalarLiteral(value),
342            start_pos,
343            self.position,
344        ))
345    }
346
347    fn scan_folded_block_scalar(&mut self) -> Result<Token> {
348        let start_pos = self.position;
349
350        // Skip the '>' character
351        self.advance();
352
353        // Scan block scalar header for chomping and indentation
354        let (keep_chomping, explicit_indent) = self.scan_block_scalar_header()?;
355
356        // Similar to literal but fold newlines
357        let mut base_indent = None;
358        let mut lines = Vec::new();
359        let mut current_line = String::new();
360
361        // Skip to end of header line
362        while let Some(ch) = self.current_char {
363            if ch == '\n' || ch == '\r' {
364                self.advance();
365                break;
366            }
367            self.advance();
368        }
369
370        // Collect lines
371        while let Some(ch) = self.current_char {
372            if ch == '\n' || ch == '\r' {
373                lines.push(current_line.clone());
374                current_line.clear();
375                self.advance();
376
377                // Check if next line has content
378                let mut temp_indent = 0usize;
379                let mut has_content = false;
380
381                while let Some(next_ch) = self.peek_char(temp_indent as isize) {
382                    if next_ch == ' ' || next_ch == '\t' {
383                        temp_indent += 1;
384                    } else if next_ch == '\n' || next_ch == '\r' {
385                        break;
386                    } else {
387                        has_content = true;
388                        break;
389                    }
390                }
391
392                if !has_content {
393                    break;
394                }
395
396                if base_indent.is_none() && has_content {
397                    base_indent = Some(explicit_indent.unwrap_or(temp_indent));
398                }
399            } else {
400                current_line.push(ch);
401                self.advance();
402            }
403        }
404
405        if !current_line.is_empty() {
406            lines.push(current_line);
407        }
408
409        // Fold lines: join non-empty lines with spaces, preserve empty lines
410        let mut value = String::new();
411        let mut prev_was_empty = false;
412
413        for (i, line) in lines.iter().enumerate() {
414            if line.trim().is_empty() {
415                if !prev_was_empty && i > 0 {
416                    value.push('\n');
417                }
418                prev_was_empty = true;
419            } else {
420                if i > 0 && !prev_was_empty {
421                    value.push(' ');
422                } else if prev_was_empty && i > 0 {
423                    value.push('\n');
424                }
425                value.push_str(line.trim());
426                prev_was_empty = false;
427            }
428        }
429
430        // `scan_block_scalar_header` resolves to the inherent impl
431        // (returning ChompingMode); this trait impl is dead code, but
432        // we keep it compilable.
433        if matches!(keep_chomping, super::ChompingMode::Strip) {
434            value = value.trim_end_matches('\n').to_string();
435        }
436
437        // Check string length limit
438        self.resource_tracker
439            .check_string_length(&self.limits, value.len())?;
440
441        Ok(Token::new(
442            TokenType::BlockScalarFolded(value),
443            start_pos,
444            self.position,
445        ))
446    }
447
448    fn scan_block_scalar_header(&mut self) -> Result<(bool, Option<usize>)> {
449        let mut keep_chomping = true;
450        let mut explicit_indent = None;
451
452        // Skip whitespace after '|' or '>'
453        while let Some(ch) = self.current_char {
454            if ch == ' ' || ch == '\t' {
455                self.advance();
456            } else {
457                break;
458            }
459        }
460
461        // Check for explicit indentation indicator (digit)
462        if let Some(ch) = self.current_char {
463            if ch.is_ascii_digit() {
464                explicit_indent = Some(ch.to_digit(10).unwrap() as usize);
465                self.advance();
466            }
467        }
468
469        // Check for chomping indicator
470        if let Some(ch) = self.current_char {
471            match ch {
472                '-' => {
473                    keep_chomping = false; // Strip final newlines
474                    self.advance();
475                }
476                '+' => {
477                    keep_chomping = true; // Keep final newlines
478                    self.advance();
479                }
480                _ => {}
481            }
482        }
483
484        Ok((keep_chomping, explicit_indent))
485    }
486
487    // Helper trait methods
488    fn current_position(&self) -> Position {
489        self.position
490    }
491
492    fn current_char(&self) -> Option<char> {
493        self.current_char
494    }
495
496    fn advance_char(&mut self) -> Option<char> {
497        self.advance()
498    }
499
500    fn peek_char(&self, offset: usize) -> Option<char> {
501        self.peek_char(offset as isize)
502    }
503
504    fn at_line_start(&self) -> bool {
505        self.position.column == 1
506    }
507}