rust_yaml/scanner/
scalar_scanner.rs

1//! Scalar scanning functionality for YAML scanner
2
3use super::{QuoteStyle, Token, TokenType};
4use crate::{Error, Position, Result};
5
6/// Trait for scanning scalar values
7pub trait ScalarScanner {
8    /// Scan a plain scalar (unquoted string)
9    fn scan_plain_scalar(&mut self) -> Result<Token>;
10
11    /// Scan a quoted string (single or double quotes)
12    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token>;
13
14    /// Scan a number (integer or float)
15    fn scan_number(&mut self) -> Result<Token>;
16
17    /// Scan a literal block scalar (|)
18    fn scan_literal_block_scalar(&mut self) -> Result<Token>;
19
20    /// Scan a folded block scalar (>)
21    fn scan_folded_block_scalar(&mut self) -> Result<Token>;
22
23    /// Scan block scalar header for chomping and indentation
24    fn scan_block_scalar_header(&mut self) -> Result<(bool, Option<usize>)>;
25
26    /// Helper: get current position
27    fn current_position(&self) -> Position;
28
29    /// Helper: peek at current character
30    fn current_char(&self) -> Option<char>;
31
32    /// Helper: advance to next character
33    fn advance_char(&mut self) -> Option<char>;
34
35    /// Helper: peek at next character
36    fn peek_char(&self, offset: usize) -> Option<char>;
37
38    /// Helper: check if at line start
39    fn at_line_start(&self) -> bool;
40}
41
42/// Helper functions for scalar processing
43pub(super) fn is_plain_scalar_char(ch: char) -> bool {
44    !matches!(
45        ch,
46        ':' | ','
47            | '['
48            | ']'
49            | '{'
50            | '}'
51            | '#'
52            | '&'
53            | '*'
54            | '!'
55            | '|'
56            | '>'
57            | '\''
58            | '"'
59            | '%'
60            | '@'
61            | '`'
62    )
63}
64
65pub(super) fn process_escape_sequence(ch: char) -> Result<String> {
66    match ch {
67        'n' => Ok("\n".to_string()),
68        'r' => Ok("\r".to_string()),
69        't' => Ok("\t".to_string()),
70        '\\' => Ok("\\".to_string()),
71        '"' => Ok("\"".to_string()),
72        '\'' => Ok("'".to_string()),
73        '0' => Ok("\0".to_string()),
74        'a' => Ok("\x07".to_string()), // Bell
75        'b' => Ok("\x08".to_string()), // Backspace
76        'f' => Ok("\x0C".to_string()), // Form feed
77        'v' => Ok("\x0B".to_string()), // Vertical tab
78        'e' => Ok("\x1B".to_string()), // Escape
79        ' ' => Ok(" ".to_string()),
80        'N' => Ok("\u{85}".to_string()),   // Next line (NEL)
81        '_' => Ok("\u{A0}".to_string()),   // Non-breaking space
82        'L' => Ok("\u{2028}".to_string()), // Line separator
83        'P' => Ok("\u{2029}".to_string()), // Paragraph separator
84        _ => Err(Error::scan(
85            Position::new(),
86            format!("Invalid escape sequence: \\{}", ch),
87        )),
88    }
89}
90
91/// Implementation of ScalarScanner for BasicScanner
92impl ScalarScanner for super::BasicScanner {
93    fn scan_plain_scalar(&mut self) -> Result<Token> {
94        let start_pos = self.position;
95        let mut value = String::new();
96
97        while let Some(ch) = self.current_char {
98            // Stop at structural characters in block context
99            if self.flow_level == 0 {
100                match ch {
101                    '\n' | '\r' => break,
102                    ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
103                    '#' if value.is_empty()
104                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
105                    {
106                        break;
107                    }
108                    _ => {}
109                }
110            } else {
111                // In flow context, stop at flow indicators
112                match ch {
113                    ',' | '[' | ']' | '{' | '}' => break,
114                    ':' if self
115                        .peek_char(1)
116                        .map_or(true, |c| c.is_whitespace() || "]}".contains(c)) =>
117                    {
118                        break;
119                    }
120                    '#' if value.is_empty()
121                        || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
122                    {
123                        break;
124                    }
125                    _ => {}
126                }
127            }
128
129            value.push(ch);
130            self.advance();
131        }
132
133        // Check string length limit
134        self.resource_tracker
135            .check_string_length(&self.limits, value.len())?;
136
137        // Trim trailing whitespace from plain scalars
138        let value = value.trim_end().to_string();
139        let normalized_value = Self::normalize_scalar(value);
140
141        Ok(Token::new(
142            TokenType::Scalar(normalized_value, QuoteStyle::Plain),
143            start_pos,
144            self.position,
145        ))
146    }
147
148    fn scan_quoted_string(&mut self, quote_char: char) -> Result<Token> {
149        let start_pos = self.position;
150        let mut value = String::new();
151
152        // Skip opening quote
153        self.advance();
154
155        while let Some(ch) = self.current_char {
156            if ch == quote_char {
157                // End quote found
158                self.advance();
159                break;
160            } else if ch == '\\' && quote_char == '"' {
161                // Handle escape sequences in double quotes
162                self.advance();
163                if let Some(escaped_char) = self.current_char {
164                    match escaped_char {
165                        'n' => value.push('\n'),
166                        'r' => value.push('\r'),
167                        't' => value.push('\t'),
168                        '\\' => value.push('\\'),
169                        '"' => value.push('"'),
170                        '\'' => value.push('\''),
171                        '0' => value.push('\0'),
172                        'a' => value.push('\x07'), // Bell
173                        'b' => value.push('\x08'), // Backspace
174                        'f' => value.push('\x0C'), // Form feed
175                        'v' => value.push('\x0B'), // Vertical tab
176                        'e' => value.push('\x1B'), // Escape
177                        ' ' => value.push(' '),
178                        'N' => value.push('\u{85}'),   // Next line (NEL)
179                        '_' => value.push('\u{A0}'),   // Non-breaking space
180                        'L' => value.push('\u{2028}'), // Line separator
181                        'P' => value.push('\u{2029}'), // Paragraph separator
182                        _ => {
183                            // Invalid escape sequence
184                            return Err(Error::scan(
185                                self.position,
186                                format!("Invalid escape sequence: \\{}", escaped_char),
187                            ));
188                        }
189                    }
190                    self.advance();
191                } else {
192                    return Err(Error::scan(
193                        self.position,
194                        "Unterminated escape sequence".to_string(),
195                    ));
196                }
197            } else {
198                value.push(ch);
199                self.advance();
200            }
201        }
202
203        // Check string length limit
204        self.resource_tracker
205            .check_string_length(&self.limits, value.len())?;
206
207        let quote_style = match quote_char {
208            '\'' => QuoteStyle::Single,
209            '"' => QuoteStyle::Double,
210            _ => QuoteStyle::Plain,
211        };
212
213        Ok(Token::new(
214            TokenType::Scalar(value, quote_style),
215            start_pos,
216            self.position,
217        ))
218    }
219
220    fn scan_number(&mut self) -> Result<Token> {
221        let start_pos = self.position;
222        let mut value = String::new();
223
224        // Handle negative numbers
225        if self.current_char == Some('-') {
226            value.push('-');
227            self.advance();
228        }
229
230        // Scan digits
231        while let Some(ch) = self.current_char {
232            if ch.is_ascii_digit() {
233                value.push(ch);
234                self.advance();
235            } else if ch == '.' {
236                value.push(ch);
237                self.advance();
238                // Scan fractional part
239                while let Some(ch) = self.current_char {
240                    if ch.is_ascii_digit() {
241                        value.push(ch);
242                        self.advance();
243                    } else {
244                        break;
245                    }
246                }
247                break;
248            } else {
249                break;
250            }
251        }
252
253        Ok(Token::new(
254            TokenType::Scalar(value, QuoteStyle::Plain),
255            start_pos,
256            self.position,
257        ))
258    }
259
260    fn scan_literal_block_scalar(&mut self) -> Result<Token> {
261        let start_pos = self.position;
262
263        // Skip the '|' character
264        self.advance();
265
266        // Scan block scalar header for chomping and indentation
267        let (keep_chomping, explicit_indent) = self.scan_block_scalar_header()?;
268
269        // Find the base indentation level
270        let mut base_indent = None;
271        let mut lines = Vec::new();
272        let mut current_line = String::new();
273
274        // Skip to end of header line
275        while let Some(ch) = self.current_char {
276            if ch == '\n' || ch == '\r' {
277                self.advance();
278                break;
279            }
280            self.advance();
281        }
282
283        // Collect lines
284        while let Some(ch) = self.current_char {
285            if ch == '\n' || ch == '\r' {
286                lines.push(current_line.clone());
287                current_line.clear();
288                self.advance();
289
290                // Check if next line has content to determine if we should continue
291                let mut temp_indent = 0usize;
292                let mut has_content = false;
293
294                while let Some(next_ch) = self.peek_char(temp_indent as isize) {
295                    if next_ch == ' ' || next_ch == '\t' {
296                        temp_indent += 1;
297                    } else if next_ch == '\n' || next_ch == '\r' {
298                        // Empty line, continue collecting
299                        break;
300                    } else {
301                        has_content = true;
302                        break;
303                    }
304                }
305
306                if !has_content {
307                    // No more content lines
308                    break;
309                }
310
311                // Set base indentation from first content line
312                if base_indent.is_none() && has_content {
313                    base_indent = Some(explicit_indent.unwrap_or(temp_indent));
314                }
315            } else {
316                current_line.push(ch);
317                self.advance();
318            }
319        }
320
321        // Add final line if not empty
322        if !current_line.is_empty() {
323            lines.push(current_line);
324        }
325
326        // Join lines with newlines (literal style preserves line breaks)
327        let mut value = lines.join("\n");
328
329        // Apply chomping rules
330        if !keep_chomping {
331            value = value.trim_end_matches('\n').to_string();
332        }
333
334        // Check string length limit
335        self.resource_tracker
336            .check_string_length(&self.limits, value.len())?;
337
338        Ok(Token::new(
339            TokenType::BlockScalarLiteral(value),
340            start_pos,
341            self.position,
342        ))
343    }
344
345    fn scan_folded_block_scalar(&mut self) -> Result<Token> {
346        let start_pos = self.position;
347
348        // Skip the '>' character
349        self.advance();
350
351        // Scan block scalar header for chomping and indentation
352        let (keep_chomping, explicit_indent) = self.scan_block_scalar_header()?;
353
354        // Similar to literal but fold newlines
355        let mut base_indent = None;
356        let mut lines = Vec::new();
357        let mut current_line = String::new();
358
359        // Skip to end of header line
360        while let Some(ch) = self.current_char {
361            if ch == '\n' || ch == '\r' {
362                self.advance();
363                break;
364            }
365            self.advance();
366        }
367
368        // Collect lines
369        while let Some(ch) = self.current_char {
370            if ch == '\n' || ch == '\r' {
371                lines.push(current_line.clone());
372                current_line.clear();
373                self.advance();
374
375                // Check if next line has content
376                let mut temp_indent = 0usize;
377                let mut has_content = false;
378
379                while let Some(next_ch) = self.peek_char(temp_indent as isize) {
380                    if next_ch == ' ' || next_ch == '\t' {
381                        temp_indent += 1;
382                    } else if next_ch == '\n' || next_ch == '\r' {
383                        break;
384                    } else {
385                        has_content = true;
386                        break;
387                    }
388                }
389
390                if !has_content {
391                    break;
392                }
393
394                if base_indent.is_none() && has_content {
395                    base_indent = Some(explicit_indent.unwrap_or(temp_indent));
396                }
397            } else {
398                current_line.push(ch);
399                self.advance();
400            }
401        }
402
403        if !current_line.is_empty() {
404            lines.push(current_line);
405        }
406
407        // Fold lines: join non-empty lines with spaces, preserve empty lines
408        let mut value = String::new();
409        let mut prev_was_empty = false;
410
411        for (i, line) in lines.iter().enumerate() {
412            if line.trim().is_empty() {
413                if !prev_was_empty && i > 0 {
414                    value.push('\n');
415                }
416                prev_was_empty = true;
417            } else {
418                if i > 0 && !prev_was_empty {
419                    value.push(' ');
420                } else if prev_was_empty && i > 0 {
421                    value.push('\n');
422                }
423                value.push_str(line.trim());
424                prev_was_empty = false;
425            }
426        }
427
428        // Apply chomping rules
429        if !keep_chomping {
430            value = value.trim_end_matches('\n').to_string();
431        }
432
433        // Check string length limit
434        self.resource_tracker
435            .check_string_length(&self.limits, value.len())?;
436
437        Ok(Token::new(
438            TokenType::BlockScalarFolded(value),
439            start_pos,
440            self.position,
441        ))
442    }
443
444    fn scan_block_scalar_header(&mut self) -> Result<(bool, Option<usize>)> {
445        let mut keep_chomping = true;
446        let mut explicit_indent = None;
447
448        // Skip whitespace after '|' or '>'
449        while let Some(ch) = self.current_char {
450            if ch == ' ' || ch == '\t' {
451                self.advance();
452            } else {
453                break;
454            }
455        }
456
457        // Check for explicit indentation indicator (digit)
458        if let Some(ch) = self.current_char {
459            if ch.is_ascii_digit() {
460                explicit_indent = Some(ch.to_digit(10).unwrap() as usize);
461                self.advance();
462            }
463        }
464
465        // Check for chomping indicator
466        if let Some(ch) = self.current_char {
467            match ch {
468                '-' => {
469                    keep_chomping = false; // Strip final newlines
470                    self.advance();
471                }
472                '+' => {
473                    keep_chomping = true; // Keep final newlines
474                    self.advance();
475                }
476                _ => {}
477            }
478        }
479
480        Ok((keep_chomping, explicit_indent))
481    }
482
483    // Helper trait methods
    /// Returns the scanner's current `position` field.
    fn current_position(&self) -> Position {
        self.position
    }
487
    /// Returns the scanner's `current_char` field: the character under the
    /// cursor, or `None` when there is none.
    fn current_char(&self) -> Option<char> {
        self.current_char
    }
491
    /// Advances the scanner by delegating to its `advance` method and
    /// returns whatever that call returns.
    fn advance_char(&mut self) -> Option<char> {
        self.advance()
    }
495
    /// Peeks at the character `offset` places from the cursor without
    /// consuming input.
    ///
    /// The unsigned `offset` is cast to `isize` and forwarded to the
    /// scanner's own `peek_char`, so only non-negative offsets can be
    /// requested through the trait.
    fn peek_char(&self, offset: usize) -> Option<char> {
        self.peek_char(offset as isize)
    }
499
    /// Returns `true` when the current position's column is 1.
    fn at_line_start(&self) -> bool {
        self.position.column == 1
    }
503}