rust_yaml/
zerocopy.rs

1//! Zero-copy parsing optimizations for YAML processing
2//!
3//! This module provides data structures and utilities for minimizing allocations
4//! during YAML parsing by using string slices where possible instead of owned strings.
5
6use crate::{Position, Result};
7use std::borrow::Cow;
8
/// String data that either borrows a slice of the input or owns its own buffer.
///
/// This is a thin wrapper around [`Cow<'a, str>`]. Equality is decided by the
/// text alone, so a borrowed and an owned value with the same contents compare
/// equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ZeroString<'a> {
    data: Cow<'a, str>,
}

impl<'a> ZeroString<'a> {
    /// Wrap a slice without copying it.
    pub fn borrowed(s: &'a str) -> Self {
        ZeroString {
            data: Cow::Borrowed(s),
        }
    }

    /// Wrap an already-allocated `String`, taking ownership of it.
    pub fn owned(s: String) -> Self {
        ZeroString {
            data: Cow::Owned(s),
        }
    }

    /// View the contents as a `&str`, whichever way they are stored.
    pub fn as_str(&self) -> &str {
        self.data.as_ref()
    }

    /// Consume the value and return a `String`.
    ///
    /// Borrowed contents are copied into a new `String`; owned contents are
    /// handed back as they are.
    pub fn into_owned(self) -> String {
        match self.data {
            Cow::Borrowed(slice) => slice.to_owned(),
            Cow::Owned(buffer) => buffer,
        }
    }

    /// `true` when the contents are a borrowed slice rather than an owned buffer.
    pub fn is_borrowed(&self) -> bool {
        match self.data {
            Cow::Borrowed(_) => true,
            Cow::Owned(_) => false,
        }
    }

    /// Length of the contents in bytes (not characters).
    pub fn len(&self) -> usize {
        self.as_str().len()
    }

    /// `true` when the contents are the empty string.
    pub fn is_empty(&self) -> bool {
        self.as_str().is_empty()
    }
}
55
56impl<'a> From<&'a str> for ZeroString<'a> {
57    fn from(s: &'a str) -> Self {
58        Self::borrowed(s)
59    }
60}
61
62impl<'a> From<String> for ZeroString<'a> {
63    fn from(s: String) -> Self {
64        Self::owned(s)
65    }
66}
67
68impl<'a> AsRef<str> for ZeroString<'a> {
69    fn as_ref(&self) -> &str {
70        &self.data
71    }
72}
73
/// Zero-copy token types that use string slices where possible.
///
/// Variants that carry text hold a [`ZeroString`], so the text may borrow from
/// the input for the lifetime `'a`. `ZeroToken::into_owned` maps every variant
/// to the `crate::scanner::TokenType` variant of the same name.
#[derive(Debug, Clone, PartialEq)]
pub enum ZeroTokenType<'a> {
    /// Stream start marker
    StreamStart,
    /// Stream end marker
    StreamEnd,
    /// Document start marker (---)
    DocumentStart,
    /// Document end marker (...)
    DocumentEnd,
    /// Start of a block-style sequence.
    // NOTE(review): this was previously annotated with `[`, which is the
    // flow-sequence indicator (see `FlowSequenceStart`). Block collections are
    // presumably introduced by indentation rather than a bracket — confirm
    // against the scanner.
    BlockSequenceStart,
    /// Start of a block-style mapping.
    // NOTE(review): this was previously annotated with `{`, which is the
    // flow-mapping indicator (see `FlowMappingStart`) — confirm against the
    // scanner.
    BlockMappingStart,
    /// Block end marker
    BlockEnd,
    /// Flow sequence start ([)
    FlowSequenceStart,
    /// Flow sequence end (])
    FlowSequenceEnd,
    /// Flow mapping start ({)
    FlowMappingStart,
    /// Flow mapping end (})
    FlowMappingEnd,
    /// Block entry marker (-)
    BlockEntry,
    /// Flow entry separator (,)
    FlowEntry,
    /// Key marker (?)
    Key,
    /// Value separator (:)
    Value,
    /// Scalar value together with the quote style it was written in
    Scalar(ZeroString<'a>, crate::scanner::QuoteStyle),
    /// Literal block scalar (|)
    BlockScalarLiteral(ZeroString<'a>),
    /// Folded block scalar (>)
    BlockScalarFolded(ZeroString<'a>),
    /// Anchor definition (&name)
    Anchor(ZeroString<'a>),
    /// Alias reference (*name)
    Alias(ZeroString<'a>),
    /// Tag (!tag)
    Tag(ZeroString<'a>),
    /// Comment (# comment)
    Comment(ZeroString<'a>),
}
122
/// Zero-copy token with position information.
///
/// Pairs a [`ZeroTokenType`] (whose text payload, if any, may borrow from the
/// input for `'a`) with the positions where the token starts and ends.
#[derive(Debug, Clone, PartialEq)]
pub struct ZeroToken<'a> {
    /// The type of token, including any text it carries
    pub token_type: ZeroTokenType<'a>,
    /// Starting position in the input
    pub start_position: Position,
    /// Ending position in the input
    pub end_position: Position,
}
133
134impl<'a> ZeroToken<'a> {
135    /// Create a new zero-copy token
136    pub fn new(
137        token_type: ZeroTokenType<'a>,
138        start_position: Position,
139        end_position: Position,
140    ) -> Self {
141        Self {
142            token_type,
143            start_position,
144            end_position,
145        }
146    }
147
148    /// Create a simple token without data
149    pub fn simple(token_type: ZeroTokenType<'a>, position: Position) -> Self {
150        Self::new(token_type, position, position)
151    }
152
153    /// Convert to owned token (for compatibility)
154    pub fn into_owned(self) -> crate::scanner::Token {
155        use crate::scanner::{Token, TokenType};
156
157        let token_type = match self.token_type {
158            ZeroTokenType::StreamStart => TokenType::StreamStart,
159            ZeroTokenType::StreamEnd => TokenType::StreamEnd,
160            ZeroTokenType::DocumentStart => TokenType::DocumentStart,
161            ZeroTokenType::DocumentEnd => TokenType::DocumentEnd,
162            ZeroTokenType::BlockSequenceStart => TokenType::BlockSequenceStart,
163            ZeroTokenType::BlockMappingStart => TokenType::BlockMappingStart,
164            ZeroTokenType::BlockEnd => TokenType::BlockEnd,
165            ZeroTokenType::FlowSequenceStart => TokenType::FlowSequenceStart,
166            ZeroTokenType::FlowSequenceEnd => TokenType::FlowSequenceEnd,
167            ZeroTokenType::FlowMappingStart => TokenType::FlowMappingStart,
168            ZeroTokenType::FlowMappingEnd => TokenType::FlowMappingEnd,
169            ZeroTokenType::BlockEntry => TokenType::BlockEntry,
170            ZeroTokenType::FlowEntry => TokenType::FlowEntry,
171            ZeroTokenType::Key => TokenType::Key,
172            ZeroTokenType::Value => TokenType::Value,
173            ZeroTokenType::Scalar(s, style) => TokenType::Scalar(s.into_owned(), style),
174            ZeroTokenType::BlockScalarLiteral(s) => TokenType::BlockScalarLiteral(s.into_owned()),
175            ZeroTokenType::BlockScalarFolded(s) => TokenType::BlockScalarFolded(s.into_owned()),
176            ZeroTokenType::Anchor(s) => TokenType::Anchor(s.into_owned()),
177            ZeroTokenType::Alias(s) => TokenType::Alias(s.into_owned()),
178            ZeroTokenType::Tag(s) => TokenType::Tag(s.into_owned()),
179            ZeroTokenType::Comment(s) => TokenType::Comment(s.into_owned()),
180        };
181
182        Token::new(token_type, self.start_position, self.end_position)
183    }
184}
185
/// Memory pool for token allocation to reduce heap allocations.
///
/// Tokens live in a `Vec` and are handed out in order by `get_token`. `reset`
/// rewinds the cursor without dropping the stored tokens, so later calls reuse
/// the existing slots.
pub struct TokenPool<'a> {
    /// Pool of reusable tokens (every slot created so far)
    tokens: Vec<ZeroToken<'a>>,
    /// Index of the next slot to hand out; equals the number of slots in use
    index: usize,
}
193
194impl<'a> TokenPool<'a> {
195    /// Create a new token pool with initial capacity
196    pub fn with_capacity(capacity: usize) -> Self {
197        Self {
198            tokens: Vec::with_capacity(capacity),
199            index: 0,
200        }
201    }
202
203    /// Get a token from the pool or create a new one
204    pub fn get_token(&mut self) -> &mut ZeroToken<'a> {
205        if self.index >= self.tokens.len() {
206            // Need to allocate a new token
207            self.tokens.push(ZeroToken::simple(
208                ZeroTokenType::StreamStart,
209                Position::start(),
210            ));
211        }
212
213        let token = &mut self.tokens[self.index];
214        self.index += 1;
215        token
216    }
217
218    /// Reset the pool for reuse
219    pub fn reset(&mut self) {
220        self.index = 0;
221    }
222
223    /// Get the number of tokens currently allocated
224    pub fn allocated_count(&self) -> usize {
225        self.tokens.len()
226    }
227
228    /// Get the number of tokens currently in use
229    pub fn used_count(&self) -> usize {
230        self.index
231    }
232}
233
/// Zero-copy string scanner that operates on slices.
///
/// The scanner borrows the input for `'a`, and the slices it returns borrow
/// from that same input.
pub struct ZeroScanner<'a> {
    /// Reference to the input string
    input: &'a str,
    /// Current position in the input (its `index` is used as a byte offset when slicing)
    pub position: Position,
    /// Current character index (an index into `char_indices`, not a byte offset)
    char_index: usize,
    /// Cached `(byte offset, char)` pairs of the input, collected once in `new`
    char_indices: Vec<(usize, char)>,
    /// Token pool for allocation optimization
    token_pool: TokenPool<'a>,
}
247
248impl<'a> ZeroScanner<'a> {
249    /// Create a new zero-copy scanner
250    pub fn new(input: &'a str) -> Self {
251        let char_indices: Vec<(usize, char)> = input.char_indices().collect();
252
253        Self {
254            input,
255            position: Position::start(),
256            char_index: 0,
257            char_indices,
258            token_pool: TokenPool::with_capacity(128), // Start with reasonable capacity
259        }
260    }
261
262    /// Get the current character
263    pub fn current_char(&self) -> Option<char> {
264        self.char_indices.get(self.char_index).map(|(_, ch)| *ch)
265    }
266
267    /// Advance to the next character
268    pub fn advance(&mut self) -> Option<char> {
269        if let Some((_byte_index, ch)) = self.char_indices.get(self.char_index) {
270            self.position = self.position.advance(*ch);
271            self.char_index += 1;
272            self.char_indices.get(self.char_index).map(|(_, ch)| *ch)
273        } else {
274            None
275        }
276    }
277
278    /// Peek at a character at the given offset
279    pub fn peek_char(&self, offset: isize) -> Option<char> {
280        if offset >= 0 {
281            let index = self.char_index + offset as usize;
282            self.char_indices.get(index).map(|(_, ch)| *ch)
283        } else {
284            let offset_abs = (-offset) as usize;
285            if self.char_index >= offset_abs {
286                let index = self.char_index - offset_abs;
287                self.char_indices.get(index).map(|(_, ch)| *ch)
288            } else {
289                None
290            }
291        }
292    }
293
294    /// Get a slice of the input from start position to current position
295    pub fn slice_from(&self, start_position: Position) -> Result<&'a str> {
296        let start_byte = start_position.index;
297        let end_byte = self.position.index;
298
299        if start_byte <= end_byte && end_byte <= self.input.len() {
300            Ok(&self.input[start_byte..end_byte])
301        } else {
302            Err(crate::Error::parse(
303                self.position,
304                "Invalid slice bounds".to_string(),
305            ))
306        }
307    }
308
309    /// Get a slice between two positions
310    pub fn slice_between(&self, start: Position, end: Position) -> Result<&'a str> {
311        let start_byte = start.index;
312        let end_byte = end.index;
313
314        if start_byte <= end_byte && end_byte <= self.input.len() {
315            Ok(&self.input[start_byte..end_byte])
316        } else {
317            Err(crate::Error::parse(
318                self.position,
319                "Invalid slice bounds".to_string(),
320            ))
321        }
322    }
323
324    /// Reset the scanner to the beginning
325    pub fn reset(&mut self) {
326        self.position = Position::start();
327        self.char_index = 0;
328        self.token_pool.reset();
329    }
330
331    /// Get scanner statistics for performance monitoring
332    pub fn stats(&self) -> ScannerStats {
333        ScannerStats {
334            input_length: self.input.len(),
335            chars_processed: self.char_index,
336            tokens_allocated: self.token_pool.allocated_count(),
337            tokens_used: self.token_pool.used_count(),
338            position: self.position,
339        }
340    }
341
342    /// Scan a plain scalar using zero-copy slicing
343    pub fn scan_plain_scalar_zero_copy(&mut self) -> Result<ZeroToken<'a>> {
344        let start_pos = self.position;
345
346        // Find the end of the scalar without allocating
347        while let Some(ch) = self.current_char() {
348            // Stop at structural characters (same logic as regular scanner)
349            match ch {
350                '\n' | '\r' => break,
351                ':' if self.peek_char(1).map_or(true, |c| c.is_whitespace()) => break,
352                '#' if self.char_index == 0
353                    || self.peek_char(-1).map_or(false, |c| c.is_whitespace()) =>
354                {
355                    break;
356                }
357                ',' | '[' | ']' | '{' | '}' => break,
358                _ => {
359                    self.advance();
360                }
361            }
362        }
363
364        // Get the slice without allocation
365        let slice = self.slice_from(start_pos)?;
366        let trimmed_slice = slice.trim_end();
367
368        // Use borrowed string if possible
369        let zero_string = if trimmed_slice.len() == slice.len() {
370            // No trimming needed, can use borrowed slice directly
371            ZeroString::borrowed(trimmed_slice)
372        } else {
373            // Need to allocate for trimmed version
374            ZeroString::owned(trimmed_slice.to_string())
375        };
376
377        Ok(ZeroToken::new(
378            ZeroTokenType::Scalar(zero_string, crate::scanner::QuoteStyle::Plain),
379            start_pos,
380            self.position,
381        ))
382    }
383
384    /// Scan a simple identifier using zero-copy slicing (for anchors/aliases)
385    pub fn scan_identifier_zero_copy(&mut self) -> Result<ZeroString<'a>> {
386        let start_pos = self.position;
387
388        // Scan identifier characters
389        while let Some(ch) = self.current_char() {
390            if ch.is_alphanumeric() || ch == '_' || ch == '-' {
391                self.advance();
392            } else {
393                break;
394            }
395        }
396
397        let slice = self.slice_from(start_pos)?;
398        Ok(ZeroString::borrowed(slice))
399    }
400
401    /// Skip whitespace efficiently
402    pub fn skip_whitespace(&mut self) {
403        while let Some(ch) = self.current_char() {
404            if ch == ' ' || ch == '\t' {
405                self.advance();
406            } else {
407                break;
408            }
409        }
410    }
411}
412
/// Statistics about scanner performance.
///
/// A snapshot produced by `ZeroScanner::stats`; it does not update as the
/// scanner moves on.
#[derive(Debug, Clone)]
pub struct ScannerStats {
    /// Total input length in bytes
    pub input_length: usize,
    /// Number of characters processed (the scanner's character index)
    pub chars_processed: usize,
    /// Number of token slots the pool has created
    pub tokens_allocated: usize,
    /// Number of token slots currently handed out by the pool
    pub tokens_used: usize,
    /// Scanner position at the time the snapshot was taken
    pub position: Position,
}
427
#[cfg(test)]
mod tests {
    use super::*;

    /// A borrowed `ZeroString` reports itself as borrowed and exposes its text.
    #[test]
    fn test_zero_string_borrowed() {
        let text = "hello world";
        let borrowed = ZeroString::borrowed(text);

        assert!(borrowed.is_borrowed());
        assert_eq!(borrowed.as_str(), "hello world");
        assert_eq!(borrowed.len(), 11);
        assert!(!borrowed.is_empty());
    }

    /// An owned `ZeroString` is not borrowed but exposes the same text.
    #[test]
    fn test_zero_string_owned() {
        let owned = ZeroString::owned(String::from("hello world"));

        assert!(!owned.is_borrowed());
        assert_eq!(owned.as_str(), "hello world");
        assert_eq!(owned.len(), 11);
    }

    /// `advance` returns the new current character; peeking works both ways.
    #[test]
    fn test_zero_scanner_basic() {
        let mut scanner = ZeroScanner::new("hello: world");

        assert_eq!(scanner.current_char(), Some('h'));
        assert_eq!(scanner.advance(), Some('e'));
        assert_eq!(scanner.current_char(), Some('e'));

        // Look one character ahead and one behind
        assert_eq!(scanner.peek_char(1), Some('l'));
        assert_eq!(scanner.peek_char(-1), Some('h'));
    }

    /// Slicing from a saved position yields exactly the consumed text.
    #[test]
    fn test_zero_scanner_slicing() {
        let mut scanner = ZeroScanner::new("hello: world");
        let start = scanner.position;

        // Consume the five characters of "hello"
        (0..5).for_each(|_| {
            scanner.advance();
        });

        assert_eq!(scanner.slice_from(start).unwrap(), "hello");
    }

    /// The pool grows on demand and keeps its tokens across `reset`.
    #[test]
    fn test_token_pool() {
        let mut pool = TokenPool::with_capacity(2);
        assert_eq!(pool.allocated_count(), 0);
        assert_eq!(pool.used_count(), 0);

        let _ = pool.get_token();
        assert_eq!(pool.allocated_count(), 1);
        assert_eq!(pool.used_count(), 1);

        let _ = pool.get_token();
        assert_eq!(pool.allocated_count(), 2);
        assert_eq!(pool.used_count(), 2);

        pool.reset();
        // The slots survive the reset...
        assert_eq!(pool.allocated_count(), 2);
        // ...but none of them is in use any more
        assert_eq!(pool.used_count(), 0);
    }

    /// A scalar that needs no trimming is returned as a borrowed slice.
    #[test]
    fn test_zero_copy_scalar_scanning() {
        let mut scanner = ZeroScanner::new("hello world: test");
        let token = scanner.scan_plain_scalar_zero_copy().unwrap();

        match token.token_type {
            ZeroTokenType::Scalar(value, _) => {
                assert_eq!(value.as_str(), "hello world");
                // Should be zero-copy
                assert!(value.is_borrowed());
            }
            _ => panic!("Expected scalar token"),
        }
    }

    /// Identifiers are returned as borrowed slices and stop at the first non-identifier character.
    #[test]
    fn test_zero_copy_identifier_scanning() {
        let mut scanner = ZeroScanner::new("my_anchor_123 ");
        let identifier = scanner.scan_identifier_zero_copy().unwrap();

        assert_eq!(identifier.as_str(), "my_anchor_123");
        // Should be zero-copy
        assert!(identifier.is_borrowed());
    }

    /// A scalar with trailing whitespace is trimmed and returned as an owned string.
    #[test]
    fn test_zero_copy_trimming() {
        let mut scanner = ZeroScanner::new("hello   \n");
        let token = scanner.scan_plain_scalar_zero_copy().unwrap();

        match token.token_type {
            ZeroTokenType::Scalar(value, _) => {
                assert_eq!(value.as_str(), "hello");
                // Should be owned because trimming was needed
                assert!(!value.is_borrowed());
            }
            _ => panic!("Expected scalar token"),
        }
    }
}