Skip to main content

ass_core/tokenizer/
ass_tokenizer.rs

//! Incremental ASS tokenizer definition and lifecycle helpers.
//!
//! Houses the [`AssTokenizer`] struct together with construction, batch
//! tokenization, issue access, and position queries. The per-token stepping
//! logic lives in the sibling `next_token` module.
6
7use super::{IssueCollector, Token, TokenContext, TokenIssue, TokenScanner};
8use crate::Result;
9use alloc::vec::Vec;
10
11#[cfg(not(feature = "std"))]
12extern crate alloc;
13
14/// Incremental tokenizer for ASS scripts with zero-copy design
15///
16/// Maintains lexical state for streaming tokenization. Uses `&'a str` spans
17/// to avoid allocations, with optional SIMD acceleration for hot paths.
18#[derive(Debug, Clone)]
19pub struct AssTokenizer<'a> {
20    /// Source text being tokenized
21    pub(super) source: &'a str,
22    /// Token scanner for character processing
23    pub(super) scanner: TokenScanner<'a>,
24    /// Current tokenization context
25    pub(super) context: TokenContext,
26    /// Issue collector for error reporting
27    issues: IssueCollector<'a>,
28}
29
30impl<'a> AssTokenizer<'a> {
31    /// Create new tokenizer for source text
32    ///
33    /// Handles BOM detection and UTF-8 validation upfront.
34    #[must_use]
35    pub fn new(source: &'a str) -> Self {
36        let initial_position = if source.starts_with('\u{FEFF}') {
37            3 // BOM is 3 bytes
38        } else {
39            0
40        };
41
42        Self {
43            source,
44            scanner: TokenScanner::new(source, initial_position, 1, 1),
45            context: TokenContext::Document,
46            issues: IssueCollector::new(),
47        }
48    }
49
50    /// Get all tokens as vector for batch processing
51    ///
52    /// # Errors
53    ///
54    /// Returns an error if tokenization fails for any token in the input.
55    pub fn tokenize_all(&mut self) -> Result<Vec<Token<'a>>> {
56        let mut tokens = Vec::new();
57        let mut iteration_count = 0;
58        while let Some(token) = self.next_token()? {
59            tokens.push(token);
60            iteration_count += 1;
61            if iteration_count > 50 {
62                return Err(crate::utils::CoreError::internal(
63                    "Too many tokenizer iterations",
64                ));
65            }
66        }
67
68        Ok(tokens)
69    }
70
71    /// Get accumulated tokenization issues
72    #[must_use]
73    pub fn issues(&self) -> &[TokenIssue<'a>] {
74        self.issues.issues()
75    }
76
77    /// Get current position in source
78    #[must_use]
79    pub const fn position(&self) -> usize {
80        self.scanner.navigator().position()
81    }
82
83    /// Get current line number (1-based)
84    #[must_use]
85    pub const fn line(&self) -> usize {
86        self.scanner.navigator().line()
87    }
88
89    /// Get current column number (1-based)
90    #[must_use]
91    pub const fn column(&self) -> usize {
92        self.scanner.navigator().column()
93    }
94
95    /// Reset tokenizer to beginning of source
96    pub fn reset(&mut self) {
97        let initial_position = if self.source.starts_with('\u{FEFF}') {
98            3
99        } else {
100            0
101        };
102        self.scanner = TokenScanner::new(self.source, initial_position, 1, 1);
103        self.context = TokenContext::Document;
104        self.issues.clear();
105    }
106}