Skip to main content

ass_core/tokenizer/scanner/
text_scanner.rs

1//! Text and field-value scanning routines for the token scanner.
2//!
3//! Implements general text scanning, field-value scanning, and hex-value
4//! detection, including the optional SIMD-accelerated fast paths.
5
6use super::token_scanner::TokenScanner;
7use crate::tokenizer::{state::TokenContext, tokens::TokenType};
8use crate::Result;
9
10#[cfg(feature = "simd")]
11use crate::tokenizer::simd;
12
13impl TokenScanner<'_> {
14    /// Scan general text content
15    ///
16    /// # Errors
17    ///
18    /// Returns an error if character navigation fails.
19    pub fn scan_text(&mut self, context: TokenContext) -> Result<TokenType> {
20        let start = self.navigator.position();
21
22        // Use SIMD delimiter scanning when available and context doesn't affect delimiters
23        #[cfg(feature = "simd")]
24        {
25            // Only use SIMD when context doesn't change delimiter behavior
26            let use_simd = !matches!(context, TokenContext::FieldValue);
27
28            if use_simd {
29                if let Some(delimiter_pos) = self.scan_delimiters_simd(start) {
30                    self.navigator.position = delimiter_pos;
31                } else {
32                    self.navigator.position = self.source.len();
33                }
34                self.navigator.chars = self.source[self.navigator.position..].chars();
35                self.navigator.peek_char = None;
36            }
37        }
38
39        // Fallback to scalar scanning (or when SIMD can't be used due to context)
40        #[cfg(not(feature = "simd"))]
41        let use_scalar = true;
42        #[cfg(feature = "simd")]
43        let use_scalar = matches!(context, TokenContext::FieldValue);
44
45        if use_scalar {
46            while !self.navigator.is_at_end() {
47                let ch = self.navigator.peek_char()?;
48
49                // Check for delimiters based on context
50                let is_delimiter = match context {
51                    TokenContext::FieldValue => {
52                        // In field values, don't treat colon as delimiter (for time formats)
53                        matches!(ch, ',' | '{' | '}' | '[' | ']' | '\n' | '\r')
54                    }
55                    _ => {
56                        // In other contexts, treat colon as delimiter
57                        matches!(ch, ',' | ':' | '{' | '}' | '[' | ']' | '\n' | '\r')
58                            || (ch == ';' && context == TokenContext::Document)
59                    }
60                };
61
62                if is_delimiter {
63                    break;
64                }
65
66                self.navigator.advance_char()?;
67            }
68        }
69
70        let span = &self.source[start..self.navigator.position()];
71
72        if context == TokenContext::SectionHeader {
73            Ok(TokenType::SectionName)
74        } else if Self::is_hex_value(span) {
75            Ok(TokenType::HexValue)
76        } else if !span.is_empty()
77            && span
78                .chars()
79                .all(|c| c.is_ascii_digit() || c == '.' || c == '-')
80        {
81            Ok(TokenType::Number)
82        } else if !span.is_empty() && span.chars().all(char::is_whitespace) {
83            Ok(TokenType::Whitespace)
84        } else {
85            Ok(TokenType::Text)
86        }
87    }
88
89    /// Check if a span represents a hex value
90    pub(super) fn is_hex_value(span: &str) -> bool {
91        // Check &H format first (standard ASS hex format)
92        if let Some(after_prefix) = span.strip_prefix("&H") {
93            let hex_part = after_prefix
94                .strip_suffix('&')
95                .map_or(after_prefix, |stripped| stripped);
96
97            if !hex_part.is_empty()
98                && hex_part.len() % 2 == 0
99                && hex_part.len() <= 8
100                && hex_part.chars().all(|c| c.is_ascii_hexdigit())
101            {
102                #[cfg(feature = "simd")]
103                {
104                    return TokenScanner::parse_hex_simd(hex_part).is_some();
105                }
106                #[cfg(not(feature = "simd"))]
107                {
108                    return true;
109                }
110            }
111        }
112
113        // Raw hex without &H prefix is very rare in ASS files - don't detect it
114        // to avoid conflicts with numbers and text
115
116        false
117    }
118
119    /// Fast delimiter scanning using SIMD when available
120    #[cfg(feature = "simd")]
121    fn scan_delimiters_simd(&self, start: usize) -> Option<usize> {
122        simd::scan_delimiters(&self.source[start..]).map(|offset| start + offset)
123    }
124
    /// Parse a hex digit string through the SIMD helper.
    ///
    /// Thin forwarder to `simd::parse_hex_u32`; the result is returned
    /// unchanged. Only compiled when the `simd` feature is enabled.
    #[cfg(feature = "simd")]
    fn parse_hex_simd(hex_str: &str) -> Option<u32> {
        simd::parse_hex_u32(hex_str)
    }
130
131    /// Scan field value content in field value context
132    ///
133    /// In field value context, colons are not delimiters (for time formats)
134    /// and we consume until comma, newline, or end of input.
135    ///
136    /// # Errors
137    ///
138    /// Returns an error if character navigation fails.
139    pub fn scan_field_value(&mut self) -> Result<TokenType> {
140        let start = self.navigator.position();
141
142        while !self.navigator.is_at_end() {
143            let ch = self.navigator.peek_char()?;
144
145            // Stop at delimiters that end field values
146            if ch == ',' || ch == '\n' || ch == '\r' || ch == '{' || ch == '[' {
147                break;
148            }
149
150            self.navigator.advance_char()?;
151        }
152
153        let span = &self.source[start..self.navigator.position()];
154
155        if !span.is_empty()
156            && span
157                .chars()
158                .all(|c| c.is_ascii_digit() || c == '.' || c == '-' || c == ':')
159        {
160            Ok(TokenType::Number)
161        } else if !span.is_empty() && span.chars().all(char::is_whitespace) {
162            Ok(TokenType::Whitespace)
163        } else {
164            Ok(TokenType::Text)
165        }
166    }
167}