Skip to main content

rez_next_version/
parser.rs

//! Version parsing utilities built on a character-level state machine (tokens are returned as owned values)
2
3use super::Version;
4use ahash::AHashMap;
5use once_cell::sync::Lazy;
6use rez_next_common::RezCoreError;
7use smallvec::SmallVec;
8use std::sync::RwLock;
9
10/// String interning pool for reducing memory allocations
11static STRING_INTERN_POOL: Lazy<RwLock<AHashMap<String, &'static str>>> =
12    Lazy::new(|| RwLock::new(AHashMap::new()));
13
/// A single lexical element of a version string.
///
/// `StateMachineParser::parse_tokens` emits only `Numeric` and `Alphanumeric`
/// values; it reports separators as plain `char`s in a separate list, so the
/// `Separator` variant is never constructed by the parser in this file.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` because every payload
/// (`u64`, `String`, `char`) supports them, which lets tokens be used as keys in
/// hashed collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// A token consisting solely of ASCII digits whose value fits in a `u64`.
    Numeric(u64),
    /// Any other token: letters, digits and inner underscores, or a digit-only
    /// token too large for a `u64`.
    Alphanumeric(String),
    /// A separator character found between two tokens.
    Separator(char),
}
21
/// Position of the tokenizer while it walks a version string.
#[derive(Clone, Copy, Debug, PartialEq)]
enum ParseState {
    /// No character has been consumed yet.
    Start,
    /// At least one character of the current token has been consumed.
    InToken,
    /// A separator was just consumed; a token character must come next.
    InSeparator,
}
29
/// Configurable tokenizer for version strings, driven by a small state machine.
///
/// The struct only holds configuration, so it derives `Debug` (for diagnostics)
/// and `Clone` (so a configured parser can be duplicated cheaply).
#[derive(Debug, Clone)]
pub struct StateMachineParser {
    /// Whether alphanumeric tokens are registered in the global intern pool.
    use_interning: bool,
    /// Maximum total number of tokens accepted in one version string.
    max_tokens: usize,
    /// Maximum number of purely numeric tokens accepted in one version string.
    max_numeric_tokens: usize,
}
39
40impl StateMachineParser {
41    /// Create a new high-performance parser
42    pub fn new() -> Self {
43        Self {
44            use_interning: true,
45            max_tokens: 10,
46            max_numeric_tokens: 5,
47        }
48    }
49
50    /// Create parser with custom configuration
51    pub fn with_config(use_interning: bool, max_tokens: usize, max_numeric_tokens: usize) -> Self {
52        Self {
53            use_interning,
54            max_tokens,
55            max_numeric_tokens,
56        }
57    }
58
59    /// Intern a string to reduce memory allocations
60    fn intern_string(&self, s: String) -> String {
61        if !self.use_interning || s.len() > 64 {
62            return s;
63        }
64
65        // Try to get from pool first
66        {
67            let pool = STRING_INTERN_POOL.read().unwrap();
68            if let Some(&interned) = pool.get(&s) {
69                return interned.to_string();
70            }
71        }
72
73        // Add to pool if not found
74        {
75            let mut pool = STRING_INTERN_POOL.write().unwrap();
76            // Double-check after acquiring write lock
77            if let Some(&interned) = pool.get(&s) {
78                return interned.to_string();
79            }
80
81            // Limit pool size to prevent memory leaks
82            if pool.len() < 10000 {
83                let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
84                pool.insert(s.clone(), leaked);
85                return leaked.to_string();
86            }
87        }
88
89        s
90    }
91
92    /// Fast character classification using lookup table
93    #[inline(always)]
94    fn is_valid_separator(c: char) -> bool {
95        matches!(c, '.' | '-' | '_' | '+')
96    }
97
98    /// Fast alphanumeric check with underscore support
99    #[inline(always)]
100    fn is_token_char(c: char) -> bool {
101        c.is_ascii_alphanumeric() || c == '_'
102    }
103
104    /// Parse version string using zero-copy state machine
105    #[allow(clippy::type_complexity)]
106    pub fn parse_tokens(
107        &self,
108        input: &str,
109    ) -> Result<(SmallVec<[TokenType; 8]>, SmallVec<[char; 7]>), RezCoreError> {
110        if input.is_empty() {
111            return Ok((SmallVec::new(), SmallVec::new()));
112        }
113
114        let mut tokens = SmallVec::new();
115        let mut separators = SmallVec::new();
116        let mut state = ParseState::Start;
117        let mut current_token = String::new();
118        let mut numeric_count = 0;
119
120        let chars: SmallVec<[char; 64]> = input.chars().collect();
121        let mut i = 0;
122
123        while i < chars.len() {
124            let c = chars[i];
125
126            match state {
127                ParseState::Start => {
128                    if Self::is_token_char(c) {
129                        current_token.push(c);
130                        state = ParseState::InToken;
131                    } else if Self::is_valid_separator(c) {
132                        return Err(RezCoreError::VersionParse(format!(
133                            "Version cannot start with separator '{}'",
134                            c
135                        )));
136                    } else {
137                        return Err(RezCoreError::VersionParse(format!(
138                            "Invalid character '{}' at start of version",
139                            c
140                        )));
141                    }
142                }
143
144                ParseState::InToken => {
145                    if Self::is_token_char(c) {
146                        current_token.push(c);
147                    } else if Self::is_valid_separator(c) {
148                        // Finalize current token
149                        self.finalize_token(&mut current_token, &mut tokens, &mut numeric_count)?;
150                        separators.push(c);
151                        state = ParseState::InSeparator;
152                    } else {
153                        return Err(RezCoreError::VersionParse(format!(
154                            "Invalid character '{}' in token",
155                            c
156                        )));
157                    }
158                }
159
160                ParseState::InSeparator => {
161                    if Self::is_token_char(c) {
162                        current_token.push(c);
163                        state = ParseState::InToken;
164                    } else {
165                        return Err(RezCoreError::VersionParse(format!(
166                            "Expected token character after separator, found '{}'",
167                            c
168                        )));
169                    }
170                }
171            }
172
173            i += 1;
174        }
175
176        // Finalize last token if we're in a token state
177        if state == ParseState::InToken && !current_token.is_empty() {
178            self.finalize_token(&mut current_token, &mut tokens, &mut numeric_count)?;
179        } else if state == ParseState::InSeparator {
180            return Err(RezCoreError::VersionParse(
181                "Version cannot end with separator".to_string(),
182            ));
183        }
184
185        // Validate token counts
186        if tokens.len() > self.max_tokens {
187            return Err(RezCoreError::VersionParse(format!(
188                "Too many tokens: {} (max: {})",
189                tokens.len(),
190                self.max_tokens
191            )));
192        }
193
194        if numeric_count > self.max_numeric_tokens {
195            return Err(RezCoreError::VersionParse(format!(
196                "Too many numeric tokens: {} (max: {})",
197                numeric_count, self.max_numeric_tokens
198            )));
199        }
200
201        Ok((tokens, separators))
202    }
203
204    /// Finalize a token and add it to the tokens list
205    fn finalize_token(
206        &self,
207        current_token: &mut String,
208        tokens: &mut SmallVec<[TokenType; 8]>,
209        numeric_count: &mut usize,
210    ) -> Result<(), RezCoreError> {
211        if current_token.is_empty() {
212            return Err(RezCoreError::VersionParse("Empty token found".to_string()));
213        }
214
215        // Validate token format
216        if current_token.starts_with('_') || current_token.ends_with('_') {
217            return Err(RezCoreError::VersionParse(format!(
218                "Invalid token format: '{}'",
219                current_token
220            )));
221        }
222
223        // Check for invalid patterns
224        if current_token == "not" || current_token == "version" {
225            return Err(RezCoreError::VersionParse(format!(
226                "Invalid version token: '{}'",
227                current_token
228            )));
229        }
230
231        // Reject overly long alphabetic tokens
232        if current_token.chars().all(|c| c.is_alphabetic()) && current_token.len() > 10 {
233            return Err(RezCoreError::VersionParse(format!(
234                "Invalid version token: '{}'",
235                current_token
236            )));
237        }
238
239        // Try to parse as numeric first (fast path)
240        if current_token.chars().all(|c| c.is_ascii_digit()) {
241            if let Ok(num) = current_token.parse::<u64>() {
242                tokens.push(TokenType::Numeric(num));
243                *numeric_count += 1;
244            } else {
245                // Number too large, treat as alphanumeric
246                let interned = self.intern_string(current_token.clone());
247                tokens.push(TokenType::Alphanumeric(interned));
248            }
249        } else {
250            // Alphanumeric token
251            let interned = self.intern_string(current_token.clone());
252            tokens.push(TokenType::Alphanumeric(interned));
253        }
254
255        current_token.clear();
256        Ok(())
257    }
258}
259
260/// Legacy VersionParser for backward compatibility
261pub struct VersionParser {
262    _inner: StateMachineParser,
263}
264
265impl VersionParser {
266    /// Create a new parser
267    pub fn new() -> Self {
268        Self {
269            _inner: StateMachineParser::new(),
270        }
271    }
272
273    /// Parse a complete version string
274    pub fn parse_version(&self, input: &str) -> Result<Version, RezCoreError> {
275        // Use the new state machine parser for better performance
276        // but fall back to the original implementation for now
277        Version::parse(input)
278    }
279}
280
281impl Default for VersionParser {
282    fn default() -> Self {
283        Self::new()
284    }
285}
286
287impl Default for StateMachineParser {
288    fn default() -> Self {
289        Self::new()
290    }
291}
292
#[cfg(test)]
mod tests {
    use super::*;

    /// Both parser types can be constructed without panicking.
    #[test]
    fn test_parser_creation() {
        let _parser = VersionParser::new();
        let _state_machine_parser = StateMachineParser::new();
    }

    /// Empty input and a plain dotted version.
    #[test]
    fn test_state_machine_parser_basic() {
        let parser = StateMachineParser::new();

        let (tokens, separators) = parser.parse_tokens("").unwrap();
        assert!(tokens.is_empty());
        assert!(separators.is_empty());

        let (tokens, separators) = parser.parse_tokens("1.2.3").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(separators.len(), 2);

        assert_eq!(tokens[0], TokenType::Numeric(1));
        assert_eq!(tokens[1], TokenType::Numeric(2));
        assert_eq!(tokens[2], TokenType::Numeric(3));

        assert_eq!(separators[0], '.');
        assert_eq!(separators[1], '.');
    }

    /// Numeric and alphanumeric tokens mixed in one version.
    #[test]
    fn test_state_machine_parser_alphanumeric() {
        let parser = StateMachineParser::new();

        let (tokens, separators) = parser.parse_tokens("1.2.3-alpha1").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(separators.len(), 3);

        assert_eq!(tokens[0], TokenType::Numeric(1));
        assert_eq!(tokens[3], TokenType::Alphanumeric("alpha1".to_string()));
        assert_eq!(separators[2], '-');
    }

    /// Malformed inputs are rejected.
    #[test]
    fn test_state_machine_parser_errors() {
        let parser = StateMachineParser::new();

        // Leading separator
        assert!(parser.parse_tokens(".1.2.3").is_err());

        // Trailing separator
        assert!(parser.parse_tokens("1.2.3.").is_err());

        // Two separators in a row
        assert!(parser.parse_tokens("1..2").is_err());

        // Character that is neither token nor separator
        assert!(parser.parse_tokens("1.2.3@").is_err());

        // Tokens may not start or end with an underscore
        assert!(parser.parse_tokens("_invalid").is_err());
        assert!(parser.parse_tokens("invalid_").is_err());

        // Reserved words
        assert!(parser.parse_tokens("not").is_err());
        assert!(parser.parse_tokens("version").is_err());
    }

    /// Parsing the same input twice with interning enabled yields equal tokens.
    #[test]
    fn test_string_interning() {
        let parser = StateMachineParser::with_config(true, 10, 5);

        let (tokens1, _) = parser.parse_tokens("1.0.0-alpha").unwrap();
        let (tokens2, _) = parser.parse_tokens("1.0.0-alpha").unwrap();

        // Fail loudly if the fourth token is not alphanumeric instead of
        // silently skipping the comparison.
        match (&tokens1[3], &tokens2[3]) {
            (TokenType::Alphanumeric(s1), TokenType::Alphanumeric(s2)) => {
                assert_eq!(s1, "alpha");
                assert_eq!(s1, s2);
            }
            other => panic!("Expected alphanumeric tokens, got {:?}", other),
        }
    }

    /// Interning is optional and does not change the parse result.
    #[test]
    fn test_interning_disabled_gives_same_tokens() {
        let interning = StateMachineParser::with_config(true, 10, 5);
        let plain = StateMachineParser::with_config(false, 10, 5);

        let (with_pool, _) = interning.parse_tokens("2.1-beta").unwrap();
        let (without_pool, _) = plain.parse_tokens("2.1-beta").unwrap();
        assert_eq!(with_pool, without_pool);
    }

    /// Token-count limits are enforced.
    #[test]
    fn test_performance_limits() {
        let parser = StateMachineParser::new();

        // 15 tokens exceeds the default limit of 10 tokens
        let too_many_tokens = (0..15).map(|i| i.to_string()).collect::<Vec<_>>().join(".");
        assert!(parser.parse_tokens(&too_many_tokens).is_err());

        // 10 tokens is within the total limit but exceeds 5 numeric tokens
        let too_many_numeric = (0..10).map(|i| i.to_string()).collect::<Vec<_>>().join(".");
        assert!(parser.parse_tokens(&too_many_numeric).is_err());
    }

    /// An underscore inside a token is part of the token, not a separator.
    #[test]
    fn test_underscore_stays_inside_token() {
        let parser = StateMachineParser::new();

        let (tokens, separators) = parser.parse_tokens("1.foo_bar").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1], TokenType::Alphanumeric("foo_bar".to_string()));
        assert_eq!(separators.len(), 1);
        assert_eq!(separators[0], '.');
    }

    /// Digit-only tokens too large for `u64` are kept as alphanumeric text.
    #[test]
    fn test_numeric_overflow_falls_back_to_alphanumeric() {
        let parser = StateMachineParser::new();

        let (tokens, _) = parser.parse_tokens("99999999999999999999").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(
            tokens[0],
            TokenType::Alphanumeric("99999999999999999999".to_string())
        );
    }

    /// Character classification helpers.
    #[test]
    fn test_character_classification() {
        assert!(StateMachineParser::is_valid_separator('.'));
        assert!(StateMachineParser::is_valid_separator('-'));
        assert!(StateMachineParser::is_valid_separator('_'));
        assert!(StateMachineParser::is_valid_separator('+'));
        assert!(!StateMachineParser::is_valid_separator('@'));

        assert!(StateMachineParser::is_token_char('a'));
        assert!(StateMachineParser::is_token_char('1'));
        assert!(StateMachineParser::is_token_char('_'));
        assert!(!StateMachineParser::is_token_char('.'));
    }
}