rez_next_version/
parser.rs1use super::Version;
4use ahash::AHashMap;
5use once_cell::sync::Lazy;
6use rez_next_common::RezCoreError;
7use smallvec::SmallVec;
8use std::sync::RwLock;
9
/// Process-wide pool of interned token strings, created lazily on first use.
///
/// Keys are owned copies of the token text; values are `&'static str` copies of
/// the same text. `StateMachineParser::intern_string` is the only writer in this
/// file: it creates each value with `Box::leak`, so an entry is never freed once
/// inserted, and it stops inserting once the pool holds 10000 entries.
static STRING_INTERN_POOL: Lazy<RwLock<AHashMap<String, &'static str>>> =
    Lazy::new(|| RwLock::new(AHashMap::new()));
13
/// A lexical element of a version string.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` because every payload
/// type supports them, which lets tokens be used as set members or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// A token made up solely of ASCII digits whose value fits in a `u64`.
    Numeric(u64),
    /// Any other token, including all-digit tokens too large for a `u64`.
    Alphanumeric(String),
    /// A separator character. `StateMachineParser::parse_tokens` does not emit
    /// this variant; it reports separators in a separate list.
    Separator(char),
}
21
/// State of the tokenizer while it walks a version string.
///
/// `Eq` is derived alongside `PartialEq` since equality on a field-less enum is
/// total.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParseState {
    /// No character has been consumed yet.
    Start,
    /// The most recent character belonged to a token.
    InToken,
    /// The most recent character was a separator; a token character must follow.
    InSeparator,
}
29
/// Configurable tokenizer for version strings.
///
/// `Debug` and `Clone` are derived so that the parser can be logged and
/// duplicated like any other plain configuration value.
#[derive(Debug, Clone)]
pub struct StateMachineParser {
    /// When `true`, alphanumeric tokens are routed through the global intern pool.
    use_interning: bool,
    /// Maximum number of tokens a version string may contain.
    max_tokens: usize,
    /// Maximum number of numeric tokens a version string may contain.
    max_numeric_tokens: usize,
}
39
40impl StateMachineParser {
41 pub fn new() -> Self {
43 Self {
44 use_interning: true,
45 max_tokens: 10,
46 max_numeric_tokens: 5,
47 }
48 }
49
50 pub fn with_config(use_interning: bool, max_tokens: usize, max_numeric_tokens: usize) -> Self {
52 Self {
53 use_interning,
54 max_tokens,
55 max_numeric_tokens,
56 }
57 }
58
59 fn intern_string(&self, s: String) -> String {
61 if !self.use_interning || s.len() > 64 {
62 return s;
63 }
64
65 {
67 let pool = STRING_INTERN_POOL.read().unwrap();
68 if let Some(&interned) = pool.get(&s) {
69 return interned.to_string();
70 }
71 }
72
73 {
75 let mut pool = STRING_INTERN_POOL.write().unwrap();
76 if let Some(&interned) = pool.get(&s) {
78 return interned.to_string();
79 }
80
81 if pool.len() < 10000 {
83 let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
84 pool.insert(s.clone(), leaked);
85 return leaked.to_string();
86 }
87 }
88
89 s
90 }
91
92 #[inline(always)]
94 fn is_valid_separator(c: char) -> bool {
95 matches!(c, '.' | '-' | '_' | '+')
96 }
97
98 #[inline(always)]
100 fn is_token_char(c: char) -> bool {
101 c.is_ascii_alphanumeric() || c == '_'
102 }
103
104 #[allow(clippy::type_complexity)]
106 pub fn parse_tokens(
107 &self,
108 input: &str,
109 ) -> Result<(SmallVec<[TokenType; 8]>, SmallVec<[char; 7]>), RezCoreError> {
110 if input.is_empty() {
111 return Ok((SmallVec::new(), SmallVec::new()));
112 }
113
114 let mut tokens = SmallVec::new();
115 let mut separators = SmallVec::new();
116 let mut state = ParseState::Start;
117 let mut current_token = String::new();
118 let mut numeric_count = 0;
119
120 let chars: SmallVec<[char; 64]> = input.chars().collect();
121 let mut i = 0;
122
123 while i < chars.len() {
124 let c = chars[i];
125
126 match state {
127 ParseState::Start => {
128 if Self::is_token_char(c) {
129 current_token.push(c);
130 state = ParseState::InToken;
131 } else if Self::is_valid_separator(c) {
132 return Err(RezCoreError::VersionParse(format!(
133 "Version cannot start with separator '{}'",
134 c
135 )));
136 } else {
137 return Err(RezCoreError::VersionParse(format!(
138 "Invalid character '{}' at start of version",
139 c
140 )));
141 }
142 }
143
144 ParseState::InToken => {
145 if Self::is_token_char(c) {
146 current_token.push(c);
147 } else if Self::is_valid_separator(c) {
148 self.finalize_token(&mut current_token, &mut tokens, &mut numeric_count)?;
150 separators.push(c);
151 state = ParseState::InSeparator;
152 } else {
153 return Err(RezCoreError::VersionParse(format!(
154 "Invalid character '{}' in token",
155 c
156 )));
157 }
158 }
159
160 ParseState::InSeparator => {
161 if Self::is_token_char(c) {
162 current_token.push(c);
163 state = ParseState::InToken;
164 } else {
165 return Err(RezCoreError::VersionParse(format!(
166 "Expected token character after separator, found '{}'",
167 c
168 )));
169 }
170 }
171 }
172
173 i += 1;
174 }
175
176 if state == ParseState::InToken && !current_token.is_empty() {
178 self.finalize_token(&mut current_token, &mut tokens, &mut numeric_count)?;
179 } else if state == ParseState::InSeparator {
180 return Err(RezCoreError::VersionParse(
181 "Version cannot end with separator".to_string(),
182 ));
183 }
184
185 if tokens.len() > self.max_tokens {
187 return Err(RezCoreError::VersionParse(format!(
188 "Too many tokens: {} (max: {})",
189 tokens.len(),
190 self.max_tokens
191 )));
192 }
193
194 if numeric_count > self.max_numeric_tokens {
195 return Err(RezCoreError::VersionParse(format!(
196 "Too many numeric tokens: {} (max: {})",
197 numeric_count, self.max_numeric_tokens
198 )));
199 }
200
201 Ok((tokens, separators))
202 }
203
204 fn finalize_token(
206 &self,
207 current_token: &mut String,
208 tokens: &mut SmallVec<[TokenType; 8]>,
209 numeric_count: &mut usize,
210 ) -> Result<(), RezCoreError> {
211 if current_token.is_empty() {
212 return Err(RezCoreError::VersionParse("Empty token found".to_string()));
213 }
214
215 if current_token.starts_with('_') || current_token.ends_with('_') {
217 return Err(RezCoreError::VersionParse(format!(
218 "Invalid token format: '{}'",
219 current_token
220 )));
221 }
222
223 if current_token == "not" || current_token == "version" {
225 return Err(RezCoreError::VersionParse(format!(
226 "Invalid version token: '{}'",
227 current_token
228 )));
229 }
230
231 if current_token.chars().all(|c| c.is_alphabetic()) && current_token.len() > 10 {
233 return Err(RezCoreError::VersionParse(format!(
234 "Invalid version token: '{}'",
235 current_token
236 )));
237 }
238
239 if current_token.chars().all(|c| c.is_ascii_digit()) {
241 if let Ok(num) = current_token.parse::<u64>() {
242 tokens.push(TokenType::Numeric(num));
243 *numeric_count += 1;
244 } else {
245 let interned = self.intern_string(current_token.clone());
247 tokens.push(TokenType::Alphanumeric(interned));
248 }
249 } else {
250 let interned = self.intern_string(current_token.clone());
252 tokens.push(TokenType::Alphanumeric(interned));
253 }
254
255 current_token.clear();
256 Ok(())
257 }
258}
259
/// Front end for turning version strings into `Version` values.
pub struct VersionParser {
    /// Tokenizer created by `new`; `parse_version` does not consult it and
    /// delegates to `Version::parse` instead.
    _inner: StateMachineParser,
}
264
265impl VersionParser {
266 pub fn new() -> Self {
268 Self {
269 _inner: StateMachineParser::new(),
270 }
271 }
272
273 pub fn parse_version(&self, input: &str) -> Result<Version, RezCoreError> {
275 Version::parse(input)
278 }
279}
280
281impl Default for VersionParser {
282 fn default() -> Self {
283 Self::new()
284 }
285}
286
287impl Default for StateMachineParser {
288 fn default() -> Self {
289 Self::new()
290 }
291}
292
#[cfg(test)]
mod tests {
    use super::*;

    /// Both parser types can be constructed without panicking.
    #[test]
    fn test_parser_creation() {
        let _parser = VersionParser::new();
        let _state_machine_parser = StateMachineParser::new();
    }

    /// Empty input yields nothing; a plain dotted version yields numeric tokens.
    #[test]
    fn test_state_machine_parser_basic() {
        let parser = StateMachineParser::new();

        let (tokens, separators) = parser.parse_tokens("").unwrap();
        assert!(tokens.is_empty());
        assert!(separators.is_empty());

        let (tokens, separators) = parser.parse_tokens("1.2.3").unwrap();
        assert_eq!(
            tokens.to_vec(),
            vec![
                TokenType::Numeric(1),
                TokenType::Numeric(2),
                TokenType::Numeric(3),
            ]
        );
        assert_eq!(separators.to_vec(), vec!['.', '.']);
    }

    /// Mixed tokens are classified individually and separators keep their order.
    #[test]
    fn test_state_machine_parser_alphanumeric() {
        let parser = StateMachineParser::new();

        let (tokens, separators) = parser.parse_tokens("1.2.3-alpha1").unwrap();
        assert_eq!(
            tokens.to_vec(),
            vec![
                TokenType::Numeric(1),
                TokenType::Numeric(2),
                TokenType::Numeric(3),
                TokenType::Alphanumeric("alpha1".to_string()),
            ]
        );
        assert_eq!(separators.to_vec(), vec!['.', '.', '-']);
    }

    /// Malformed inputs are rejected.
    #[test]
    fn test_state_machine_parser_errors() {
        let parser = StateMachineParser::new();

        // Leading separator.
        assert!(parser.parse_tokens(".1.2.3").is_err());

        // Trailing separator.
        assert!(parser.parse_tokens("1.2.3.").is_err());

        // Two separators in a row.
        assert!(parser.parse_tokens("1..2").is_err());

        // Character that is neither a token character nor a separator.
        assert!(parser.parse_tokens("1.2.3@").is_err());

        // Tokens may not start or end with an underscore.
        assert!(parser.parse_tokens("_invalid").is_err());
        assert!(parser.parse_tokens("invalid_").is_err());

        // Reserved words and overly long alphabetic tokens.
        assert!(parser.parse_tokens("not").is_err());
        assert!(parser.parse_tokens("version").is_err());
        assert!(parser.parse_tokens("abcdefghijk").is_err());
    }

    /// Interning never changes the token text that the caller receives.
    #[test]
    fn test_string_interning() {
        let parser = StateMachineParser::with_config(true, 10, 5);

        let (tokens1, _) = parser.parse_tokens("1.0.0-alpha").unwrap();
        let (tokens2, _) = parser.parse_tokens("1.0.0-alpha").unwrap();

        // Assert directly so a wrong token shape fails instead of being skipped.
        let expected = TokenType::Alphanumeric("alpha".to_string());
        assert_eq!(tokens1[3], expected);
        assert_eq!(tokens2[3], expected);

        let plain = StateMachineParser::with_config(false, 10, 5);
        let (tokens3, _) = plain.parse_tokens("1.0.0-alpha").unwrap();
        assert_eq!(tokens1.to_vec(), tokens3.to_vec());
    }

    /// Digit-only tokens too large for `u64` are kept as alphanumeric tokens.
    #[test]
    fn test_numeric_overflow_falls_back_to_alphanumeric() {
        let parser = StateMachineParser::new();

        let (tokens, separators) = parser.parse_tokens("99999999999999999999").unwrap();
        assert_eq!(
            tokens.to_vec(),
            vec![TokenType::Alphanumeric("99999999999999999999".to_string())]
        );
        assert!(separators.is_empty());
    }

    /// Token-count limits are enforced and configurable.
    #[test]
    fn test_performance_limits() {
        let parser = StateMachineParser::new();

        let too_many_tokens = (0..15).map(|i| i.to_string()).collect::<Vec<_>>().join(".");
        assert!(parser.parse_tokens(&too_many_tokens).is_err());

        let too_many_numeric = (0..10).map(|i| i.to_string()).collect::<Vec<_>>().join(".");
        assert!(parser.parse_tokens(&too_many_numeric).is_err());

        // Exactly at the default numeric limit is fine; one more is not.
        assert!(parser.parse_tokens("1.2.3.4.5").is_ok());
        assert!(parser.parse_tokens("1.2.3.4.5.6").is_err());

        // Raising the limits accepts what the defaults reject.
        let relaxed = StateMachineParser::with_config(true, 20, 20);
        let (tokens, separators) = relaxed.parse_tokens(&too_many_tokens).unwrap();
        assert_eq!(tokens.len(), 15);
        assert_eq!(separators.len(), 14);
    }

    /// Character classes used by the tokenizer.
    #[test]
    fn test_character_classification() {
        assert!(StateMachineParser::is_valid_separator('.'));
        assert!(StateMachineParser::is_valid_separator('-'));
        assert!(StateMachineParser::is_valid_separator('_'));
        assert!(StateMachineParser::is_valid_separator('+'));
        assert!(!StateMachineParser::is_valid_separator('@'));

        assert!(StateMachineParser::is_token_char('a'));
        assert!(StateMachineParser::is_token_char('1'));
        assert!(StateMachineParser::is_token_char('_'));
        assert!(!StateMachineParser::is_token_char('.'));
    }
}