statsig_rust/evaluation/user_agent_parsing/statsig_uaparser/
tokenizer.rs1use crate::unwrap_or_return;
2
3use super::{window_iter::WindowIter, Version};
4
5pub struct Tokenizer;
6
7impl Tokenizer {
8 pub fn run(input: &str) -> TokenizerResult<'_> {
10 let mut result = TokenizerResult::default();
11 let mut win: WindowIter<'_> = WindowIter::new(input);
12 while !win.is_empty() {
13 let (curr, next1, next2, next3) = win.get_window();
14 let (curr, next1, next2, next3) = (
15 trim_invalid_chars(curr),
16 trim_invalid_chars(next1),
17 trim_invalid_chars(next2),
18 trim_invalid_chars(next3),
19 );
20
21 let curr = match curr {
22 Some(val) => val,
23 None => {
24 win.slide_window_by(1);
25 continue;
26 }
27 };
28
29 if curr.starts_with("AppleTV") {
30 result.add_tag("ATV OS X", None);
31 win.slide_window_by(1);
32 } else if curr == "like" && next1 == Some("Mac") && next2 == Some("OS") {
33 win.slide_window_by(3);
34 }
35 else if curr == "Mac" && next1 == Some("OS") && next2 == Some("X") {
37 result.macos_hint = true;
38
39 result.add_possible_os_tag("Mac OS X", consume_if_numeric(&mut win, next3));
40
41 win.slide_window_by(2);
42 }
43 else if curr == "iOS" {
45 result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next1));
46 result.ios_hint = true;
47 win.slide_window_by(1);
48 } else if curr == "iPhone" && next1 == Some("OS") {
49 result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next2));
50
51 win.slide_window_by(1);
52 }
53 else if curr.starts_with("iPad") {
55 result.ios_hint = true;
56
57 let mut parts = curr.split("iPad");
58 let _ = parts.next();
59 result.add_tag("iPad", trim_invalid_chars(parts.next()));
60 }
61 else if curr.starts_with("iPhone") {
63 result.ios_hint = true;
64
65 let mut parts = curr.split("iPhone");
66 let _ = parts.next();
67 result.add_tag("iPhone", trim_invalid_chars(parts.next()));
68 }
69 else if curr.starts_with("Apple-iPhone7C2") {
71 result.ios_hint = true;
72
73 result.add_tag("iPhone", None);
74 }
75 else if curr == "CPU" && next1 == Some("OS") {
77 result.add_tag("CPU OS", consume_if_numeric(&mut win, next2));
78 win.slide_window_by(1);
79 }
80 else if curr == "CrOS" {
82 let mut version = consume_if_numeric(&mut win, next1);
83 if version.is_none() {
84 win.slide_window_by(1);
85 version = consume_if_numeric(&mut win, next2);
86 }
87
88 result.add_possible_os_tag("Chrome OS", version);
89 }
90 else if curr == "CrKey" {
92 result.add_possible_os_tag("Chromecast", None);
93 }
94 else if curr == "PlayStation" {
96 result.playstation_hint = true;
97
98 result.add_tag("PlayStation", None);
99 }
100 else if curr == "Android" {
102 result.add_possible_os_tag("Android", consume_if_numeric(&mut win, next1));
103 }
104 else if curr == "Windows" && next1 == Some("Phone") {
106 result.add_possible_os_tag("Windows Phone", consume_if_numeric(&mut win, next2));
107 win.slide_window_by(1);
108 }
109 else if curr.starts_with("Windows") {
111 result.windows_hint = true;
112
113 let version = if next1 == Some("NT") {
114 consume_if_numeric(&mut win, next2).inspect(|_| {
115 win.slide_window_by(1); })
117 } else if next1.is_some_and(|s| s.starts_with("XP")) {
118 win.slide_window_by(1);
119 Some("XP")
120 } else {
121 consume_if_numeric(&mut win, next1)
122 };
123
124 result.add_possible_os_tag("Windows", version);
125 } else if curr == "MSIE" {
126 let version = consume_if_numeric(&mut win, next1);
127 result.add_possible_browser_tag("IE", version);
128 }
129 else if curr == "Yahoo!" && next1 == Some("Slurp") {
131 result.add_tag("Yahoo! Slurp", None);
132 win.slide_window_by(1);
133 }
134 else if curr == "Red" && next1 == Some("Hat") {
136 result.add_possible_os_tag("Red Hat", None);
137
138 win.slide_window_by(1);
139 }
140 else if curr == "Ubuntu" {
142 result.add_possible_os_tag("Ubuntu", consume_if_numeric(&mut win, next1));
143 }
144 else if curr == "Mobile" {
146 result.mobile_hint = true;
147
148 result.add_tag("Mobile", None);
149 }
150 else if curr == "Linux" || curr == "linux" {
152 result.linux_hint = true;
153 result.add_tag("Linux", None);
154 }
155 else if curr == "Nintendo" && next1 == Some("3DS") {
157 result.add_tag("NetFront NX", None);
158 win.slide_window_by(1);
159 }
160 else if curr == "like" || curr.len() <= 2 {
162 win.slide_window_by(1);
163 continue;
164 }
165 else if curr == "Better" && next1 == Some("Uptime") && next2 == Some("Bot") {
167 result.add_possible_browser_tag_for_bot("Better Uptime Bot", None);
168 } else if curr == "Radius" && next1 == Some("Compilance") && next2 == Some("Bot") {
169 result.add_possible_browser_tag_for_bot("Radius Compilance Bot", None);
170 } else if curr == "AdsBot-Google-Mobile" {
171 result.add_possible_browser_tag_for_bot("AdsBot-Google", None);
172 } else if curr == "Uptime" && next1 == Some("Monitoring") && next2 == Some("Bot") {
173 result.add_possible_browser_tag_for_bot("Uptime Monitoring Bot", None);
174 }
175 else {
177 let parts = curr.split_once(['/', ';', ':']);
178 let tag = trim_invalid_chars(parts.map(|(t, _)| t)).unwrap_or(curr);
179 let version = trim_invalid_chars(parts.map(|(_, v)| v));
180
181 if tag == "Kindle" {
182 result.add_possible_os_and_browser_tag("Kindle", version);
183 }
184 else if tag == "FxiOS" {
186 result.add_possible_browser_tag("Firefox iOS", version);
187 } else if tag == "EdgiOS" {
188 if let Some(os_token) = result.possible_os_token.as_ref() {
189 if os_token.tag == "Mac OS X" {
190 result.add_possible_os_tag_override_existing("iOS", None);
191 }
192 }
193 result.add_possible_browser_tag("Edge Mobile", version);
194 }
195 else if tag == "CriOS" {
197 result.ios_hint = true;
198 if let Some(os_token) = result.possible_os_token.as_ref() {
199 if os_token.tag == "Mac OS X" {
200 result.add_possible_os_tag_override_existing("iOS", None);
201 }
202 }
203 result.add_possible_browser_tag("Chrome Mobile iOS", version);
204 }
205 else if tag == "GSA" {
207 result.add_possible_browser_tag("Google", version);
208 }
209 else if tag == "YisouSpider" {
211 result.add_possible_browser_tag("YisouSpider", version);
212 }
213 else if tag == "Edg" || tag == "Edge" {
215 result.add_tag("Edge", version);
216 }
217 else if tag == "OPR" {
219 result.add_tag("Opera", version);
220 }
221 else if tag == "SamsungBrowser" {
223 result.add_possible_browser_tag("Samsung Internet", version);
224 }
225 else if tag == "HuaweiBrowser" {
227 result.huawei_hint = true;
228
229 result.add_tag("HuaweiBrowser", version);
230 }
231 else if tag == "ChatGPT-User" {
233 result.add_possible_browser_tag("ChatGPT-User", version);
234 }
235 else if tag == "OAI-SearchBot" {
237 result.add_possible_browser_tag("OAI-SearchBot", version);
238 }
239 else if tag == "NX" {
241 result.add_possible_browser_tag("NetFront NX", version);
242 }
243 else if tag == "Electron" {
245 result.add_possible_browser_tag("Electron", version);
246 } else if tag == "IEMobile" {
247 result.add_possible_browser_tag("IE Mobile", version);
248 }
249 else if tag.contains("Bot")
251 || tag.contains("bot")
252 || tag.contains("crawler")
253 || tag.contains("Crawler")
254 {
255 result.add_possible_browser_tag_for_bot(tag, version);
256 }
257 else if tag == "Mobile" {
259 result.mobile_hint = true;
260
261 result.add_tag("Mobile", version);
262 }
263 else if tag == "Safari" {
265 result.safari_hint = true;
266
267 result.add_tag("Safari", version);
268 } else if tag == "CFNetwork" {
269 result.cfnetwork_hint = true;
270 result.ios_hint = true;
271 } else if tag.contains("crawler") || version.is_some_and(|v| v.contains("crawler"))
272 {
273 result.crawler_hint = true;
274 } else if tag == "OculusBrowser" {
275 result.add_possible_os_tag_override_existing("Android", None);
277 }
278 else {
280 result.add_tag(tag, version);
281 }
282 }
283
284 win.slide_window_by(1);
285 }
286
287 result
288 }
289}
290
291#[derive(Debug, Default)]
292pub struct TokenizerResult<'a> {
293 pub position: usize,
294 pub tokens: Vec<Token<'a>>,
295 pub possible_os_token: Option<Token<'a>>,
296 pub possible_browser_token: Option<Token<'a>>,
297
298 pub linux_hint: bool,
300 pub ios_hint: bool,
301 pub macos_hint: bool,
302 pub windows_hint: bool,
303 pub mobile_hint: bool,
304 pub safari_hint: bool,
305 pub playstation_hint: bool,
306 pub huawei_hint: bool,
307 pub cfnetwork_hint: bool,
308 pub crawler_hint: bool,
309 pub bot_detected: bool,
310}
311
312impl<'a> TokenizerResult<'a> {
313 pub fn add_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
314 self.tokens.push(Token {
315 position: self.position,
316 tag,
317 version,
318 });
319 self.position += 1;
320 }
321
322 pub fn add_possible_os_and_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
323 self.add_possible_os_tag_impl(tag, version);
324 self.add_possible_browser_tag_impl(tag, version);
325
326 self.add_tag(tag, version);
327 }
328
329 pub fn add_possible_os_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
330 self.add_possible_os_tag_impl(tag, version);
331 self.add_tag(tag, version);
332 }
333
334 pub fn add_possible_os_tag_override_existing(
335 &mut self,
336 tag: &'a str,
337 version: Option<&'a str>,
338 ) {
339 self.possible_os_token = Some(Token {
340 position: self.position,
341 tag,
342 version,
343 });
344 }
345
346 pub fn add_possible_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
347 self.add_possible_browser_tag_impl(tag, version);
348 self.add_tag(tag, version);
349 }
350
351 fn add_possible_os_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
352 if self.possible_os_token.is_some() {
353 return;
354 }
355
356 if version.is_none() {
357 return;
358 }
359
360 self.possible_os_token = Some(Token {
361 position: self.position,
362 tag,
363 version,
364 });
365 }
366
367 fn add_possible_browser_tag_for_bot(&mut self, tag: &'a str, version: Option<&'a str>) {
368 if self.bot_detected {
369 return;
372 }
373 self.bot_detected = true;
374 self.possible_browser_token = Some(Token {
375 position: self.position,
376 tag,
377 version,
378 });
379 }
380
381 fn add_possible_browser_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
382 if version.is_none() {
383 return;
384 }
385
386 self.possible_browser_token = Some(Token {
387 position: self.position,
388 tag,
389 version,
390 });
391 }
392}
393
/// One recognized piece of a user-agent string, borrowing its text for `'a`.
#[derive(Debug, Clone)]
pub struct Token<'a> {
    /// Value of the owning result's `position` counter when the token was created.
    pub position: usize,
    /// Normalized tag name (for example a browser or OS label).
    pub tag: &'a str,
    /// Raw version text attached to the tag, if any.
    pub version: Option<&'a str>,
}
400
401impl<'a> Token<'a> {
402 pub fn get_version(&self) -> Option<Version<'a>> {
403 let version = unwrap_or_return!(self.version, Some(Version::major("0.0.0")));
404
405 if self.tag == "Windows" {
406 let mapped = match version {
407 "5.1" => "XP",
408 "5.2" => "XP",
409 "6.0" => "Vista",
410 "6.1" => "7", "6.3" => "8.1",
412 "10.0" => "10",
413 _ => "0.0.0",
414 };
415
416 return Some(Version::major(mapped));
417 }
418
419 let mut parts = version.split_terminator(['_', ',', '.']);
420
421 let mut version = Version::default();
422 if let Some(major) = parts.next() {
423 version.major = Some(major);
424 }
425
426 if let Some(minor) = parts.next() {
427 let trimmed_minor = take_until_non_numeric(minor);
428 version.minor = Some(trimmed_minor);
429 }
430
431 if let Some(patch) = parts.next() {
432 version.patch = Some(patch);
433 }
434
435 if self.tag == "YaBrowser" || self.tag == "Opera" || self.tag == "NetFront NX" {
437 return Some(version);
438 }
439
440 if let Some(patch_minor) = parts.next() {
441 version.patch_minor = Some(patch_minor);
442 }
443
444 Some(version)
445 }
446}
447
/// Strips leading and trailing `(`, `)`, `;`, `,`, `+`, `_`, and `"` from the
/// given text. Interior occurrences are left alone.
///
/// Returns `None` when the input is `None` or nothing remains after trimming.
fn trim_invalid_chars(s: Option<&str>) -> Option<&str> {
    s.map(|raw| raw.trim_matches(|c: char| matches!(c, '(' | ')' | ';' | ',' | '+' | '_' | '"')))
        .filter(|trimmed| !trimmed.is_empty())
}
461
/// Reports whether the text is present and its first character is an ASCII digit.
fn starts_with_number(s: Option<&str>) -> bool {
    matches!(
        s.and_then(|text| text.chars().next()),
        Some(first) if first.is_ascii_digit()
    )
}
466
467fn consume_if_numeric<'a>(win: &mut WindowIter<'a>, tag: Option<&'a str>) -> Option<&'a str> {
468 if starts_with_number(tag) {
469 win.slide_window_by(1);
470 return tag;
471 }
472
473 None
474}
475
/// Returns the leading run of ASCII digits in `s`.
///
/// The result is empty when `s` does not start with a digit, and is all of
/// `s` when every character is a digit.
fn take_until_non_numeric(s: &str) -> &str {
    // `find` yields the byte offset of the first non-digit, which is always a
    // valid slice boundary because everything before it is single-byte ASCII.
    s.find(|c: char| !c.is_ascii_digit())
        .map_or(s, |end| &s[..end])
}