statsig_rust/evaluation/user_agent_parsing/statsig_uaparser/
tokenizer.rs1use crate::unwrap_or_return;
2
3use super::{window_iter::WindowIter, Version};
4
5pub struct Tokenizer;
6
7impl Tokenizer {
8 pub fn run(input: &str) -> TokenizerResult<'_> {
10 let mut result = TokenizerResult::default();
11 let mut win: WindowIter<'_> = WindowIter::new(input);
12 while !win.is_empty() {
13 let (curr, next1, next2, next3) = win.get_window();
14 let (curr, next1, next2, next3) = (
15 trim_invalid_chars(curr),
16 trim_invalid_chars(next1),
17 trim_invalid_chars(next2),
18 trim_invalid_chars(next3),
19 );
20
21 let curr = match curr {
22 Some(val) => val,
23 None => {
24 win.slide_window_by(1);
25 continue;
26 }
27 };
28
29 if curr.starts_with("AppleTV") {
30 result.add_tag("ATV OS X", None);
31 win.slide_window_by(1);
32 }
33 else if curr == "Mac" && next1 == Some("OS") && next2 == Some("X") {
35 result.macos_hint = true;
36
37 result.add_possible_os_tag("Mac OS X", consume_if_numeric(&mut win, next3));
38
39 win.slide_window_by(2);
40 }
41 else if curr == "iOS" {
43 result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next1));
44 result.ios_hint = true;
45 win.slide_window_by(1);
46 } else if curr == "iPhone" && next1 == Some("OS") {
47 result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next2));
48
49 win.slide_window_by(1);
50 }
51 else if curr.starts_with("iPad") {
53 result.ios_hint = true;
54
55 let mut parts = curr.split("iPad");
56 let _ = parts.next();
57 result.add_tag("iPad", trim_invalid_chars(parts.next()));
58 }
59 else if curr.starts_with("iPhone") {
61 result.ios_hint = true;
62
63 let mut parts = curr.split("iPhone");
64 let _ = parts.next();
65 result.add_tag("iPhone", trim_invalid_chars(parts.next()));
66 }
67 else if curr.starts_with("Apple-iPhone7C2") {
69 result.ios_hint = true;
70
71 result.add_tag("iPhone", None);
72 }
73 else if curr == "CPU" && next1 == Some("OS") {
75 result.add_tag("CPU OS", consume_if_numeric(&mut win, next2));
76 win.slide_window_by(1);
77 }
78 else if curr == "CrOS" {
80 let mut version = consume_if_numeric(&mut win, next1);
81 if version.is_none() {
82 win.slide_window_by(1);
83 version = consume_if_numeric(&mut win, next2);
84 }
85
86 result.add_possible_os_tag("Chrome OS", version);
87 }
88 else if curr == "CrKey" {
90 result.add_possible_os_tag("Chromecast", None);
91 }
92 else if curr == "PlayStation" {
94 result.playstation_hint = true;
95
96 result.add_tag("PlayStation", None);
97 }
98 else if curr == "Android" {
100 result.add_possible_os_tag("Android", consume_if_numeric(&mut win, next1));
101 }
102 else if curr == "Windows" && next1 == Some("Phone") {
104 result.add_possible_os_tag("Windows Phone", consume_if_numeric(&mut win, next2));
105 win.slide_window_by(1);
106 }
107 else if curr.starts_with("Windows") {
109 result.windows_hint = true;
110
111 let version = if next1 == Some("NT") {
112 consume_if_numeric(&mut win, next2).inspect(|_| {
113 win.slide_window_by(1); })
115 } else if next1.is_some_and(|s| s.starts_with("XP")) {
116 win.slide_window_by(1);
117 Some("XP")
118 } else {
119 consume_if_numeric(&mut win, next1)
120 };
121
122 result.add_possible_os_tag("Windows", version);
123 }
124 else if curr == "Yahoo!" && next1 == Some("Slurp") {
126 result.add_tag("Yahoo! Slurp", None);
127 win.slide_window_by(1);
128 }
129 else if curr == "Red" && next1 == Some("Hat") {
131 result.add_possible_os_tag("Red Hat", None);
132
133 win.slide_window_by(1);
134 }
135 else if curr == "Ubuntu" {
137 result.add_possible_os_tag("Ubuntu", consume_if_numeric(&mut win, next1));
138 }
139 else if curr == "Mobile" {
141 result.mobile_hint = true;
142
143 result.add_tag("Mobile", None);
144 }
145 else if curr == "Linux" {
147 result.linux_hint = true;
148 result.add_tag("Linux", None);
149 }
150 else if curr == "Nintendo" && next1 == Some("3DS") {
152 result.add_tag("NetFront NX", None);
153 win.slide_window_by(1);
154 }
155 else if curr == "like" || curr.len() <= 2 {
157 win.slide_window_by(1);
158 continue;
159 }
160 else {
162 let parts = curr.split_once(['/', ';', ':']);
163 let tag = trim_invalid_chars(parts.map(|(t, _)| t)).unwrap_or(curr);
164 let version = trim_invalid_chars(parts.map(|(_, v)| v));
165
166 if tag == "Kindle" {
167 result.add_possible_os_and_browser_tag("Kindle", version);
168 }
169 else if tag == "FxiOS" {
171 result.add_possible_browser_tag("Firefox iOS", version);
172 }
173 else if tag == "CriOS" {
175 result.add_possible_browser_tag("Chrome Mobile iOS", version);
176 }
177 else if tag == "GSA" {
179 result.add_possible_browser_tag("Google", version);
180 }
181 else if tag == "YisouSpider" {
183 result.add_possible_browser_tag("YisouSpider", version);
184 }
185 else if tag == "Edg" || tag == "Edge" {
187 result.add_tag("Edge", version);
188 }
189 else if tag == "OPR" {
191 result.add_possible_browser_tag("Opera", version);
192 }
193 else if tag == "SamsungBrowser" {
195 result.add_possible_browser_tag("Samsung Internet", version);
196 }
197 else if tag == "HuaweiBrowser" {
199 result.huawei_hint = true;
200
201 result.add_tag("HuaweiBrowser", version);
202 }
203 else if tag == "ChatGPT-User" {
205 result.add_possible_browser_tag("ChatGPT-User", version);
206 }
207 else if tag == "OAI-SearchBot" {
209 result.add_possible_browser_tag("OAI-SearchBot", version);
210 }
211 else if tag == "NX" {
213 result.add_possible_browser_tag("NetFront NX", version);
214 }
215 else if tag == "Electron" {
217 result.add_possible_browser_tag("Electron", version);
218 }
219 else if tag.contains("Bot")
221 || tag.contains("bot")
222 || tag.contains("crawler")
223 || tag.contains("Crawler")
224 {
225 result.add_possible_browser_tag(tag, version);
226 }
227 else if tag == "Mobile" {
229 result.mobile_hint = true;
230
231 result.add_tag("Mobile", version);
232 }
233 else if tag == "Safari" {
235 result.safari_hint = true;
236
237 result.add_tag("Safari", version);
238 } else if tag == "CFNetwork" {
239 result.cfnetwork_hint = true;
240 result.ios_hint = true;
241 } else if tag.contains("crawler") || version.is_some_and(|v| v.contains("crawler"))
242 {
243 result.crawler_hint = true;
244 }
245 else {
247 result.add_tag(tag, version);
248 }
249 }
250
251 win.slide_window_by(1);
252 }
253
254 result
255 }
256}
257
258#[derive(Debug, Default)]
259pub struct TokenizerResult<'a> {
260 pub position: usize,
261 pub tokens: Vec<Token<'a>>,
262 pub possible_os_token: Option<Token<'a>>,
263 pub possible_browser_token: Option<Token<'a>>,
264
265 pub linux_hint: bool,
267 pub ios_hint: bool,
268 pub macos_hint: bool,
269 pub windows_hint: bool,
270 pub mobile_hint: bool,
271 pub safari_hint: bool,
272 pub playstation_hint: bool,
273 pub huawei_hint: bool,
274 pub cfnetwork_hint: bool,
275 pub crawler_hint: bool,
276}
277
278impl<'a> TokenizerResult<'a> {
279 pub fn add_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
280 self.tokens.push(Token {
281 position: self.position,
282 tag,
283 version,
284 });
285 self.position += 1;
286 }
287
288 pub fn add_possible_os_and_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
289 self.add_possible_os_tag_impl(tag, version);
290 self.add_possible_browser_tag_impl(tag, version);
291
292 self.add_tag(tag, version);
293 }
294
295 pub fn add_possible_os_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
296 self.add_possible_os_tag_impl(tag, version);
297 self.add_tag(tag, version);
298 }
299
300 pub fn add_possible_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
301 self.add_possible_browser_tag_impl(tag, version);
302 self.add_tag(tag, version);
303 }
304
305 fn add_possible_os_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
306 if self.possible_os_token.is_some() {
307 return;
308 }
309
310 if version.is_none() {
311 return;
312 }
313
314 self.possible_os_token = Some(Token {
315 position: self.position,
316 tag,
317 version,
318 });
319 }
320
321 fn add_possible_browser_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
322 if version.is_none() {
323 return;
324 }
325 if self.possible_browser_token.is_some()
326 && (tag.contains(".com")
327 || tag.contains(".net")
328 || tag.contains(".org")
329 || tag.contains(".html")
330 || tag.contains("http://")
331 || tag.contains("https://"))
332 {
333 return;
334 }
335
336 self.possible_browser_token = Some(Token {
337 position: self.position,
338 tag,
339 version,
340 });
341 }
342}
343
344#[derive(Debug, Clone)]
345pub struct Token<'a> {
346 pub position: usize,
347 pub tag: &'a str,
348 pub version: Option<&'a str>,
349}
350
351impl<'a> Token<'a> {
352 pub fn get_version(&self) -> Option<Version<'a>> {
353 let version = unwrap_or_return!(self.version, Some(Version::major("0.0.0")));
354
355 if self.tag == "Windows" {
356 let mapped = match version {
357 "5.1" => "XP",
358 "5.2" => "XP",
359 "6.0" => "Vista",
360 "6.1" => "7", "6.3" => "8.1",
362 "10.0" => "10",
363 _ => "0.0.0",
364 };
365
366 return Some(Version::major(mapped));
367 }
368
369 let mut parts = version.split_terminator(['_', ',', '.']);
370
371 let mut version = Version::default();
372 if let Some(major) = parts.next() {
373 version.major = Some(major);
374 }
375
376 if let Some(minor) = parts.next() {
377 let trimmed_minor = take_until_non_numeric(minor);
378 version.minor = Some(trimmed_minor);
379 }
380
381 if let Some(patch) = parts.next() {
382 version.patch = Some(patch);
383 }
384
385 if self.tag == "YaBrowser" || self.tag == "Opera" || self.tag == "NetFront NX" {
387 return Some(version);
388 }
389
390 if let Some(patch_minor) = parts.next() {
391 version.patch_minor = Some(patch_minor);
392 }
393
394 Some(version)
395 }
396}
397
/// Strips leading and trailing `(`, `)`, `;`, `,`, `+` and `_` from `s`.
///
/// Returns `None` when `s` is `None` or when nothing is left after trimming.
/// Characters in the middle of the string are never touched.
fn trim_invalid_chars(s: Option<&str>) -> Option<&str> {
    s.map(|raw| raw.trim_matches(|ch: char| matches!(ch, '(' | ')' | ';' | ',' | '+' | '_')))
        .filter(|cleaned| !cleaned.is_empty())
}
409
/// Returns `true` when `s` is present and its first character is an ASCII digit.
fn starts_with_number(s: Option<&str>) -> bool {
    s.is_some_and(|text| text.starts_with(|first: char| first.is_ascii_digit()))
}
414
415fn consume_if_numeric<'a>(win: &mut WindowIter<'a>, tag: Option<&'a str>) -> Option<&'a str> {
416 if starts_with_number(tag) {
417 win.slide_window_by(1);
418 return tag;
419 }
420
421 None
422}
423
/// Returns the leading run of ASCII digits in `s` (possibly empty).
///
/// The whole string is returned when it consists solely of ASCII digits.
fn take_until_non_numeric(s: &str) -> &str {
    // `find` yields the byte offset of the first non-digit, which is always a
    // character boundary, so slicing up to it cannot fail.
    match s.find(|ch: char| !ch.is_ascii_digit()) {
        Some(end) => s.get(..end).unwrap_or(s),
        None => s,
    }
}
436}