statsig_rust/evaluation/user_agent_parsing/statsig_uaparser/
tokenizer.rs

1use crate::unwrap_or_return;
2
3use super::{window_iter::WindowIter, Version};
4
5pub struct Tokenizer;
6
7impl Tokenizer {
8    // Ideal UserAgent format: <product>/<product-version> (<os-information>) <engine> (<platform-details>) <optional-details>
9    pub fn run(input: &str) -> TokenizerResult<'_> {
10        let mut result = TokenizerResult::default();
11        let mut win: WindowIter<'_> = WindowIter::new(input);
12        while !win.is_empty() {
13            let (curr, next1, next2, next3) = win.get_window();
14            let (curr, next1, next2, next3) = (
15                trim_invalid_chars(curr),
16                trim_invalid_chars(next1),
17                trim_invalid_chars(next2),
18                trim_invalid_chars(next3),
19            );
20
21            let curr = match curr {
22                Some(val) => val,
23                None => {
24                    win.slide_window_by(1);
25                    continue;
26                }
27            };
28
29            if curr.starts_with("AppleTV") {
30                result.add_tag("ATV OS X", None);
31                win.slide_window_by(1);
32            } else if curr == "like" && next1 == Some("Mac") && next2 == Some("OS") {
33                win.slide_window_by(3);
34            }
35            // Mac OS X
36            else if curr == "Mac" && next1 == Some("OS") && next2 == Some("X") {
37                result.macos_hint = true;
38
39                result.add_possible_os_tag("Mac OS X", consume_if_numeric(&mut win, next3));
40
41                win.slide_window_by(2);
42            }
43            // iPhone OS
44            else if curr == "iOS" {
45                result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next1));
46                result.ios_hint = true;
47                win.slide_window_by(1);
48            } else if curr == "iPhone" && next1 == Some("OS") {
49                result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next2));
50
51                win.slide_window_by(1);
52            }
53            // iPad
54            else if curr.starts_with("iPad") {
55                result.ios_hint = true;
56
57                let mut parts = curr.split("iPad");
58                let _ = parts.next();
59                result.add_tag("iPad", trim_invalid_chars(parts.next()));
60            }
61            // iPhone
62            else if curr.starts_with("iPhone") {
63                result.ios_hint = true;
64
65                let mut parts = curr.split("iPhone");
66                let _ = parts.next();
67                result.add_tag("iPhone", trim_invalid_chars(parts.next()));
68            }
69            // iPhone (Apple)
70            else if curr.starts_with("Apple-iPhone7C2") {
71                result.ios_hint = true;
72
73                result.add_tag("iPhone", None);
74            }
75            // CPU OS
76            else if curr == "CPU" && next1 == Some("OS") {
77                result.add_tag("CPU OS", consume_if_numeric(&mut win, next2));
78                win.slide_window_by(1);
79            }
80            // Chrome OS
81            else if curr == "CrOS" {
82                let mut version = consume_if_numeric(&mut win, next1);
83                if version.is_none() {
84                    win.slide_window_by(1);
85                    version = consume_if_numeric(&mut win, next2);
86                }
87
88                result.add_possible_os_tag("Chrome OS", version);
89            }
90            // Chromecast
91            else if curr == "CrKey" {
92                result.add_possible_os_tag("Chromecast", None);
93            }
94            // PlayStation
95            else if curr == "PlayStation" {
96                result.playstation_hint = true;
97
98                result.add_tag("PlayStation", None);
99            }
100            // Android
101            else if curr == "Android" {
102                result.add_possible_os_tag("Android", consume_if_numeric(&mut win, next1));
103            }
104            // Windows Phone
105            else if curr == "Windows" && next1 == Some("Phone") {
106                result.add_possible_os_tag("Windows Phone", consume_if_numeric(&mut win, next2));
107                win.slide_window_by(1);
108            }
109            // Windows
110            else if curr.starts_with("Windows") {
111                result.windows_hint = true;
112
113                let version = if next1 == Some("NT") {
114                    consume_if_numeric(&mut win, next2).inspect(|_| {
115                        win.slide_window_by(1); // extra slide to skip the NT
116                    })
117                } else if next1.is_some_and(|s| s.starts_with("XP")) {
118                    win.slide_window_by(1);
119                    Some("XP")
120                } else {
121                    consume_if_numeric(&mut win, next1)
122                };
123
124                result.add_possible_os_tag("Windows", version);
125            } else if curr == "MSIE" {
126                let version = consume_if_numeric(&mut win, next1);
127                result.add_possible_browser_tag("IE", version);
128            }
129            // Yahoo Slurp
130            else if curr == "Yahoo!" && next1 == Some("Slurp") {
131                result.add_tag("Yahoo! Slurp", None);
132                win.slide_window_by(1);
133            }
134            // Red Hat
135            else if curr == "Red" && next1 == Some("Hat") {
136                result.add_possible_os_tag("Red Hat", None);
137
138                win.slide_window_by(1);
139            }
140            // Ubuntu
141            else if curr == "Ubuntu" {
142                result.add_possible_os_tag("Ubuntu", consume_if_numeric(&mut win, next1));
143            }
144            // Mobile
145            else if curr == "Mobile" {
146                result.mobile_hint = true;
147
148                result.add_tag("Mobile", None);
149            }
150            // Linux
151            else if curr == "Linux" || curr == "linux" {
152                result.linux_hint = true;
153                result.add_tag("Linux", None);
154            }
155            // Nintendo
156            else if curr == "Nintendo" && next1 == Some("3DS") {
157                result.add_tag("NetFront NX", None);
158                win.slide_window_by(1);
159            }
160            // Skip
161            else if curr == "like" || curr.len() <= 2 {
162                win.slide_window_by(1);
163                continue;
164            }
165            // Bot
166            else if curr == "Better" && next1 == Some("Uptime") && next2 == Some("Bot") {
167                result.add_possible_browser_tag_for_bot("Better Uptime Bot", None);
168            } else if curr == "Radius" && next1 == Some("Compilance") && next2 == Some("Bot") {
169                result.add_possible_browser_tag_for_bot("Radius Compilance Bot", None);
170            } else if curr == "AdsBot-Google-Mobile" {
171                result.add_possible_browser_tag_for_bot("AdsBot-Google", None);
172            } else if curr == "Uptime" && next1 == Some("Monitoring") && next2 == Some("Bot") {
173                result.add_possible_browser_tag_for_bot("Uptime Monitoring Bot", None);
174            }
175            // Rest
176            else {
177                let parts = curr.split_once(['/', ';', ':']);
178                let tag = trim_invalid_chars(parts.map(|(t, _)| t)).unwrap_or(curr);
179                let version = trim_invalid_chars(parts.map(|(_, v)| v));
180
181                if tag == "Kindle" {
182                    result.add_possible_os_and_browser_tag("Kindle", version);
183                }
184                //
185                else if tag == "FxiOS" {
186                    result.add_possible_browser_tag("Firefox iOS", version);
187                } else if tag == "EdgiOS" {
188                    if let Some(os_token) = result.possible_os_token.as_ref() {
189                        if os_token.tag == "Mac OS X" {
190                            result.add_possible_os_tag_override_existing("iOS", None);
191                        }
192                    }
193                    result.add_possible_browser_tag("Edge Mobile", version);
194                }
195                //
196                else if tag == "CriOS" {
197                    result.ios_hint = true;
198                    if let Some(os_token) = result.possible_os_token.as_ref() {
199                        if os_token.tag == "Mac OS X" {
200                            result.add_possible_os_tag_override_existing("iOS", None);
201                        }
202                    }
203                    result.add_possible_browser_tag("Chrome Mobile iOS", version);
204                }
205                //
206                else if tag == "GSA" {
207                    result.add_possible_browser_tag("Google", version);
208                }
209                //
210                else if tag == "YisouSpider" {
211                    result.add_possible_browser_tag("YisouSpider", version);
212                }
213                //
214                else if tag == "Edg" || tag == "Edge" {
215                    result.add_tag("Edge", version);
216                }
217                //
218                else if tag == "OPR" {
219                    result.add_tag("Opera", version);
220                }
221                //
222                else if tag == "SamsungBrowser" {
223                    result.add_possible_browser_tag("Samsung Internet", version);
224                }
225                //
226                else if tag == "HuaweiBrowser" {
227                    result.huawei_hint = true;
228
229                    result.add_tag("HuaweiBrowser", version);
230                }
231                //
232                else if tag == "ChatGPT-User" {
233                    result.add_possible_browser_tag("ChatGPT-User", version);
234                }
235                //
236                else if tag == "OAI-SearchBot" {
237                    result.add_possible_browser_tag("OAI-SearchBot", version);
238                }
239                //
240                else if tag == "NX" {
241                    result.add_possible_browser_tag("NetFront NX", version);
242                }
243                //
244                else if tag == "Electron" {
245                    result.add_possible_browser_tag("Electron", version);
246                } else if tag == "IEMobile" {
247                    result.add_possible_browser_tag("IE Mobile", version);
248                }
249                // Bot or crawler
250                else if tag.contains("Bot")
251                    || tag.contains("bot")
252                    || tag.contains("crawler")
253                    || tag.contains("Crawler")
254                {
255                    result.add_possible_browser_tag_for_bot(tag, version);
256                }
257                // Mobile
258                else if tag == "Mobile" {
259                    result.mobile_hint = true;
260
261                    result.add_tag("Mobile", version);
262                }
263                // Safari
264                else if tag == "Safari" {
265                    result.safari_hint = true;
266
267                    result.add_tag("Safari", version);
268                } else if tag == "CFNetwork" {
269                    result.cfnetwork_hint = true;
270                    result.ios_hint = true;
271                } else if tag.contains("crawler") || version.is_some_and(|v| v.contains("crawler"))
272                {
273                    result.crawler_hint = true;
274                } else if tag == "OculusBrowser" {
275                    // Oculus os is android, but fake to be linux
276                    result.add_possible_os_tag_override_existing("Android", None);
277                }
278                //
279                else {
280                    result.add_tag(tag, version);
281                }
282            }
283
284            win.slide_window_by(1);
285        }
286
287        result
288    }
289}
290
291#[derive(Debug, Default)]
292pub struct TokenizerResult<'a> {
293    pub position: usize,
294    pub tokens: Vec<Token<'a>>,
295    pub possible_os_token: Option<Token<'a>>,
296    pub possible_browser_token: Option<Token<'a>>,
297
298    // Hints
299    pub linux_hint: bool,
300    pub ios_hint: bool,
301    pub macos_hint: bool,
302    pub windows_hint: bool,
303    pub mobile_hint: bool,
304    pub safari_hint: bool,
305    pub playstation_hint: bool,
306    pub huawei_hint: bool,
307    pub cfnetwork_hint: bool,
308    pub crawler_hint: bool,
309    pub bot_detected: bool,
310}
311
312impl<'a> TokenizerResult<'a> {
313    pub fn add_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
314        self.tokens.push(Token {
315            position: self.position,
316            tag,
317            version,
318        });
319        self.position += 1;
320    }
321
322    pub fn add_possible_os_and_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
323        self.add_possible_os_tag_impl(tag, version);
324        self.add_possible_browser_tag_impl(tag, version);
325
326        self.add_tag(tag, version);
327    }
328
329    pub fn add_possible_os_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
330        self.add_possible_os_tag_impl(tag, version);
331        self.add_tag(tag, version);
332    }
333
334    pub fn add_possible_os_tag_override_existing(
335        &mut self,
336        tag: &'a str,
337        version: Option<&'a str>,
338    ) {
339        self.possible_os_token = Some(Token {
340            position: self.position,
341            tag,
342            version,
343        });
344    }
345
346    pub fn add_possible_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
347        self.add_possible_browser_tag_impl(tag, version);
348        self.add_tag(tag, version);
349    }
350
351    fn add_possible_os_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
352        if self.possible_os_token.is_some() {
353            return;
354        }
355
356        if version.is_none() {
357            return;
358        }
359
360        self.possible_os_token = Some(Token {
361            position: self.position,
362            tag,
363            version,
364        });
365    }
366
367    fn add_possible_browser_tag_for_bot(&mut self, tag: &'a str, version: Option<&'a str>) {
368        if self.bot_detected {
369            // Most of useragent string from bot attaches url
370            // e.g. Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
371            return;
372        }
373        self.bot_detected = true;
374        self.possible_browser_token = Some(Token {
375            position: self.position,
376            tag,
377            version,
378        });
379    }
380
381    fn add_possible_browser_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
382        if version.is_none() {
383            return;
384        }
385
386        self.possible_browser_token = Some(Token {
387            position: self.position,
388            tag,
389            version,
390        });
391    }
392}
393
394#[derive(Debug, Clone)]
395pub struct Token<'a> {
396    pub position: usize,
397    pub tag: &'a str,
398    pub version: Option<&'a str>,
399}
400
401impl<'a> Token<'a> {
402    pub fn get_version(&self) -> Option<Version<'a>> {
403        let version = unwrap_or_return!(self.version, Some(Version::major("0.0.0")));
404
405        if self.tag == "Windows" {
406            let mapped = match version {
407                "5.1" => "XP",
408                "5.2" => "XP",
409                "6.0" => "Vista",
410                "6.1" => "7", // lol
411                "6.3" => "8.1",
412                "10.0" => "10",
413                _ => "0.0.0",
414            };
415
416            return Some(Version::major(mapped));
417        }
418
419        let mut parts = version.split_terminator(['_', ',', '.']);
420
421        let mut version = Version::default();
422        if let Some(major) = parts.next() {
423            version.major = Some(major);
424        }
425
426        if let Some(minor) = parts.next() {
427            let trimmed_minor = take_until_non_numeric(minor);
428            version.minor = Some(trimmed_minor);
429        }
430
431        if let Some(patch) = parts.next() {
432            version.patch = Some(patch);
433        }
434
435        // odd: Don't include patch_minor here
436        if self.tag == "YaBrowser" || self.tag == "Opera" || self.tag == "NetFront NX" {
437            return Some(version);
438        }
439
440        if let Some(patch_minor) = parts.next() {
441            version.patch_minor = Some(patch_minor);
442        }
443
444        Some(version)
445    }
446}
447
/// Strips leading and trailing punctuation (`(`, `)`, `;`, `,`, `+`, `_`, `"`) from a token.
///
/// Characters inside the token are left alone. Returns `None` when the input is `None` or when
/// nothing remains after trimming.
fn trim_invalid_chars(s: Option<&str>) -> Option<&str> {
    const INVALID_EDGE_CHARS: &[char] = &['(', ')', ';', ',', '+', '_', '"'];

    s.map(|raw| raw.trim_matches(INVALID_EDGE_CHARS))
        .filter(|trimmed| !trimmed.is_empty())
}
461
/// Returns `true` when `s` is present and its first character is an ASCII digit.
fn starts_with_number(s: Option<&str>) -> bool {
    s.is_some_and(|text| text.starts_with(|first: char| first.is_ascii_digit()))
}
466
467fn consume_if_numeric<'a>(win: &mut WindowIter<'a>, tag: Option<&'a str>) -> Option<&'a str> {
468    if starts_with_number(tag) {
469        win.slide_window_by(1);
470        return tag;
471    }
472
473    None
474}
475
/// Returns the leading run of ASCII digits of `s` (possibly empty).
fn take_until_non_numeric(s: &str) -> &str {
    // `find` yields the byte offset of the first non-digit character, which is always a valid
    // char boundary; when every character is a digit the whole string is kept.
    let digits_end = s
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(s.len());

    &s[..digits_end]
}