// statsig_rust/evaluation/user_agent_parsing/statsig_uaparser/tokenizer.rs

1use super::{window_iter::WindowIter, Version};
2
3pub struct Tokenizer;
4
5impl Tokenizer {
6    // Ideal UserAgent format: <product>/<product-version> (<os-information>) <engine> (<platform-details>) <optional-details>
7    pub fn run(input: &str) -> TokenizerResult {
8        let mut result = TokenizerResult::default();
9        let mut win = WindowIter::new(input);
10
11        while !win.is_empty() {
12            let (curr, next1, next2, next3) = win.get_window();
13            let (curr, next1, next2, next3) = (
14                trim_invalid_chars(curr),
15                trim_invalid_chars(next1),
16                trim_invalid_chars(next2),
17                trim_invalid_chars(next3),
18            );
19
20            let curr = match curr {
21                Some(val) => val,
22                None => {
23                    win.slide_window_by(1);
24                    continue;
25                }
26            };
27
28            if curr.starts_with("AppleTV") {
29                result.add_tag("ATV OS X", None);
30                win.slide_window_by(1);
31            }
32            // Mac OS X
33            else if curr == "Mac" && next1 == Some("OS") && next2 == Some("X") {
34                result.macos_hint = true;
35
36                result.add_possible_os_tag("Mac OS X", consume_if_numeric(&mut win, next3));
37
38                win.slide_window_by(2);
39            }
40            // iPhone OS
41            else if curr == "iPhone" && next1 == Some("OS") {
42                result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next2));
43
44                win.slide_window_by(1);
45            }
46            // iPad
47            else if curr.starts_with("iPad") {
48                result.ios_hint = true;
49
50                let mut parts = curr.split("iPad");
51                let _ = parts.next();
52                result.add_tag("iPad", trim_invalid_chars(parts.next()));
53            }
54            // iPhone
55            else if curr.starts_with("iPhone") {
56                result.ios_hint = true;
57
58                let mut parts = curr.split("iPhone");
59                let _ = parts.next();
60                result.add_tag("iPhone", trim_invalid_chars(parts.next()));
61            }
62            // iPhone (Apple)
63            else if curr.starts_with("Apple-iPhone7C2") {
64                result.ios_hint = true;
65
66                result.add_tag("iPhone", None);
67            }
68            // CPU OS
69            else if curr == "CPU" && next1 == Some("OS") {
70                result.add_tag("CPU OS", consume_if_numeric(&mut win, next2));
71                win.slide_window_by(1);
72            }
73            // Chrome OS
74            else if curr == "CrOS" {
75                let mut version = consume_if_numeric(&mut win, next1);
76                if version.is_none() {
77                    win.slide_window_by(1);
78                    version = consume_if_numeric(&mut win, next2);
79                }
80
81                result.add_possible_os_tag("Chrome OS", version);
82            }
83            // Chromecast
84            else if curr == "CrKey" {
85                result.add_possible_os_tag("Chromecast", None);
86            }
87            // PlayStation
88            else if curr == "PlayStation" {
89                result.playstation_hint = true;
90
91                result.add_tag("PlayStation", None);
92            }
93            // Android
94            else if curr == "Android" {
95                result.add_possible_os_tag("Android", consume_if_numeric(&mut win, next1));
96            }
97            // Windows Phone
98            else if curr == "Windows" && next1 == Some("Phone") {
99                result.add_possible_os_tag("Windows Phone", consume_if_numeric(&mut win, next2));
100                win.slide_window_by(1);
101            }
102            // Windows
103            else if curr.starts_with("Windows") {
104                result.windows_hint = true;
105
106                let version = if next1 == Some("NT") {
107                    consume_if_numeric(&mut win, next2).inspect(|_| {
108                        win.slide_window_by(1); // extra slide to skip the NT
109                    })
110                } else if next1.is_some_and(|s| s.starts_with("XP")) {
111                    win.slide_window_by(1);
112                    Some("XP")
113                } else {
114                    consume_if_numeric(&mut win, next1)
115                };
116
117                result.add_possible_os_tag("Windows", version);
118            }
119            // Yahoo Slurp
120            else if curr == "Yahoo!" && next1 == Some("Slurp") {
121                result.add_tag("Yahoo! Slurp", None);
122                win.slide_window_by(1);
123            }
124            // Red Hat
125            else if curr == "Red" && next1 == Some("Hat") {
126                result.add_possible_os_tag("Red Hat", None);
127
128                win.slide_window_by(1);
129            }
130            // Ubuntu
131            else if curr == "Ubuntu" {
132                result.add_possible_os_tag("Ubuntu", consume_if_numeric(&mut win, next1));
133            }
134            // Mobile
135            else if curr == "Mobile" {
136                result.mobile_hint = true;
137
138                result.add_tag("Mobile", None);
139            }
140            // Linux
141            else if curr == "Linux" {
142                result.linux_hint = true;
143                result.add_tag("Linux", None);
144            }
145            // Nintendo
146            else if curr == "Nintendo" && next1 == Some("3DS") {
147                result.add_tag("NetFront NX", None);
148                win.slide_window_by(1);
149            }
150            // Skip
151            else if curr == "like" || curr.len() <= 2 {
152                win.slide_window_by(1);
153                continue;
154            }
155            // Rest
156            else {
157                let parts = curr.split_once(['/', ';', ':']);
158                let tag = trim_invalid_chars(parts.map(|(t, _)| t)).unwrap_or(curr);
159                let version = trim_invalid_chars(parts.map(|(_, v)| v));
160
161                if tag == "Kindle" {
162                    result.add_possible_os_and_browser_tag("Kindle", version);
163                }
164                //
165                else if tag == "FxiOS" {
166                    result.add_possible_browser_tag("Firefox iOS", version);
167                }
168                //
169                else if tag == "CriOS" {
170                    result.add_possible_browser_tag("Chrome Mobile iOS", version);
171                }
172                //
173                else if tag == "GSA" {
174                    result.add_possible_browser_tag("Google", version);
175                }
176                //
177                else if tag == "YisouSpider" {
178                    result.add_possible_browser_tag("YisouSpider", version);
179                }
180                //
181                else if tag == "Edg" || tag == "Edge" {
182                    result.add_tag("Edge", version);
183                }
184                //
185                else if tag == "OPR" {
186                    result.add_possible_browser_tag("Opera", version);
187                }
188                //
189                else if tag == "SamsungBrowser" {
190                    result.add_possible_browser_tag("Samsung Internet", version);
191                }
192                //
193                else if tag == "HuaweiBrowser" {
194                    result.huawei_hint = true;
195
196                    result.add_tag("HuaweiBrowser", version);
197                }
198                //
199                else if tag == "ChatGPT-User" {
200                    result.add_possible_browser_tag("ChatGPT-User", version);
201                }
202                //
203                else if tag == "OAI-SearchBot" {
204                    result.add_possible_browser_tag("OAI-SearchBot", version);
205                }
206                //
207                else if tag == "NX" {
208                    result.add_possible_browser_tag("NetFront NX", version);
209                }
210                //
211                else if tag == "Electron" {
212                    result.add_possible_browser_tag("Electron", version);
213                }
214                // Bot
215                else if tag.contains("Bot") || tag.contains("bot") {
216                    result.add_possible_browser_tag(tag, version);
217                }
218                // Mobile
219                else if tag == "Mobile" {
220                    result.mobile_hint = true;
221
222                    result.add_tag("Mobile", version);
223                }
224                // Safari
225                else if tag == "Safari" {
226                    result.safari_hint = true;
227
228                    result.add_tag("Safari", version);
229                }
230                //
231                else {
232                    result.add_tag(tag, version);
233                }
234            }
235
236            win.slide_window_by(1);
237        }
238
239        result
240    }
241}
242
243#[derive(Debug, Default)]
244pub struct TokenizerResult<'a> {
245    pub position: usize,
246    pub tokens: Vec<Token<'a>>,
247    pub possible_os_token: Option<Token<'a>>,
248    pub possible_browser_token: Option<Token<'a>>,
249
250    // Hints
251    pub linux_hint: bool,
252    pub ios_hint: bool,
253    pub macos_hint: bool,
254    pub windows_hint: bool,
255    pub mobile_hint: bool,
256    pub safari_hint: bool,
257    pub playstation_hint: bool,
258    pub huawei_hint: bool,
259}
260
261impl<'a> TokenizerResult<'a> {
262    pub fn add_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
263        self.tokens.push(Token {
264            position: self.position,
265            tag,
266            version,
267        });
268        self.position += 1;
269    }
270
271    pub fn add_possible_os_and_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
272        self.add_possible_os_tag_impl(tag, version);
273        self.add_possible_browser_tag_impl(tag, version);
274
275        self.add_tag(tag, version);
276    }
277
278    pub fn add_possible_os_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
279        self.add_possible_os_tag_impl(tag, version);
280        self.add_tag(tag, version);
281    }
282
283    pub fn add_possible_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
284        self.add_possible_browser_tag_impl(tag, version);
285        self.add_tag(tag, version);
286    }
287
288    fn add_possible_os_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
289        if self.possible_os_token.is_some() {
290            return;
291        }
292
293        if version.is_none() {
294            return;
295        }
296
297        self.possible_os_token = Some(Token {
298            position: self.position,
299            tag,
300            version,
301        });
302    }
303
304    fn add_possible_browser_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
305        if version.is_none() {
306            return;
307        }
308
309        self.possible_browser_token = Some(Token {
310            position: self.position,
311            tag,
312            version,
313        });
314    }
315}
316
317#[derive(Debug, Clone)]
318pub struct Token<'a> {
319    pub position: usize,
320    pub tag: &'a str,
321    pub version: Option<&'a str>,
322}
323
324impl<'a> Token<'a> {
325    pub fn get_version(&self) -> Option<Version<'a>> {
326        let version = self.version?;
327
328        if self.tag == "Windows" {
329            let mapped = match version {
330                "5.1" => "XP",
331                "5.2" => "XP",
332                "6.0" => "Vista",
333                "6.1" => "7", // lol
334                "6.3" => "8.1",
335                "10.0" => "10",
336                _ => return None,
337            };
338
339            return Some(Version::major(mapped));
340        }
341
342        let mut parts = version.split_terminator(['_', ',', '.']);
343
344        let mut version = Version::default();
345        if let Some(major) = parts.next() {
346            version.major = Some(major);
347        }
348
349        if let Some(minor) = parts.next() {
350            let trimmed_minor = take_until_non_numeric(minor);
351            version.minor = Some(trimmed_minor);
352        }
353
354        if let Some(patch) = parts.next() {
355            version.patch = Some(patch);
356        }
357
358        // odd: Don't include patch_minor here
359        if self.tag == "YaBrowser" || self.tag == "Opera" || self.tag == "NetFront NX" {
360            return Some(version);
361        }
362
363        if let Some(patch_minor) = parts.next() {
364            version.patch_minor = Some(patch_minor);
365        }
366
367        Some(version)
368    }
369}
370
/// Strips leading and trailing `(`, `)`, `;`, `,`, `+` and `_` from a token.
///
/// Characters in the middle of the token are left untouched. Returns `None` when
/// the input is `None` or when nothing remains after trimming.
fn trim_invalid_chars(s: Option<&str>) -> Option<&str> {
    s.map(|raw| raw.trim_matches(|c| matches!(c, '(' | ')' | ';' | ',' | '+' | '_')))
        .filter(|trimmed| !trimmed.is_empty())
}
382
/// Returns `true` when `s` is present and its first character is an ASCII digit.
fn starts_with_number(s: Option<&str>) -> bool {
    s.is_some_and(|text| text.starts_with(|c: char| c.is_ascii_digit()))
}
387
388fn consume_if_numeric<'a>(win: &mut WindowIter<'a>, tag: Option<&'a str>) -> Option<&'a str> {
389    if starts_with_number(tag) {
390        win.slide_window_by(1);
391        return tag;
392    }
393
394    None
395}
396
/// Returns the leading run of ASCII digits in `s` (possibly empty).
fn take_until_non_numeric(s: &str) -> &str {
    // `find` yields the byte offset of the first non-digit character, which is
    // always a char boundary, so the slice below cannot panic.
    let end = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
    &s[..end]
}
409}