// statsig_rust/evaluation/user_agent_parsing/statsig_uaparser/tokenizer.rs

1use crate::unwrap_or_return;
2
3use super::{window_iter::WindowIter, Version};
4
/// Stateless entry point for user-agent tokenization.
///
/// The type holds no data; it only namespaces `Tokenizer::run`.
pub struct Tokenizer;
6
7impl Tokenizer {
8    // Ideal UserAgent format: <product>/<product-version> (<os-information>) <engine> (<platform-details>) <optional-details>
9    pub fn run(input: &str) -> TokenizerResult<'_> {
10        let mut result = TokenizerResult::default();
11        let mut win: WindowIter<'_> = WindowIter::new(input);
12        while !win.is_empty() {
13            let (curr, next1, next2, next3) = win.get_window();
14            let (curr, next1, next2, next3) = (
15                trim_invalid_chars(curr),
16                trim_invalid_chars(next1),
17                trim_invalid_chars(next2),
18                trim_invalid_chars(next3),
19            );
20
21            let curr = match curr {
22                Some(val) => val,
23                None => {
24                    win.slide_window_by(1);
25                    continue;
26                }
27            };
28
29            if curr.starts_with("AppleTV") {
30                result.add_tag("ATV OS X", None);
31                win.slide_window_by(1);
32            }
33            // Mac OS X
34            else if curr == "Mac" && next1 == Some("OS") && next2 == Some("X") {
35                result.macos_hint = true;
36
37                result.add_possible_os_tag("Mac OS X", consume_if_numeric(&mut win, next3));
38
39                win.slide_window_by(2);
40            }
41            // iPhone OS
42            else if curr == "iOS" {
43                result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next1));
44                result.ios_hint = true;
45                win.slide_window_by(1);
46            } else if curr == "iPhone" && next1 == Some("OS") {
47                result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next2));
48
49                win.slide_window_by(1);
50            }
51            // iPad
52            else if curr.starts_with("iPad") {
53                result.ios_hint = true;
54
55                let mut parts = curr.split("iPad");
56                let _ = parts.next();
57                result.add_tag("iPad", trim_invalid_chars(parts.next()));
58            }
59            // iPhone
60            else if curr.starts_with("iPhone") {
61                result.ios_hint = true;
62
63                let mut parts = curr.split("iPhone");
64                let _ = parts.next();
65                result.add_tag("iPhone", trim_invalid_chars(parts.next()));
66            }
67            // iPhone (Apple)
68            else if curr.starts_with("Apple-iPhone7C2") {
69                result.ios_hint = true;
70
71                result.add_tag("iPhone", None);
72            }
73            // CPU OS
74            else if curr == "CPU" && next1 == Some("OS") {
75                result.add_tag("CPU OS", consume_if_numeric(&mut win, next2));
76                win.slide_window_by(1);
77            }
78            // Chrome OS
79            else if curr == "CrOS" {
80                let mut version = consume_if_numeric(&mut win, next1);
81                if version.is_none() {
82                    win.slide_window_by(1);
83                    version = consume_if_numeric(&mut win, next2);
84                }
85
86                result.add_possible_os_tag("Chrome OS", version);
87            }
88            // Chromecast
89            else if curr == "CrKey" {
90                result.add_possible_os_tag("Chromecast", None);
91            }
92            // PlayStation
93            else if curr == "PlayStation" {
94                result.playstation_hint = true;
95
96                result.add_tag("PlayStation", None);
97            }
98            // Android
99            else if curr == "Android" {
100                result.add_possible_os_tag("Android", consume_if_numeric(&mut win, next1));
101            }
102            // Windows Phone
103            else if curr == "Windows" && next1 == Some("Phone") {
104                result.add_possible_os_tag("Windows Phone", consume_if_numeric(&mut win, next2));
105                win.slide_window_by(1);
106            }
107            // Windows
108            else if curr.starts_with("Windows") {
109                result.windows_hint = true;
110
111                let version = if next1 == Some("NT") {
112                    consume_if_numeric(&mut win, next2).inspect(|_| {
113                        win.slide_window_by(1); // extra slide to skip the NT
114                    })
115                } else if next1.is_some_and(|s| s.starts_with("XP")) {
116                    win.slide_window_by(1);
117                    Some("XP")
118                } else {
119                    consume_if_numeric(&mut win, next1)
120                };
121
122                result.add_possible_os_tag("Windows", version);
123            }
124            // Yahoo Slurp
125            else if curr == "Yahoo!" && next1 == Some("Slurp") {
126                result.add_tag("Yahoo! Slurp", None);
127                win.slide_window_by(1);
128            }
129            // Red Hat
130            else if curr == "Red" && next1 == Some("Hat") {
131                result.add_possible_os_tag("Red Hat", None);
132
133                win.slide_window_by(1);
134            }
135            // Ubuntu
136            else if curr == "Ubuntu" {
137                result.add_possible_os_tag("Ubuntu", consume_if_numeric(&mut win, next1));
138            }
139            // Mobile
140            else if curr == "Mobile" {
141                result.mobile_hint = true;
142
143                result.add_tag("Mobile", None);
144            }
145            // Linux
146            else if curr == "Linux" {
147                result.linux_hint = true;
148                result.add_tag("Linux", None);
149            }
150            // Nintendo
151            else if curr == "Nintendo" && next1 == Some("3DS") {
152                result.add_tag("NetFront NX", None);
153                win.slide_window_by(1);
154            }
155            // Skip
156            else if curr == "like" || curr.len() <= 2 {
157                win.slide_window_by(1);
158                continue;
159            }
160            // Rest
161            else {
162                let parts = curr.split_once(['/', ';', ':']);
163                let tag = trim_invalid_chars(parts.map(|(t, _)| t)).unwrap_or(curr);
164                let version = trim_invalid_chars(parts.map(|(_, v)| v));
165
166                if tag == "Kindle" {
167                    result.add_possible_os_and_browser_tag("Kindle", version);
168                }
169                //
170                else if tag == "FxiOS" {
171                    result.add_possible_browser_tag("Firefox iOS", version);
172                }
173                //
174                else if tag == "CriOS" {
175                    result.add_possible_browser_tag("Chrome Mobile iOS", version);
176                }
177                //
178                else if tag == "GSA" {
179                    result.add_possible_browser_tag("Google", version);
180                }
181                //
182                else if tag == "YisouSpider" {
183                    result.add_possible_browser_tag("YisouSpider", version);
184                }
185                //
186                else if tag == "Edg" || tag == "Edge" {
187                    result.add_tag("Edge", version);
188                }
189                //
190                else if tag == "OPR" {
191                    result.add_possible_browser_tag("Opera", version);
192                }
193                //
194                else if tag == "SamsungBrowser" {
195                    result.add_possible_browser_tag("Samsung Internet", version);
196                }
197                //
198                else if tag == "HuaweiBrowser" {
199                    result.huawei_hint = true;
200
201                    result.add_tag("HuaweiBrowser", version);
202                }
203                //
204                else if tag == "ChatGPT-User" {
205                    result.add_possible_browser_tag("ChatGPT-User", version);
206                }
207                //
208                else if tag == "OAI-SearchBot" {
209                    result.add_possible_browser_tag("OAI-SearchBot", version);
210                }
211                //
212                else if tag == "NX" {
213                    result.add_possible_browser_tag("NetFront NX", version);
214                }
215                //
216                else if tag == "Electron" {
217                    result.add_possible_browser_tag("Electron", version);
218                }
219                // Bot or crawler
220                else if tag.contains("Bot")
221                    || tag.contains("bot")
222                    || tag.contains("crawler")
223                    || tag.contains("Crawler")
224                {
225                    result.add_possible_browser_tag(tag, version);
226                }
227                // Mobile
228                else if tag == "Mobile" {
229                    result.mobile_hint = true;
230
231                    result.add_tag("Mobile", version);
232                }
233                // Safari
234                else if tag == "Safari" {
235                    result.safari_hint = true;
236
237                    result.add_tag("Safari", version);
238                } else if tag == "CFNetwork" {
239                    result.cfnetwork_hint = true;
240                    result.ios_hint = true;
241                } else if tag.contains("crawler") || version.is_some_and(|v| v.contains("crawler"))
242                {
243                    result.crawler_hint = true;
244                }
245                //
246                else {
247                    result.add_tag(tag, version);
248                }
249            }
250
251            win.slide_window_by(1);
252        }
253
254        result
255    }
256}
257
258#[derive(Debug, Default)]
259pub struct TokenizerResult<'a> {
260    pub position: usize,
261    pub tokens: Vec<Token<'a>>,
262    pub possible_os_token: Option<Token<'a>>,
263    pub possible_browser_token: Option<Token<'a>>,
264
265    // Hints
266    pub linux_hint: bool,
267    pub ios_hint: bool,
268    pub macos_hint: bool,
269    pub windows_hint: bool,
270    pub mobile_hint: bool,
271    pub safari_hint: bool,
272    pub playstation_hint: bool,
273    pub huawei_hint: bool,
274    pub cfnetwork_hint: bool,
275    pub crawler_hint: bool,
276}
277
278impl<'a> TokenizerResult<'a> {
279    pub fn add_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
280        self.tokens.push(Token {
281            position: self.position,
282            tag,
283            version,
284        });
285        self.position += 1;
286    }
287
288    pub fn add_possible_os_and_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
289        self.add_possible_os_tag_impl(tag, version);
290        self.add_possible_browser_tag_impl(tag, version);
291
292        self.add_tag(tag, version);
293    }
294
295    pub fn add_possible_os_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
296        self.add_possible_os_tag_impl(tag, version);
297        self.add_tag(tag, version);
298    }
299
300    pub fn add_possible_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
301        self.add_possible_browser_tag_impl(tag, version);
302        self.add_tag(tag, version);
303    }
304
305    fn add_possible_os_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
306        if self.possible_os_token.is_some() {
307            return;
308        }
309
310        if version.is_none() {
311            return;
312        }
313
314        self.possible_os_token = Some(Token {
315            position: self.position,
316            tag,
317            version,
318        });
319    }
320
321    fn add_possible_browser_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
322        if version.is_none() {
323            return;
324        }
325        if self.possible_browser_token.is_some()
326            && (tag.contains(".com")
327                || tag.contains(".net")
328                || tag.contains(".org")
329                || tag.contains(".html")
330                || tag.contains("http://")
331                || tag.contains("https://"))
332        {
333            return;
334        }
335
336        self.possible_browser_token = Some(Token {
337            position: self.position,
338            tag,
339            version,
340        });
341    }
342}
343
/// One recognised piece of a user-agent string.
#[derive(Debug, Clone)]
pub struct Token<'a> {
    /// Index of this token among the recorded tokens (0-based, in insertion order).
    pub position: usize,
    /// Name of the token.
    pub tag: &'a str,
    /// Raw, unparsed version text attached to the tag, if any.
    pub version: Option<&'a str>,
}
350
351impl<'a> Token<'a> {
352    pub fn get_version(&self) -> Option<Version<'a>> {
353        let version = unwrap_or_return!(self.version, Some(Version::major("0.0.0")));
354
355        if self.tag == "Windows" {
356            let mapped = match version {
357                "5.1" => "XP",
358                "5.2" => "XP",
359                "6.0" => "Vista",
360                "6.1" => "7", // lol
361                "6.3" => "8.1",
362                "10.0" => "10",
363                _ => "0.0.0",
364            };
365
366            return Some(Version::major(mapped));
367        }
368
369        let mut parts = version.split_terminator(['_', ',', '.']);
370
371        let mut version = Version::default();
372        if let Some(major) = parts.next() {
373            version.major = Some(major);
374        }
375
376        if let Some(minor) = parts.next() {
377            let trimmed_minor = take_until_non_numeric(minor);
378            version.minor = Some(trimmed_minor);
379        }
380
381        if let Some(patch) = parts.next() {
382            version.patch = Some(patch);
383        }
384
385        // odd: Don't include patch_minor here
386        if self.tag == "YaBrowser" || self.tag == "Opera" || self.tag == "NetFront NX" {
387            return Some(version);
388        }
389
390        if let Some(patch_minor) = parts.next() {
391            version.patch_minor = Some(patch_minor);
392        }
393
394        Some(version)
395    }
396}
397
/// Strips `(`, `)`, `;`, `,`, `+` and `_` from both ends of the text.
///
/// Returns `None` when the input is `None` or when nothing is left after trimming.
/// Characters in the middle of the text are untouched.
fn trim_invalid_chars(s: Option<&str>) -> Option<&str> {
    s.map(|text| text.trim_matches(|c: char| matches!(c, '(' | ')' | ';' | ',' | '+' | '_')))
        .filter(|text| !text.is_empty())
}
409
/// Returns `true` when the text exists and its first character is an ASCII digit.
fn starts_with_number(s: Option<&str>) -> bool {
    s.and_then(|text| text.chars().next())
        .is_some_and(|first| first.is_ascii_digit())
}
414
415fn consume_if_numeric<'a>(win: &mut WindowIter<'a>, tag: Option<&'a str>) -> Option<&'a str> {
416    if starts_with_number(tag) {
417        win.slide_window_by(1);
418        return tag;
419    }
420
421    None
422}
423
/// Returns the leading run of ASCII digits of `s` (possibly empty).
fn take_until_non_numeric(s: &str) -> &str {
    // `find` yields the byte offset of the first non-digit, which is always a valid
    // char boundary; with no such character the whole string is digits.
    match s.find(|c: char| !c.is_ascii_digit()) {
        Some(end) => &s[..end],
        None => s,
    }
}
436}