statsig_rust/evaluation/user_agent_parsing/statsig_uaparser/
tokenizer.rs1use super::{window_iter::WindowIter, Version};
2
3pub struct Tokenizer;
4
5impl Tokenizer {
6 pub fn run(input: &str) -> TokenizerResult {
8 let mut result = TokenizerResult::default();
9 let mut win = WindowIter::new(input);
10
11 while !win.is_empty() {
12 let (curr, next1, next2, next3) = win.get_window();
13 let (curr, next1, next2, next3) = (
14 trim_invalid_chars(curr),
15 trim_invalid_chars(next1),
16 trim_invalid_chars(next2),
17 trim_invalid_chars(next3),
18 );
19
20 let curr = match curr {
21 Some(val) => val,
22 None => {
23 win.slide_window_by(1);
24 continue;
25 }
26 };
27
28 if curr.starts_with("AppleTV") {
29 result.add_tag("ATV OS X", None);
30 win.slide_window_by(1);
31 }
32 else if curr == "Mac" && next1 == Some("OS") && next2 == Some("X") {
34 result.macos_hint = true;
35
36 result.add_possible_os_tag("Mac OS X", consume_if_numeric(&mut win, next3));
37
38 win.slide_window_by(2);
39 }
40 else if curr == "iPhone" && next1 == Some("OS") {
42 result.add_possible_os_tag("iOS", consume_if_numeric(&mut win, next2));
43
44 win.slide_window_by(1);
45 }
46 else if curr.starts_with("iPad") {
48 result.ios_hint = true;
49
50 let mut parts = curr.split("iPad");
51 let _ = parts.next();
52 result.add_tag("iPad", trim_invalid_chars(parts.next()));
53 }
54 else if curr.starts_with("iPhone") {
56 result.ios_hint = true;
57
58 let mut parts = curr.split("iPhone");
59 let _ = parts.next();
60 result.add_tag("iPhone", trim_invalid_chars(parts.next()));
61 }
62 else if curr.starts_with("Apple-iPhone7C2") {
64 result.ios_hint = true;
65
66 result.add_tag("iPhone", None);
67 }
68 else if curr == "CPU" && next1 == Some("OS") {
70 result.add_tag("CPU OS", consume_if_numeric(&mut win, next2));
71 win.slide_window_by(1);
72 }
73 else if curr == "CrOS" {
75 let mut version = consume_if_numeric(&mut win, next1);
76 if version.is_none() {
77 win.slide_window_by(1);
78 version = consume_if_numeric(&mut win, next2);
79 }
80
81 result.add_possible_os_tag("Chrome OS", version);
82 }
83 else if curr == "CrKey" {
85 result.add_possible_os_tag("Chromecast", None);
86 }
87 else if curr == "PlayStation" {
89 result.playstation_hint = true;
90
91 result.add_tag("PlayStation", None);
92 }
93 else if curr == "Android" {
95 result.add_possible_os_tag("Android", consume_if_numeric(&mut win, next1));
96 }
97 else if curr == "Windows" && next1 == Some("Phone") {
99 result.add_possible_os_tag("Windows Phone", consume_if_numeric(&mut win, next2));
100 win.slide_window_by(1);
101 }
102 else if curr.starts_with("Windows") {
104 result.windows_hint = true;
105
106 let version = if next1 == Some("NT") {
107 consume_if_numeric(&mut win, next2).inspect(|_| {
108 win.slide_window_by(1); })
110 } else if next1.is_some_and(|s| s.starts_with("XP")) {
111 win.slide_window_by(1);
112 Some("XP")
113 } else {
114 consume_if_numeric(&mut win, next1)
115 };
116
117 result.add_possible_os_tag("Windows", version);
118 }
119 else if curr == "Yahoo!" && next1 == Some("Slurp") {
121 result.add_tag("Yahoo! Slurp", None);
122 win.slide_window_by(1);
123 }
124 else if curr == "Red" && next1 == Some("Hat") {
126 result.add_possible_os_tag("Red Hat", None);
127
128 win.slide_window_by(1);
129 }
130 else if curr == "Ubuntu" {
132 result.add_possible_os_tag("Ubuntu", consume_if_numeric(&mut win, next1));
133 }
134 else if curr == "Mobile" {
136 result.mobile_hint = true;
137
138 result.add_tag("Mobile", None);
139 }
140 else if curr == "Linux" {
142 result.linux_hint = true;
143 result.add_tag("Linux", None);
144 }
145 else if curr == "Nintendo" && next1 == Some("3DS") {
147 result.add_tag("NetFront NX", None);
148 win.slide_window_by(1);
149 }
150 else if curr == "like" || curr.len() <= 2 {
152 win.slide_window_by(1);
153 continue;
154 }
155 else {
157 let parts = curr.split_once(['/', ';', ':']);
158 let tag = trim_invalid_chars(parts.map(|(t, _)| t)).unwrap_or(curr);
159 let version = trim_invalid_chars(parts.map(|(_, v)| v));
160
161 if tag == "Kindle" {
162 result.add_possible_os_and_browser_tag("Kindle", version);
163 }
164 else if tag == "FxiOS" {
166 result.add_possible_browser_tag("Firefox iOS", version);
167 }
168 else if tag == "CriOS" {
170 result.add_possible_browser_tag("Chrome Mobile iOS", version);
171 }
172 else if tag == "GSA" {
174 result.add_possible_browser_tag("Google", version);
175 }
176 else if tag == "YisouSpider" {
178 result.add_possible_browser_tag("YisouSpider", version);
179 }
180 else if tag == "Edg" || tag == "Edge" {
182 result.add_tag("Edge", version);
183 }
184 else if tag == "OPR" {
186 result.add_possible_browser_tag("Opera", version);
187 }
188 else if tag == "SamsungBrowser" {
190 result.add_possible_browser_tag("Samsung Internet", version);
191 }
192 else if tag == "HuaweiBrowser" {
194 result.huawei_hint = true;
195
196 result.add_tag("HuaweiBrowser", version);
197 }
198 else if tag == "ChatGPT-User" {
200 result.add_possible_browser_tag("ChatGPT-User", version);
201 }
202 else if tag == "OAI-SearchBot" {
204 result.add_possible_browser_tag("OAI-SearchBot", version);
205 }
206 else if tag == "NX" {
208 result.add_possible_browser_tag("NetFront NX", version);
209 }
210 else if tag == "Electron" {
212 result.add_possible_browser_tag("Electron", version);
213 }
214 else if tag.contains("Bot") || tag.contains("bot") {
216 result.add_possible_browser_tag(tag, version);
217 }
218 else if tag == "Mobile" {
220 result.mobile_hint = true;
221
222 result.add_tag("Mobile", version);
223 }
224 else if tag == "Safari" {
226 result.safari_hint = true;
227
228 result.add_tag("Safari", version);
229 }
230 else {
232 result.add_tag(tag, version);
233 }
234 }
235
236 win.slide_window_by(1);
237 }
238
239 result
240 }
241}
242
243#[derive(Debug, Default)]
244pub struct TokenizerResult<'a> {
245 pub position: usize,
246 pub tokens: Vec<Token<'a>>,
247 pub possible_os_token: Option<Token<'a>>,
248 pub possible_browser_token: Option<Token<'a>>,
249
250 pub linux_hint: bool,
252 pub ios_hint: bool,
253 pub macos_hint: bool,
254 pub windows_hint: bool,
255 pub mobile_hint: bool,
256 pub safari_hint: bool,
257 pub playstation_hint: bool,
258 pub huawei_hint: bool,
259}
260
261impl<'a> TokenizerResult<'a> {
262 pub fn add_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
263 self.tokens.push(Token {
264 position: self.position,
265 tag,
266 version,
267 });
268 self.position += 1;
269 }
270
271 pub fn add_possible_os_and_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
272 self.add_possible_os_tag_impl(tag, version);
273 self.add_possible_browser_tag_impl(tag, version);
274
275 self.add_tag(tag, version);
276 }
277
278 pub fn add_possible_os_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
279 self.add_possible_os_tag_impl(tag, version);
280 self.add_tag(tag, version);
281 }
282
283 pub fn add_possible_browser_tag(&mut self, tag: &'a str, version: Option<&'a str>) {
284 self.add_possible_browser_tag_impl(tag, version);
285 self.add_tag(tag, version);
286 }
287
288 fn add_possible_os_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
289 if self.possible_os_token.is_some() {
290 return;
291 }
292
293 if version.is_none() {
294 return;
295 }
296
297 self.possible_os_token = Some(Token {
298 position: self.position,
299 tag,
300 version,
301 });
302 }
303
304 fn add_possible_browser_tag_impl(&mut self, tag: &'a str, version: Option<&'a str>) {
305 if version.is_none() {
306 return;
307 }
308
309 self.possible_browser_token = Some(Token {
310 position: self.position,
311 tag,
312 version,
313 });
314 }
315}
316
/// A single recognised tag together with where it appeared and, when
/// available, the raw version text that accompanied it.
#[derive(Debug, Clone)]
pub struct Token<'a> {
    /// Zero-based insertion index of this token within the tokenizer result.
    pub position: usize,
    /// Tag name, either a fixed label or a slice of the input.
    pub tag: &'a str,
    /// Unparsed version text, if any was found next to the tag.
    pub version: Option<&'a str>,
}
323
324impl<'a> Token<'a> {
325 pub fn get_version(&self) -> Option<Version<'a>> {
326 let version = self.version?;
327
328 if self.tag == "Windows" {
329 let mapped = match version {
330 "5.1" => "XP",
331 "5.2" => "XP",
332 "6.0" => "Vista",
333 "6.1" => "7", "6.3" => "8.1",
335 "10.0" => "10",
336 _ => return None,
337 };
338
339 return Some(Version::major(mapped));
340 }
341
342 let mut parts = version.split_terminator(['_', ',', '.']);
343
344 let mut version = Version::default();
345 if let Some(major) = parts.next() {
346 version.major = Some(major);
347 }
348
349 if let Some(minor) = parts.next() {
350 let trimmed_minor = take_until_non_numeric(minor);
351 version.minor = Some(trimmed_minor);
352 }
353
354 if let Some(patch) = parts.next() {
355 version.patch = Some(patch);
356 }
357
358 if self.tag == "YaBrowser" || self.tag == "Opera" || self.tag == "NetFront NX" {
360 return Some(version);
361 }
362
363 if let Some(patch_minor) = parts.next() {
364 version.patch_minor = Some(patch_minor);
365 }
366
367 Some(version)
368 }
369}
370
/// Strips leading and trailing `(`, `)`, `;`, `,`, `+` and `_` characters.
///
/// Returns `None` when the input is `None` or when nothing is left after
/// trimming. Characters in the middle of the string are never touched.
fn trim_invalid_chars(s: Option<&str>) -> Option<&str> {
    s.map(|raw| raw.trim_matches(|c: char| matches!(c, '(' | ')' | ';' | ',' | '+' | '_')))
        .filter(|trimmed| !trimmed.is_empty())
}
382
/// Reports whether the string is present and its first character is an ASCII digit.
fn starts_with_number(s: Option<&str>) -> bool {
    matches!(
        s.and_then(|text| text.chars().next()),
        Some(first) if first.is_ascii_digit()
    )
}
387
388fn consume_if_numeric<'a>(win: &mut WindowIter<'a>, tag: Option<&'a str>) -> Option<&'a str> {
389 if starts_with_number(tag) {
390 win.slide_window_by(1);
391 return tag;
392 }
393
394 None
395}
396
/// Returns the leading run of ASCII digits in `s` (possibly empty).
fn take_until_non_numeric(s: &str) -> &str {
    // `find` yields the byte offset of the first non-digit character, which is
    // always a character boundary, so the slice below cannot split a character.
    let digits_end = s
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(s.len());

    &s[..digits_end]
}
409}