Skip to main content

har/
classify.rs

1use serde::Serialize;
2
3#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
4#[serde(rename_all = "lowercase")]
5pub enum ResourceType {
6    Api,
7    Media,
8    Static,
9    Analytics,
10    Document,
11    Other,
12}
13
/// Domains whose requests [`classify`] reports as [`ResourceType::Analytics`].
///
/// Entries are bare domain names (no scheme, port or path) written in lowercase;
/// they are compared against the host extracted from the request URL.
const ANALYTICS_HOSTS: &[&str] = &[
    "google-analytics.com",
    "analytics.google.com",
    "doubleclick.net",
    "googletagmanager.com",
    "segment.io",
    "mixpanel.com",
    "amplitude.com",
    "sentry.io",
    "crashlytics.com",
];
25
26/// Classify an entry by content-type, then URL extension, then host.
27pub fn classify(content_type: Option<&str>, url: &str) -> ResourceType {
28    let host = host_of(url);
29    if ANALYTICS_HOSTS.iter().any(|h| host.ends_with(h)) {
30        return ResourceType::Analytics;
31    }
32    if let Some(ct) = content_type {
33        let ct = ct
34            .split(';')
35            .next()
36            .unwrap_or(ct)
37            .trim()
38            .to_ascii_lowercase();
39        if let Some(rt) = by_mime(&ct) {
40            return rt;
41        }
42    }
43    by_extension(url)
44}
45
46fn by_mime(ct: &str) -> Option<ResourceType> {
47    if ct.contains("json")
48        || ct.contains("graphql")
49        || ct.contains("grpc")
50        || ct.contains("protobuf")
51    {
52        return Some(ResourceType::Api);
53    }
54    if ct.contains("xml") && !ct.contains("html") {
55        return Some(ResourceType::Api);
56    }
57    if ct.starts_with("image/") || ct.starts_with("video/") || ct.starts_with("audio/") {
58        return Some(ResourceType::Media);
59    }
60    if ct.contains("javascript")
61        || ct.contains("css")
62        || ct.contains("font")
63        || ct.contains("ecmascript")
64    {
65        return Some(ResourceType::Static);
66    }
67    if ct.contains("html") {
68        return Some(ResourceType::Document);
69    }
70    None
71}
72
73fn by_extension(url: &str) -> ResourceType {
74    let path = url.split(['?', '#']).next().unwrap_or(url);
75    let ext = path.rsplit('.').next().unwrap_or("").to_ascii_lowercase();
76    match ext.as_str() {
77        "png" | "jpg" | "jpeg" | "gif" | "webp" | "svg" | "ico" | "mp4" | "webm" | "ts" | "m4s"
78        | "mp3" | "aac" | "m3u8" => ResourceType::Media,
79        "js" | "mjs" | "css" | "woff" | "woff2" | "ttf" | "otf" | "eot" => ResourceType::Static,
80        "json" => ResourceType::Api,
81        "html" | "htm" => ResourceType::Document,
82        _ => ResourceType::Other,
83    }
84}
85
86fn host_of(url: &str) -> String {
87    url::Url::parse(url)
88        .ok()
89        .and_then(|u| u.host_str().map(|h| h.to_string()))
90        .unwrap_or_default()
91}
92
#[cfg(test)]
mod tests {
    use super::{ResourceType, classify};

    // A JSON content type is reported as an API call.
    #[test]
    fn json_is_api() {
        let kind = classify(Some("application/json"), "https://api.x/v1/y");
        assert_eq!(kind, ResourceType::Api);
    }

    // Image content types are media.
    #[test]
    fn image_is_media() {
        let kind = classify(Some("image/png"), "https://x/a.png");
        assert_eq!(kind, ResourceType::Media);
    }

    // Video content types are media.
    #[test]
    fn video_is_media() {
        let kind = classify(Some("video/mp4"), "https://x/a.mp4");
        assert_eq!(kind, ResourceType::Media);
    }

    // Scripts are static assets.
    #[test]
    fn javascript_is_static() {
        let kind = classify(Some("application/javascript"), "https://x/a.js");
        assert_eq!(kind, ResourceType::Static);
    }

    // With no content type the URL extension decides.
    #[test]
    fn falls_back_to_extension() {
        let kind = classify(None, "https://x/styles.css");
        assert_eq!(kind, ResourceType::Static);
    }

    // An analytics host wins even when the content type says JSON.
    #[test]
    fn analytics_host() {
        let url = "https://www.google-analytics.com/collect";
        let kind = classify(Some("application/json"), url);
        assert_eq!(kind, ResourceType::Analytics);
    }
}