Skip to main content

systemprompt_analytics/services/
extractor.rs

1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3
4#[cfg(feature = "web")]
5use axum::extract::Request;
6
7use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
8use super::detection;
9use super::user_agent::parse_user_agent;
10use crate::GeoIpReader;
11
/// Per-session analytics attributes gathered from an incoming HTTP request.
///
/// Every field is optional: a value is only present when the corresponding
/// header, query parameter, or lookup produced one.
#[derive(Clone, Debug, Default)]
pub struct SessionAnalytics {
    // --- Client identity -------------------------------------------------
    /// Client IP address as text.
    pub ip_address: Option<String>,
    /// Raw `User-Agent` header value.
    pub user_agent: Option<String>,
    /// Device type derived from the user agent.
    pub device_type: Option<String>,
    /// Browser derived from the user agent.
    pub browser: Option<String>,
    /// Operating system derived from the user agent.
    pub os: Option<String>,
    /// Client-supplied fingerprint hash.
    pub fingerprint_hash: Option<String>,
    /// Preferred locale of the client.
    pub preferred_locale: Option<String>,

    // --- Geolocation -----------------------------------------------------
    /// Country of the client IP.
    pub country: Option<String>,
    /// Region / subdivision of the client IP.
    pub region: Option<String>,
    /// City of the client IP.
    pub city: Option<String>,

    // --- Traffic source --------------------------------------------------
    /// Source (host) extracted from the referrer URL.
    pub referrer_source: Option<String>,
    /// Full referrer URL.
    pub referrer_url: Option<String>,
    /// Path of the first page hit in the session.
    pub landing_page: Option<String>,
    /// Full URI of the first page hit in the session.
    pub entry_url: Option<String>,

    // --- Campaign tagging ------------------------------------------------
    /// `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
}
32
33impl SessionAnalytics {
34    pub fn from_headers(headers: &HeaderMap) -> Self {
35        Self::from_headers_with_geoip(headers, None)
36    }
37
38    pub fn from_headers_with_geoip(
39        headers: &HeaderMap,
40        geoip_reader: Option<&GeoIpReader>,
41    ) -> Self {
42        Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
43    }
44
45    pub fn from_headers_with_geoip_and_socket(
46        headers: &HeaderMap,
47        geoip_reader: Option<&GeoIpReader>,
48        socket_addr: Option<std::net::SocketAddr>,
49    ) -> Self {
50        let user_agent = headers
51            .get("user-agent")
52            .and_then(|v| v.to_str().ok())
53            .map(ToString::to_string);
54
55        let ip_address = headers
56            .get("x-forwarded-for")
57            .and_then(|v| v.to_str().ok())
58            .and_then(|s| s.split(',').next())
59            .map(|s| s.trim().to_string())
60            .or_else(|| {
61                headers
62                    .get("x-real-ip")
63                    .and_then(|v| v.to_str().ok())
64                    .map(ToString::to_string)
65            })
66            .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
67
68        let fingerprint_hash = headers
69            .get("x-fingerprint")
70            .and_then(|v| v.to_str().ok())
71            .map(ToString::to_string);
72
73        let preferred_locale = headers
74            .get("accept-language")
75            .and_then(|v| v.to_str().ok())
76            .and_then(|s| s.split(',').next())
77            .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
78
79        let (device_type, browser, os) = user_agent
80            .as_ref()
81            .map_or((None, None, None), |ua| parse_user_agent(ua));
82
83        let (country, region, city) = ip_address
84            .as_ref()
85            .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
86            .unwrap_or((None, None, None));
87
88        let referrer_url = headers
89            .get("referer")
90            .and_then(|v| v.to_str().ok())
91            .map(ToString::to_string);
92
93        let referrer_source = referrer_url
94            .as_ref()
95            .and_then(|url| Self::parse_referrer_source(url));
96
97        Self {
98            ip_address,
99            user_agent,
100            device_type,
101            browser,
102            os,
103            fingerprint_hash,
104            preferred_locale,
105            country,
106            region,
107            city,
108            referrer_source,
109            referrer_url,
110            landing_page: None,
111            entry_url: None,
112            utm_source: None,
113            utm_medium: None,
114            utm_campaign: None,
115        }
116    }
117
118    pub fn from_headers_and_uri(
119        headers: &HeaderMap,
120        uri: Option<&Uri>,
121        geoip_reader: Option<&GeoIpReader>,
122        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
123    ) -> Self {
124        let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
125
126        if let Some(uri) = uri {
127            let query_params = Self::parse_query_params(uri);
128
129            analytics.utm_source = query_params.get("utm_source").cloned();
130            analytics.utm_medium = query_params.get("utm_medium").cloned();
131            analytics.utm_campaign = query_params.get("utm_campaign").cloned();
132
133            let is_html_page =
134                content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
135
136            if is_html_page {
137                analytics.entry_url = Some(uri.to_string());
138                analytics.landing_page = Some(uri.path().to_string());
139            }
140        }
141
142        analytics
143    }
144
145    #[cfg(feature = "web")]
146    pub fn from_request(
147        request: &Request,
148        geoip_reader: Option<&GeoIpReader>,
149        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
150    ) -> Self {
151        Self::from_headers_and_uri(
152            request.headers(),
153            Some(request.uri()),
154            geoip_reader,
155            content_routing,
156        )
157    }
158
159    fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
160        uri.query().map_or_else(HashMap::new, |q| {
161            q.split('&')
162                .filter_map(|param| {
163                    let mut parts = param.splitn(2, '=');
164                    Some((parts.next()?.to_string(), parts.next()?.to_string()))
165                })
166                .collect()
167        })
168    }
169
170    fn lookup_geoip(
171        ip_str: &str,
172        geoip_reader: Option<&GeoIpReader>,
173    ) -> Option<(Option<String>, Option<String>, Option<String>)> {
174        let reader = geoip_reader?;
175        let ip: std::net::IpAddr = ip_str.parse().ok()?;
176
177        let city_data: maxminddb::geoip2::City = reader.lookup(ip).ok()?.decode().ok()??;
178
179        let country = city_data.country.iso_code.map(ToString::to_string);
180
181        let region = city_data
182            .subdivisions
183            .first()
184            .and_then(|s| s.iso_code)
185            .map(ToString::to_string);
186
187        let city_name = city_data.city.names.english.map(ToString::to_string);
188
189        Some((country, region, city_name))
190    }
191
192    fn parse_referrer_source(url: &str) -> Option<String> {
193        url::Url::parse(url)
194            .ok()
195            .and_then(|parsed_url| parsed_url.host_str().map(ToString::to_string))
196            .and_then(|host| {
197                if host.parse::<std::net::IpAddr>().is_ok() {
198                    None
199                } else {
200                    Some(host)
201                }
202            })
203    }
204
205    pub fn is_bot(&self) -> bool {
206        self.user_agent
207            .as_ref()
208            .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
209    }
210
211    pub fn is_bot_ip(&self) -> bool {
212        self.ip_address
213            .as_ref()
214            .is_some_and(|ip| matches_bot_ip_range(ip))
215    }
216
217    pub fn is_spam_referrer(&self) -> bool {
218        self.referrer_url
219            .as_ref()
220            .is_some_and(|url| detection::is_spam_referrer(url))
221    }
222
223    pub fn is_datacenter_ip(&self) -> bool {
224        self.ip_address
225            .as_ref()
226            .is_some_and(|ip| detection::is_datacenter_ip(ip))
227    }
228
229    pub fn is_high_risk_country(&self) -> bool {
230        self.country
231            .as_ref()
232            .is_some_and(|c| detection::is_high_risk_country(c))
233    }
234
235    pub fn should_skip_tracking(&self) -> bool {
236        self.is_bot()
237            || self.is_bot_ip()
238            || self.is_datacenter_ip()
239            || self.is_high_risk_country()
240            || self.is_spam_referrer()
241    }
242}