Skip to main content

systemprompt_analytics/services/extractor/
mod.rs

1//! HTTP request → [`SessionAnalytics`] extraction.
2//!
3//! `HeaderValue::to_str().ok()` is used liberally below. That is deliberate:
4//! a non-ASCII header value is not actionable for the analytics pipeline and
5//! must not abort session creation. Treating those headers as absent is the
6//! correct fallback — the corresponding session field stays `None` and
7//! downstream consumers treat the request as un-attributed for that
8//! dimension.
9
10use http::{HeaderMap, Uri};
11use std::collections::HashMap;
12
13use axum::extract::Request;
14
15use super::ai_crawler_keywords::matches_ai_crawler;
16use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
17use super::detection;
18use super::user_agent::parse_user_agent;
19use crate::GeoIpReader;
20
21mod geoip;
22
/// Per-session attribution data extracted from an incoming HTTP request.
///
/// Every field is optional: a dimension that cannot be determined from the
/// request is left as `None`. `Default` yields a value with all fields `None`.
#[derive(Clone, Debug, Default)]
pub struct SessionAnalytics {
    /// Client IP, taken from `X-Forwarded-For`, `X-Real-IP`, or the socket
    /// address (in that order of preference).
    pub ip_address: Option<String>,
    /// Raw `User-Agent` header value.
    pub user_agent: Option<String>,
    /// Device type as reported by the user-agent parser.
    pub device_type: Option<String>,
    /// Browser as reported by the user-agent parser.
    pub browser: Option<String>,
    /// Operating system as reported by the user-agent parser.
    pub os: Option<String>,
    /// Value of the `X-Fingerprint` header.
    pub fingerprint_hash: Option<String>,
    /// First language entry of the `Accept-Language` header.
    pub preferred_locale: Option<String>,
    /// Country resolved from `ip_address` by the GeoIP lookup.
    pub country: Option<String>,
    /// Region resolved from `ip_address` by the GeoIP lookup.
    pub region: Option<String>,
    /// City resolved from `ip_address` by the GeoIP lookup.
    pub city: Option<String>,
    /// Referrer source derived from `referrer_url`.
    pub referrer_source: Option<String>,
    /// Raw `Referer` header value.
    pub referrer_url: Option<String>,
    /// Request path; only set when content routing reports an HTML page.
    pub landing_page: Option<String>,
    /// Full request URI; only set when content routing reports an HTML page.
    pub entry_url: Option<String>,
    /// `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
    /// `utm_content` query parameter.
    pub utm_content: Option<String>,
    /// `utm_term` query parameter.
    pub utm_term: Option<String>,
}
45
46impl SessionAnalytics {
47    pub fn from_headers(headers: &HeaderMap) -> Self {
48        Self::from_headers_with_geoip(headers, None)
49    }
50
51    pub fn from_headers_with_geoip(
52        headers: &HeaderMap,
53        geoip_reader: Option<&GeoIpReader>,
54    ) -> Self {
55        Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
56    }
57
58    pub fn from_headers_with_geoip_and_socket(
59        headers: &HeaderMap,
60        geoip_reader: Option<&GeoIpReader>,
61        socket_addr: Option<std::net::SocketAddr>,
62    ) -> Self {
63        let user_agent = headers
64            .get("user-agent")
65            .and_then(|v| v.to_str().ok())
66            .map(ToString::to_string);
67
68        let ip_address = headers
69            .get("x-forwarded-for")
70            .and_then(|v| v.to_str().ok())
71            .and_then(|s| s.split(',').next())
72            .map(|s| s.trim().to_string())
73            .or_else(|| {
74                headers
75                    .get("x-real-ip")
76                    .and_then(|v| v.to_str().ok())
77                    .map(ToString::to_string)
78            })
79            .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
80
81        let fingerprint_hash = headers
82            .get("x-fingerprint")
83            .and_then(|v| v.to_str().ok())
84            .map(ToString::to_string);
85
86        let preferred_locale = headers
87            .get("accept-language")
88            .and_then(|v| v.to_str().ok())
89            .and_then(|s| s.split(',').next())
90            .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
91
92        let (device_type, browser, os) = user_agent
93            .as_ref()
94            .map_or((None, None, None), |ua| parse_user_agent(ua));
95
96        let (country, region, city) = ip_address
97            .as_ref()
98            .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
99            .unwrap_or((None, None, None));
100
101        let referrer_url = headers
102            .get("referer")
103            .and_then(|v| v.to_str().ok())
104            .map(ToString::to_string);
105
106        let referrer_source = referrer_url
107            .as_ref()
108            .and_then(|url| Self::parse_referrer_source(url));
109
110        Self {
111            ip_address,
112            user_agent,
113            device_type,
114            browser,
115            os,
116            fingerprint_hash,
117            preferred_locale,
118            country,
119            region,
120            city,
121            referrer_source,
122            referrer_url,
123            landing_page: None,
124            entry_url: None,
125            utm_source: None,
126            utm_medium: None,
127            utm_campaign: None,
128            utm_content: None,
129            utm_term: None,
130        }
131    }
132
133    pub fn from_headers_and_uri(
134        headers: &HeaderMap,
135        uri: Option<&Uri>,
136        geoip_reader: Option<&GeoIpReader>,
137        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
138    ) -> Self {
139        let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
140
141        if let Some(uri) = uri {
142            let query_params = Self::parse_query_params(uri);
143
144            analytics.utm_source = query_params.get("utm_source").cloned();
145            analytics.utm_medium = query_params.get("utm_medium").cloned();
146            analytics.utm_campaign = query_params.get("utm_campaign").cloned();
147            analytics.utm_content = query_params.get("utm_content").cloned();
148            analytics.utm_term = query_params.get("utm_term").cloned();
149
150            let is_html_page =
151                content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
152
153            if is_html_page {
154                analytics.entry_url = Some(uri.to_string());
155                analytics.landing_page = Some(uri.path().to_string());
156            }
157        }
158
159        analytics
160    }
161
162    pub fn from_request(
163        request: &Request,
164        geoip_reader: Option<&GeoIpReader>,
165        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
166    ) -> Self {
167        Self::from_headers_and_uri(
168            request.headers(),
169            Some(request.uri()),
170            geoip_reader,
171            content_routing,
172        )
173    }
174
175    fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
176        uri.query().map_or_else(HashMap::new, |q| {
177            q.split('&')
178                .filter_map(|param| {
179                    let mut parts = param.splitn(2, '=');
180                    Some((parts.next()?.to_string(), parts.next()?.to_string()))
181                })
182                .collect()
183        })
184    }
185
186    fn lookup_geoip(
187        ip_str: &str,
188        geoip_reader: Option<&GeoIpReader>,
189    ) -> Option<(Option<String>, Option<String>, Option<String>)> {
190        geoip::lookup_geoip(ip_str, geoip_reader)
191    }
192
193    fn parse_referrer_source(url: &str) -> Option<String> {
194        geoip::parse_referrer_source(url)
195    }
196
197    pub fn is_bot(&self) -> bool {
198        if self.is_ai_crawler() {
199            return false;
200        }
201        self.user_agent
202            .as_ref()
203            .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
204    }
205
206    pub fn is_ai_crawler(&self) -> bool {
207        self.user_agent
208            .as_ref()
209            .is_some_and(|ua| matches_ai_crawler(ua))
210    }
211
212    pub fn is_bot_ip(&self) -> bool {
213        self.ip_address
214            .as_ref()
215            .is_some_and(|ip| matches_bot_ip_range(ip))
216    }
217
218    pub fn is_spam_referrer(&self) -> bool {
219        self.referrer_url
220            .as_ref()
221            .is_some_and(|url| detection::is_spam_referrer(url))
222    }
223
224    pub fn is_datacenter_ip(&self) -> bool {
225        self.ip_address
226            .as_ref()
227            .is_some_and(|ip| detection::is_datacenter_ip(ip))
228    }
229
230    pub fn is_high_risk_country(&self) -> bool {
231        self.country
232            .as_ref()
233            .is_some_and(|c| detection::is_high_risk_country(c))
234    }
235
236    pub fn should_skip_tracking(&self) -> bool {
237        if self.is_ai_crawler() {
238            return false;
239        }
240        self.is_bot()
241            || self.is_bot_ip()
242            || self.is_datacenter_ip()
243            || self.is_high_risk_country()
244            || self.is_spam_referrer()
245    }
246}