Skip to main content

systemprompt_analytics/services/
extractor.rs

1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3
4use axum::extract::Request;
5
6use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
7use super::detection;
8use super::user_agent::parse_user_agent;
9use crate::GeoIpReader;
10
/// Per-session analytics attributes collected from an incoming HTTP request.
///
/// Every field is optional: a value is only present when the corresponding
/// header, query parameter or lookup produced one.
#[derive(Clone, Debug, Default)]
pub struct SessionAnalytics {
    // --- Client identity -------------------------------------------------
    /// Client IP, taken from `x-forwarded-for`, then `x-real-ip`, then the
    /// socket peer address.
    pub ip_address: Option<String>,
    /// Raw `user-agent` header value.
    pub user_agent: Option<String>,
    /// Device type as reported by the user-agent parser.
    pub device_type: Option<String>,
    /// Browser as reported by the user-agent parser.
    pub browser: Option<String>,
    /// Operating system as reported by the user-agent parser.
    pub os: Option<String>,
    /// Raw `x-fingerprint` header value.
    pub fingerprint_hash: Option<String>,
    /// First language tag of the `accept-language` header, without its
    /// quality parameter.
    pub preferred_locale: Option<String>,

    // --- GeoIP-derived location -------------------------------------------
    /// Country ISO code from the GeoIP lookup.
    pub country: Option<String>,
    /// ISO code of the first subdivision from the GeoIP lookup.
    pub region: Option<String>,
    /// English city name from the GeoIP lookup.
    pub city: Option<String>,

    // --- Traffic source ---------------------------------------------------
    /// Host name extracted from `referrer_url`.
    pub referrer_source: Option<String>,
    /// Raw `referer` header value.
    pub referrer_url: Option<String>,
    /// Request path, recorded only when content routing classifies it as an
    /// HTML page.
    pub landing_page: Option<String>,
    /// Full request URI, recorded under the same condition as `landing_page`.
    pub entry_url: Option<String>,
    /// Value of the `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// Value of the `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// Value of the `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
}
31
32impl SessionAnalytics {
33    pub fn from_headers(headers: &HeaderMap) -> Self {
34        Self::from_headers_with_geoip(headers, None)
35    }
36
37    pub fn from_headers_with_geoip(
38        headers: &HeaderMap,
39        geoip_reader: Option<&GeoIpReader>,
40    ) -> Self {
41        Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
42    }
43
44    pub fn from_headers_with_geoip_and_socket(
45        headers: &HeaderMap,
46        geoip_reader: Option<&GeoIpReader>,
47        socket_addr: Option<std::net::SocketAddr>,
48    ) -> Self {
49        let user_agent = headers
50            .get("user-agent")
51            .and_then(|v| v.to_str().ok())
52            .map(ToString::to_string);
53
54        let ip_address = headers
55            .get("x-forwarded-for")
56            .and_then(|v| v.to_str().ok())
57            .and_then(|s| s.split(',').next())
58            .map(|s| s.trim().to_string())
59            .or_else(|| {
60                headers
61                    .get("x-real-ip")
62                    .and_then(|v| v.to_str().ok())
63                    .map(ToString::to_string)
64            })
65            .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
66
67        let fingerprint_hash = headers
68            .get("x-fingerprint")
69            .and_then(|v| v.to_str().ok())
70            .map(ToString::to_string);
71
72        let preferred_locale = headers
73            .get("accept-language")
74            .and_then(|v| v.to_str().ok())
75            .and_then(|s| s.split(',').next())
76            .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
77
78        let (device_type, browser, os) = user_agent
79            .as_ref()
80            .map_or((None, None, None), |ua| parse_user_agent(ua));
81
82        let (country, region, city) = ip_address
83            .as_ref()
84            .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
85            .unwrap_or((None, None, None));
86
87        let referrer_url = headers
88            .get("referer")
89            .and_then(|v| v.to_str().ok())
90            .map(ToString::to_string);
91
92        let referrer_source = referrer_url
93            .as_ref()
94            .and_then(|url| Self::parse_referrer_source(url));
95
96        Self {
97            ip_address,
98            user_agent,
99            device_type,
100            browser,
101            os,
102            fingerprint_hash,
103            preferred_locale,
104            country,
105            region,
106            city,
107            referrer_source,
108            referrer_url,
109            landing_page: None,
110            entry_url: None,
111            utm_source: None,
112            utm_medium: None,
113            utm_campaign: None,
114        }
115    }
116
117    pub fn from_headers_and_uri(
118        headers: &HeaderMap,
119        uri: Option<&Uri>,
120        geoip_reader: Option<&GeoIpReader>,
121        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
122    ) -> Self {
123        let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
124
125        if let Some(uri) = uri {
126            let query_params = Self::parse_query_params(uri);
127
128            analytics.utm_source = query_params.get("utm_source").cloned();
129            analytics.utm_medium = query_params.get("utm_medium").cloned();
130            analytics.utm_campaign = query_params.get("utm_campaign").cloned();
131
132            let is_html_page =
133                content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
134
135            if is_html_page {
136                analytics.entry_url = Some(uri.to_string());
137                analytics.landing_page = Some(uri.path().to_string());
138            }
139        }
140
141        analytics
142    }
143
144    pub fn from_request(
145        request: &Request,
146        geoip_reader: Option<&GeoIpReader>,
147        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
148    ) -> Self {
149        Self::from_headers_and_uri(
150            request.headers(),
151            Some(request.uri()),
152            geoip_reader,
153            content_routing,
154        )
155    }
156
157    fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
158        uri.query().map_or_else(HashMap::new, |q| {
159            q.split('&')
160                .filter_map(|param| {
161                    let mut parts = param.splitn(2, '=');
162                    Some((parts.next()?.to_string(), parts.next()?.to_string()))
163                })
164                .collect()
165        })
166    }
167
168    #[cfg(feature = "geolocation")]
169    fn lookup_geoip(
170        ip_str: &str,
171        geoip_reader: Option<&GeoIpReader>,
172    ) -> Option<(Option<String>, Option<String>, Option<String>)> {
173        let Some(reader) = geoip_reader else {
174            tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: reader not configured");
175            return None;
176        };
177
178        let ip: std::net::IpAddr = match ip_str.parse() {
179            Ok(ip) => ip,
180            Err(e) => {
181                tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: invalid IP address");
182                return None;
183            },
184        };
185
186        if ip.is_loopback() || ip.is_unspecified() {
187            tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: loopback or unspecified address");
188            return None;
189        }
190
191        if let std::net::IpAddr::V4(ipv4) = ip {
192            if ipv4.is_private() || ipv4.is_link_local() {
193                tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: private or link-local address");
194                return None;
195            }
196        }
197
198        let lookup_result = match reader.lookup(ip) {
199            Ok(result) => result,
200            Err(e) => {
201                tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: database lookup error");
202                return None;
203            },
204        };
205
206        let city_data: maxminddb::geoip2::City = match lookup_result.decode() {
207            Ok(Some(data)) => data,
208            Ok(None) => {
209                tracing::debug!(ip = %ip_str, "GeoIP lookup returned empty result");
210                return None;
211            },
212            Err(e) => {
213                tracing::debug!(ip = %ip_str, error = %e, "GeoIP decode failed");
214                return None;
215            },
216        };
217
218        let country = city_data.country.iso_code.map(ToString::to_string);
219
220        let region = city_data
221            .subdivisions
222            .first()
223            .and_then(|s| s.iso_code)
224            .map(ToString::to_string);
225
226        let city_name = city_data.city.names.english.map(ToString::to_string);
227
228        Some((country, region, city_name))
229    }
230
231    #[cfg(not(feature = "geolocation"))]
232    const fn lookup_geoip(
233        _ip_str: &str,
234        _geoip_reader: Option<&GeoIpReader>,
235    ) -> Option<(Option<String>, Option<String>, Option<String>)> {
236        None
237    }
238
239    fn parse_referrer_source(url: &str) -> Option<String> {
240        url::Url::parse(url)
241            .ok()
242            .and_then(|parsed_url| parsed_url.host_str().map(ToString::to_string))
243            .and_then(|host| {
244                if host.parse::<std::net::IpAddr>().is_ok() {
245                    None
246                } else {
247                    Some(host)
248                }
249            })
250    }
251
252    pub fn is_bot(&self) -> bool {
253        self.user_agent
254            .as_ref()
255            .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
256    }
257
258    pub fn is_bot_ip(&self) -> bool {
259        self.ip_address
260            .as_ref()
261            .is_some_and(|ip| matches_bot_ip_range(ip))
262    }
263
264    pub fn is_spam_referrer(&self) -> bool {
265        self.referrer_url
266            .as_ref()
267            .is_some_and(|url| detection::is_spam_referrer(url))
268    }
269
270    pub fn is_datacenter_ip(&self) -> bool {
271        self.ip_address
272            .as_ref()
273            .is_some_and(|ip| detection::is_datacenter_ip(ip))
274    }
275
276    pub fn is_high_risk_country(&self) -> bool {
277        self.country
278            .as_ref()
279            .is_some_and(|c| detection::is_high_risk_country(c))
280    }
281
282    pub fn should_skip_tracking(&self) -> bool {
283        self.is_bot()
284            || self.is_bot_ip()
285            || self.is_datacenter_ip()
286            || self.is_high_risk_country()
287            || self.is_spam_referrer()
288    }
289}