Skip to main content

systemprompt_analytics/services/
extractor.rs

1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3
4use axum::extract::Request;
5
6use super::ai_crawler_keywords::matches_ai_crawler;
7use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
8use super::detection;
9use super::user_agent::parse_user_agent;
10use crate::GeoIpReader;
11
/// Per-session analytics attributes extracted from an incoming HTTP request.
///
/// Every field is optional: a field is `None` when the header, query
/// parameter, or lookup it is derived from is unavailable.
#[derive(Debug, Clone, Default)]
pub struct SessionAnalytics {
    /// Client IP address, taken from `x-forwarded-for`, `x-real-ip`, or the
    /// peer socket address.
    pub ip_address: Option<String>,
    /// Raw value of the `user-agent` header.
    pub user_agent: Option<String>,
    /// Device type reported by `parse_user_agent` for the user agent.
    pub device_type: Option<String>,
    /// Browser reported by `parse_user_agent` for the user agent.
    pub browser: Option<String>,
    /// Operating system reported by `parse_user_agent` for the user agent.
    pub os: Option<String>,
    /// Value of the `x-fingerprint` header.
    pub fingerprint_hash: Option<String>,
    /// First language entry of the `accept-language` header, without its
    /// `;q=` weight.
    pub preferred_locale: Option<String>,
    /// Country ISO code from the GeoIP lookup of `ip_address`.
    pub country: Option<String>,
    /// ISO code of the first subdivision from the GeoIP lookup.
    pub region: Option<String>,
    /// English city name from the GeoIP lookup.
    pub city: Option<String>,
    /// Host extracted from `referrer_url`.
    pub referrer_source: Option<String>,
    /// Raw value of the `referer` header.
    pub referrer_url: Option<String>,
    /// Path of the request URI; only set when content routing classifies the
    /// path as an HTML page.
    pub landing_page: Option<String>,
    /// Full request URI; only set when content routing classifies the path
    /// as an HTML page.
    pub entry_url: Option<String>,
    /// Value of the `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// Value of the `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// Value of the `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
    /// Value of the `utm_content` query parameter.
    pub utm_content: Option<String>,
    /// Value of the `utm_term` query parameter.
    pub utm_term: Option<String>,
}
34
35impl SessionAnalytics {
36    pub fn from_headers(headers: &HeaderMap) -> Self {
37        Self::from_headers_with_geoip(headers, None)
38    }
39
40    pub fn from_headers_with_geoip(
41        headers: &HeaderMap,
42        geoip_reader: Option<&GeoIpReader>,
43    ) -> Self {
44        Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
45    }
46
47    pub fn from_headers_with_geoip_and_socket(
48        headers: &HeaderMap,
49        geoip_reader: Option<&GeoIpReader>,
50        socket_addr: Option<std::net::SocketAddr>,
51    ) -> Self {
52        let user_agent = headers
53            .get("user-agent")
54            .and_then(|v| v.to_str().ok())
55            .map(ToString::to_string);
56
57        let ip_address = headers
58            .get("x-forwarded-for")
59            .and_then(|v| v.to_str().ok())
60            .and_then(|s| s.split(',').next())
61            .map(|s| s.trim().to_string())
62            .or_else(|| {
63                headers
64                    .get("x-real-ip")
65                    .and_then(|v| v.to_str().ok())
66                    .map(ToString::to_string)
67            })
68            .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
69
70        let fingerprint_hash = headers
71            .get("x-fingerprint")
72            .and_then(|v| v.to_str().ok())
73            .map(ToString::to_string);
74
75        let preferred_locale = headers
76            .get("accept-language")
77            .and_then(|v| v.to_str().ok())
78            .and_then(|s| s.split(',').next())
79            .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
80
81        let (device_type, browser, os) = user_agent
82            .as_ref()
83            .map_or((None, None, None), |ua| parse_user_agent(ua));
84
85        let (country, region, city) = ip_address
86            .as_ref()
87            .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
88            .unwrap_or((None, None, None));
89
90        let referrer_url = headers
91            .get("referer")
92            .and_then(|v| v.to_str().ok())
93            .map(ToString::to_string);
94
95        let referrer_source = referrer_url
96            .as_ref()
97            .and_then(|url| Self::parse_referrer_source(url));
98
99        Self {
100            ip_address,
101            user_agent,
102            device_type,
103            browser,
104            os,
105            fingerprint_hash,
106            preferred_locale,
107            country,
108            region,
109            city,
110            referrer_source,
111            referrer_url,
112            landing_page: None,
113            entry_url: None,
114            utm_source: None,
115            utm_medium: None,
116            utm_campaign: None,
117            utm_content: None,
118            utm_term: None,
119        }
120    }
121
122    pub fn from_headers_and_uri(
123        headers: &HeaderMap,
124        uri: Option<&Uri>,
125        geoip_reader: Option<&GeoIpReader>,
126        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
127    ) -> Self {
128        let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
129
130        if let Some(uri) = uri {
131            let query_params = Self::parse_query_params(uri);
132
133            analytics.utm_source = query_params.get("utm_source").cloned();
134            analytics.utm_medium = query_params.get("utm_medium").cloned();
135            analytics.utm_campaign = query_params.get("utm_campaign").cloned();
136            analytics.utm_content = query_params.get("utm_content").cloned();
137            analytics.utm_term = query_params.get("utm_term").cloned();
138
139            let is_html_page =
140                content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
141
142            if is_html_page {
143                analytics.entry_url = Some(uri.to_string());
144                analytics.landing_page = Some(uri.path().to_string());
145            }
146        }
147
148        analytics
149    }
150
151    pub fn from_request(
152        request: &Request,
153        geoip_reader: Option<&GeoIpReader>,
154        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
155    ) -> Self {
156        Self::from_headers_and_uri(
157            request.headers(),
158            Some(request.uri()),
159            geoip_reader,
160            content_routing,
161        )
162    }
163
164    fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
165        uri.query().map_or_else(HashMap::new, |q| {
166            q.split('&')
167                .filter_map(|param| {
168                    let mut parts = param.splitn(2, '=');
169                    Some((parts.next()?.to_string(), parts.next()?.to_string()))
170                })
171                .collect()
172        })
173    }
174
175    #[cfg(feature = "geolocation")]
176    fn lookup_geoip(
177        ip_str: &str,
178        geoip_reader: Option<&GeoIpReader>,
179    ) -> Option<(Option<String>, Option<String>, Option<String>)> {
180        let Some(reader) = geoip_reader else {
181            tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: reader not configured");
182            return None;
183        };
184
185        let ip: std::net::IpAddr = match ip_str.parse() {
186            Ok(ip) => ip,
187            Err(e) => {
188                tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: invalid IP address");
189                return None;
190            },
191        };
192
193        if ip.is_loopback() || ip.is_unspecified() {
194            tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: loopback or unspecified address");
195            return None;
196        }
197
198        if let std::net::IpAddr::V4(ipv4) = ip {
199            if ipv4.is_private() || ipv4.is_link_local() {
200                tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: private or link-local address");
201                return None;
202            }
203        }
204
205        let lookup_result = match reader.lookup(ip) {
206            Ok(result) => result,
207            Err(e) => {
208                tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: database lookup error");
209                return None;
210            },
211        };
212
213        let city_data: maxminddb::geoip2::City = match lookup_result.decode() {
214            Ok(Some(data)) => data,
215            Ok(None) => {
216                tracing::debug!(ip = %ip_str, "GeoIP lookup returned empty result");
217                return None;
218            },
219            Err(e) => {
220                tracing::debug!(ip = %ip_str, error = %e, "GeoIP decode failed");
221                return None;
222            },
223        };
224
225        let country = city_data.country.iso_code.map(ToString::to_string);
226
227        let region = city_data
228            .subdivisions
229            .first()
230            .and_then(|s| s.iso_code)
231            .map(ToString::to_string);
232
233        let city_name = city_data.city.names.english.map(ToString::to_string);
234
235        Some((country, region, city_name))
236    }
237
    /// Stand-in used when the `geolocation` feature is disabled: performs no
    /// lookup, ignores both arguments, and always returns `None`.
    #[cfg(not(feature = "geolocation"))]
    const fn lookup_geoip(
        _ip_str: &str,
        _geoip_reader: Option<&GeoIpReader>,
    ) -> Option<(Option<String>, Option<String>, Option<String>)> {
        None
    }
245
246    fn parse_referrer_source(url: &str) -> Option<String> {
247        match url::Url::parse(url) {
248            Ok(parsed_url) => parsed_url
249                .host_str()
250                .map(ToString::to_string)
251                .filter(|host| host.parse::<std::net::IpAddr>().is_err()),
252            Err(err) => {
253                tracing::debug!(url = %url, error = %err, "failed to parse referrer URL");
254                None
255            },
256        }
257    }
258
259    pub fn is_bot(&self) -> bool {
260        if self.is_ai_crawler() {
261            return false;
262        }
263        self.user_agent
264            .as_ref()
265            .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
266    }
267
268    pub fn is_ai_crawler(&self) -> bool {
269        self.user_agent
270            .as_ref()
271            .is_some_and(|ua| matches_ai_crawler(ua))
272    }
273
274    pub fn is_bot_ip(&self) -> bool {
275        self.ip_address
276            .as_ref()
277            .is_some_and(|ip| matches_bot_ip_range(ip))
278    }
279
280    pub fn is_spam_referrer(&self) -> bool {
281        self.referrer_url
282            .as_ref()
283            .is_some_and(|url| detection::is_spam_referrer(url))
284    }
285
286    pub fn is_datacenter_ip(&self) -> bool {
287        self.ip_address
288            .as_ref()
289            .is_some_and(|ip| detection::is_datacenter_ip(ip))
290    }
291
292    pub fn is_high_risk_country(&self) -> bool {
293        self.country
294            .as_ref()
295            .is_some_and(|c| detection::is_high_risk_country(c))
296    }
297
298    pub fn should_skip_tracking(&self) -> bool {
299        if self.is_ai_crawler() {
300            return false;
301        }
302        self.is_bot()
303            || self.is_bot_ip()
304            || self.is_datacenter_ip()
305            || self.is_high_risk_country()
306            || self.is_spam_referrer()
307    }
308}