Skip to main content

systemprompt_analytics/services/
extractor.rs

1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3use tracing::debug;
4
5#[cfg(feature = "web")]
6use axum::extract::Request;
7
8use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
9use super::detection;
10use super::user_agent::parse_user_agent;
11use crate::GeoIpReader;
12
/// Per-session analytics attributes derived from an incoming HTTP request.
///
/// Every field is optional: a value is only present when the corresponding
/// header, query parameter or lookup produced one.
#[derive(Debug, Clone, Default)]
pub struct SessionAnalytics {
    /// Client IP address as text, taken from forwarding headers or the socket address.
    pub ip_address: Option<String>,
    /// Raw `user-agent` header value.
    pub user_agent: Option<String>,
    /// Device type reported by the user-agent parser.
    pub device_type: Option<String>,
    /// Browser reported by the user-agent parser.
    pub browser: Option<String>,
    /// Operating system reported by the user-agent parser.
    pub os: Option<String>,
    /// Value of the `x-fingerprint` header.
    pub fingerprint_hash: Option<String>,
    /// First language entry of the `accept-language` header.
    pub preferred_locale: Option<String>,
    /// Country ISO code from the GeoIP lookup.
    pub country: Option<String>,
    /// ISO code of the first subdivision from the GeoIP lookup.
    pub region: Option<String>,
    /// English city name from the GeoIP lookup.
    pub city: Option<String>,
    /// Host name extracted from the referrer URL.
    pub referrer_source: Option<String>,
    /// Raw `referer` header value.
    pub referrer_url: Option<String>,
    /// Path of the request URI, set only for HTML pages.
    pub landing_page: Option<String>,
    /// Full request URI, set only for HTML pages.
    pub entry_url: Option<String>,
    /// `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
}
33
34impl SessionAnalytics {
35    pub fn from_headers(headers: &HeaderMap) -> Self {
36        Self::from_headers_with_geoip(headers, None)
37    }
38
39    pub fn from_headers_with_geoip(
40        headers: &HeaderMap,
41        geoip_reader: Option<&GeoIpReader>,
42    ) -> Self {
43        Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
44    }
45
46    pub fn from_headers_with_geoip_and_socket(
47        headers: &HeaderMap,
48        geoip_reader: Option<&GeoIpReader>,
49        socket_addr: Option<std::net::SocketAddr>,
50    ) -> Self {
51        let user_agent = headers
52            .get("user-agent")
53            .and_then(|v| v.to_str().ok())
54            .map(ToString::to_string);
55
56        let ip_address = headers
57            .get("x-forwarded-for")
58            .and_then(|v| v.to_str().ok())
59            .and_then(|s| s.split(',').next())
60            .map(|s| s.trim().to_string())
61            .or_else(|| {
62                headers
63                    .get("x-real-ip")
64                    .and_then(|v| v.to_str().ok())
65                    .map(ToString::to_string)
66            })
67            .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
68
69        let fingerprint_hash = headers
70            .get("x-fingerprint")
71            .and_then(|v| v.to_str().ok())
72            .map(ToString::to_string);
73
74        let preferred_locale = headers
75            .get("accept-language")
76            .and_then(|v| v.to_str().ok())
77            .and_then(|s| s.split(',').next())
78            .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
79
80        let (device_type, browser, os) = user_agent
81            .as_ref()
82            .map_or((None, None, None), |ua| parse_user_agent(ua));
83
84        let (country, region, city) = ip_address
85            .as_ref()
86            .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
87            .unwrap_or((None, None, None));
88
89        let referrer_url = headers
90            .get("referer")
91            .and_then(|v| v.to_str().ok())
92            .map(ToString::to_string);
93
94        let referrer_source = referrer_url
95            .as_ref()
96            .and_then(|url| Self::parse_referrer_source(url));
97
98        Self {
99            ip_address,
100            user_agent,
101            device_type,
102            browser,
103            os,
104            fingerprint_hash,
105            preferred_locale,
106            country,
107            region,
108            city,
109            referrer_source,
110            referrer_url,
111            landing_page: None,
112            entry_url: None,
113            utm_source: None,
114            utm_medium: None,
115            utm_campaign: None,
116        }
117    }
118
119    pub fn from_headers_and_uri(
120        headers: &HeaderMap,
121        uri: Option<&Uri>,
122        geoip_reader: Option<&GeoIpReader>,
123        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
124    ) -> Self {
125        let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
126
127        if let Some(uri) = uri {
128            let query_params = Self::parse_query_params(uri);
129
130            analytics.utm_source = query_params.get("utm_source").cloned();
131            analytics.utm_medium = query_params.get("utm_medium").cloned();
132            analytics.utm_campaign = query_params.get("utm_campaign").cloned();
133
134            let is_html_page =
135                content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
136
137            if is_html_page {
138                analytics.entry_url = Some(uri.to_string());
139                analytics.landing_page = Some(uri.path().to_string());
140            }
141        }
142
143        analytics
144    }
145
146    #[cfg(feature = "web")]
147    pub fn from_request(
148        request: &Request,
149        geoip_reader: Option<&GeoIpReader>,
150        content_routing: Option<&dyn systemprompt_models::ContentRouting>,
151    ) -> Self {
152        Self::from_headers_and_uri(
153            request.headers(),
154            Some(request.uri()),
155            geoip_reader,
156            content_routing,
157        )
158    }
159
160    fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
161        uri.query().map_or_else(HashMap::new, |q| {
162            q.split('&')
163                .filter_map(|param| {
164                    let mut parts = param.splitn(2, '=');
165                    Some((parts.next()?.to_string(), parts.next()?.to_string()))
166                })
167                .collect()
168        })
169    }
170
171    fn lookup_geoip(
172        ip_str: &str,
173        geoip_reader: Option<&GeoIpReader>,
174    ) -> Option<(Option<String>, Option<String>, Option<String>)> {
175        let Some(reader) = geoip_reader else {
176            debug!(ip = %ip_str, "GeoIP lookup skipped: reader not configured");
177            return None;
178        };
179
180        let ip: std::net::IpAddr = match ip_str.parse() {
181            Ok(ip) => ip,
182            Err(e) => {
183                debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: invalid IP address");
184                return None;
185            },
186        };
187
188        if ip.is_loopback() || ip.is_unspecified() {
189            debug!(ip = %ip_str, "GeoIP lookup skipped: loopback or unspecified address");
190            return None;
191        }
192
193        if let std::net::IpAddr::V4(ipv4) = ip {
194            if ipv4.is_private() || ipv4.is_link_local() {
195                debug!(ip = %ip_str, "GeoIP lookup skipped: private or link-local address");
196                return None;
197            }
198        }
199
200        let lookup_result = match reader.lookup(ip) {
201            Ok(result) => result,
202            Err(e) => {
203                debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: database lookup error");
204                return None;
205            },
206        };
207
208        let city_data: maxminddb::geoip2::City = match lookup_result.decode() {
209            Ok(Some(data)) => data,
210            Ok(None) => {
211                debug!(ip = %ip_str, "GeoIP lookup returned empty result");
212                return None;
213            },
214            Err(e) => {
215                debug!(ip = %ip_str, error = %e, "GeoIP decode failed");
216                return None;
217            },
218        };
219
220        let country = city_data.country.iso_code.map(ToString::to_string);
221
222        let region = city_data
223            .subdivisions
224            .first()
225            .and_then(|s| s.iso_code)
226            .map(ToString::to_string);
227
228        let city_name = city_data.city.names.english.map(ToString::to_string);
229
230        Some((country, region, city_name))
231    }
232
233    fn parse_referrer_source(url: &str) -> Option<String> {
234        url::Url::parse(url)
235            .ok()
236            .and_then(|parsed_url| parsed_url.host_str().map(ToString::to_string))
237            .and_then(|host| {
238                if host.parse::<std::net::IpAddr>().is_ok() {
239                    None
240                } else {
241                    Some(host)
242                }
243            })
244    }
245
246    pub fn is_bot(&self) -> bool {
247        self.user_agent
248            .as_ref()
249            .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
250    }
251
252    pub fn is_bot_ip(&self) -> bool {
253        self.ip_address
254            .as_ref()
255            .is_some_and(|ip| matches_bot_ip_range(ip))
256    }
257
258    pub fn is_spam_referrer(&self) -> bool {
259        self.referrer_url
260            .as_ref()
261            .is_some_and(|url| detection::is_spam_referrer(url))
262    }
263
264    pub fn is_datacenter_ip(&self) -> bool {
265        self.ip_address
266            .as_ref()
267            .is_some_and(|ip| detection::is_datacenter_ip(ip))
268    }
269
270    pub fn is_high_risk_country(&self) -> bool {
271        self.country
272            .as_ref()
273            .is_some_and(|c| detection::is_high_risk_country(c))
274    }
275
276    pub fn should_skip_tracking(&self) -> bool {
277        self.is_bot()
278            || self.is_bot_ip()
279            || self.is_datacenter_ip()
280            || self.is_high_risk_country()
281            || self.is_spam_referrer()
282    }
283}