systemprompt_analytics/services/
extractor.rs1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3use tracing::debug;
4
5#[cfg(feature = "web")]
6use axum::extract::Request;
7
8use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
9use super::detection;
10use super::user_agent::parse_user_agent;
11use crate::GeoIpReader;
12
/// Per-session analytics attributes extracted from an incoming HTTP request.
///
/// Every field is optional; a field is `None` when the request did not carry the
/// corresponding information or it could not be derived.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SessionAnalytics {
    /// Client IP, taken from `x-forwarded-for`, then `x-real-ip`, then the socket address.
    pub ip_address: Option<String>,
    /// Raw value of the `user-agent` header.
    pub user_agent: Option<String>,
    /// Device type reported by the user-agent parser.
    pub device_type: Option<String>,
    /// Browser reported by the user-agent parser.
    pub browser: Option<String>,
    /// Operating system reported by the user-agent parser.
    pub os: Option<String>,
    /// Value of the `x-fingerprint` header.
    pub fingerprint_hash: Option<String>,
    /// First language tag of the `accept-language` header, without its quality parameter.
    pub preferred_locale: Option<String>,
    /// Country ISO code from the GeoIP lookup of `ip_address`.
    pub country: Option<String>,
    /// ISO code of the first subdivision from the GeoIP lookup.
    pub region: Option<String>,
    /// English city name from the GeoIP lookup.
    pub city: Option<String>,
    /// Host of the `referer` URL; omitted when that host is an IP address.
    pub referrer_source: Option<String>,
    /// Raw value of the `referer` header.
    pub referrer_url: Option<String>,
    /// Path of the request URI, set only when content routing classifies it as an HTML page.
    pub landing_page: Option<String>,
    /// Full request URI, set only when content routing classifies it as an HTML page.
    pub entry_url: Option<String>,
    /// `utm_source` query parameter of the request URI.
    pub utm_source: Option<String>,
    /// `utm_medium` query parameter of the request URI.
    pub utm_medium: Option<String>,
    /// `utm_campaign` query parameter of the request URI.
    pub utm_campaign: Option<String>,
}
33
34impl SessionAnalytics {
35 pub fn from_headers(headers: &HeaderMap) -> Self {
36 Self::from_headers_with_geoip(headers, None)
37 }
38
39 pub fn from_headers_with_geoip(
40 headers: &HeaderMap,
41 geoip_reader: Option<&GeoIpReader>,
42 ) -> Self {
43 Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
44 }
45
46 pub fn from_headers_with_geoip_and_socket(
47 headers: &HeaderMap,
48 geoip_reader: Option<&GeoIpReader>,
49 socket_addr: Option<std::net::SocketAddr>,
50 ) -> Self {
51 let user_agent = headers
52 .get("user-agent")
53 .and_then(|v| v.to_str().ok())
54 .map(ToString::to_string);
55
56 let ip_address = headers
57 .get("x-forwarded-for")
58 .and_then(|v| v.to_str().ok())
59 .and_then(|s| s.split(',').next())
60 .map(|s| s.trim().to_string())
61 .or_else(|| {
62 headers
63 .get("x-real-ip")
64 .and_then(|v| v.to_str().ok())
65 .map(ToString::to_string)
66 })
67 .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
68
69 let fingerprint_hash = headers
70 .get("x-fingerprint")
71 .and_then(|v| v.to_str().ok())
72 .map(ToString::to_string);
73
74 let preferred_locale = headers
75 .get("accept-language")
76 .and_then(|v| v.to_str().ok())
77 .and_then(|s| s.split(',').next())
78 .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
79
80 let (device_type, browser, os) = user_agent
81 .as_ref()
82 .map_or((None, None, None), |ua| parse_user_agent(ua));
83
84 let (country, region, city) = ip_address
85 .as_ref()
86 .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
87 .unwrap_or((None, None, None));
88
89 let referrer_url = headers
90 .get("referer")
91 .and_then(|v| v.to_str().ok())
92 .map(ToString::to_string);
93
94 let referrer_source = referrer_url
95 .as_ref()
96 .and_then(|url| Self::parse_referrer_source(url));
97
98 Self {
99 ip_address,
100 user_agent,
101 device_type,
102 browser,
103 os,
104 fingerprint_hash,
105 preferred_locale,
106 country,
107 region,
108 city,
109 referrer_source,
110 referrer_url,
111 landing_page: None,
112 entry_url: None,
113 utm_source: None,
114 utm_medium: None,
115 utm_campaign: None,
116 }
117 }
118
119 pub fn from_headers_and_uri(
120 headers: &HeaderMap,
121 uri: Option<&Uri>,
122 geoip_reader: Option<&GeoIpReader>,
123 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
124 ) -> Self {
125 let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
126
127 if let Some(uri) = uri {
128 let query_params = Self::parse_query_params(uri);
129
130 analytics.utm_source = query_params.get("utm_source").cloned();
131 analytics.utm_medium = query_params.get("utm_medium").cloned();
132 analytics.utm_campaign = query_params.get("utm_campaign").cloned();
133
134 let is_html_page =
135 content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
136
137 if is_html_page {
138 analytics.entry_url = Some(uri.to_string());
139 analytics.landing_page = Some(uri.path().to_string());
140 }
141 }
142
143 analytics
144 }
145
/// Builds analytics from an axum request by forwarding its headers and URI to
/// [`Self::from_headers_and_uri`]. Only compiled with the `web` feature.
#[cfg(feature = "web")]
pub fn from_request(
    request: &Request,
    geoip_reader: Option<&GeoIpReader>,
    content_routing: Option<&dyn systemprompt_models::ContentRouting>,
) -> Self {
    let headers = request.headers();
    let uri = request.uri();
    Self::from_headers_and_uri(headers, Some(uri), geoip_reader, content_routing)
}
159
160 fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
161 uri.query().map_or_else(HashMap::new, |q| {
162 q.split('&')
163 .filter_map(|param| {
164 let mut parts = param.splitn(2, '=');
165 Some((parts.next()?.to_string(), parts.next()?.to_string()))
166 })
167 .collect()
168 })
169 }
170
171 fn lookup_geoip(
172 ip_str: &str,
173 geoip_reader: Option<&GeoIpReader>,
174 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
175 let Some(reader) = geoip_reader else {
176 debug!(ip = %ip_str, "GeoIP lookup skipped: reader not configured");
177 return None;
178 };
179
180 let ip: std::net::IpAddr = match ip_str.parse() {
181 Ok(ip) => ip,
182 Err(e) => {
183 debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: invalid IP address");
184 return None;
185 },
186 };
187
188 if ip.is_loopback() || ip.is_unspecified() {
189 debug!(ip = %ip_str, "GeoIP lookup skipped: loopback or unspecified address");
190 return None;
191 }
192
193 if let std::net::IpAddr::V4(ipv4) = ip {
194 if ipv4.is_private() || ipv4.is_link_local() {
195 debug!(ip = %ip_str, "GeoIP lookup skipped: private or link-local address");
196 return None;
197 }
198 }
199
200 let lookup_result = match reader.lookup(ip) {
201 Ok(result) => result,
202 Err(e) => {
203 debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: database lookup error");
204 return None;
205 },
206 };
207
208 let city_data: maxminddb::geoip2::City = match lookup_result.decode() {
209 Ok(Some(data)) => data,
210 Ok(None) => {
211 debug!(ip = %ip_str, "GeoIP lookup returned empty result");
212 return None;
213 },
214 Err(e) => {
215 debug!(ip = %ip_str, error = %e, "GeoIP decode failed");
216 return None;
217 },
218 };
219
220 let country = city_data.country.iso_code.map(ToString::to_string);
221
222 let region = city_data
223 .subdivisions
224 .first()
225 .and_then(|s| s.iso_code)
226 .map(ToString::to_string);
227
228 let city_name = city_data.city.names.english.map(ToString::to_string);
229
230 Some((country, region, city_name))
231 }
232
233 fn parse_referrer_source(url: &str) -> Option<String> {
234 url::Url::parse(url)
235 .ok()
236 .and_then(|parsed_url| parsed_url.host_str().map(ToString::to_string))
237 .and_then(|host| {
238 if host.parse::<std::net::IpAddr>().is_ok() {
239 None
240 } else {
241 Some(host)
242 }
243 })
244 }
245
246 pub fn is_bot(&self) -> bool {
247 self.user_agent
248 .as_ref()
249 .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
250 }
251
252 pub fn is_bot_ip(&self) -> bool {
253 self.ip_address
254 .as_ref()
255 .is_some_and(|ip| matches_bot_ip_range(ip))
256 }
257
258 pub fn is_spam_referrer(&self) -> bool {
259 self.referrer_url
260 .as_ref()
261 .is_some_and(|url| detection::is_spam_referrer(url))
262 }
263
264 pub fn is_datacenter_ip(&self) -> bool {
265 self.ip_address
266 .as_ref()
267 .is_some_and(|ip| detection::is_datacenter_ip(ip))
268 }
269
270 pub fn is_high_risk_country(&self) -> bool {
271 self.country
272 .as_ref()
273 .is_some_and(|c| detection::is_high_risk_country(c))
274 }
275
276 pub fn should_skip_tracking(&self) -> bool {
277 self.is_bot()
278 || self.is_bot_ip()
279 || self.is_datacenter_ip()
280 || self.is_high_risk_country()
281 || self.is_spam_referrer()
282 }
283}