systemprompt_analytics/services/
extractor.rs1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3
4use axum::extract::Request;
5
6use super::ai_crawler_keywords::matches_ai_crawler;
7use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
8use super::detection;
9use super::user_agent::parse_user_agent;
10use crate::GeoIpReader;
11
/// Per-session analytics attributes derived from an incoming HTTP request.
///
/// Every field is optional: a value is `None` when the corresponding header,
/// query parameter, or lookup result was not available. `PartialEq`/`Eq` are
/// derived so two snapshots can be compared directly (e.g. in tests or when
/// de-duplicating sessions).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SessionAnalytics {
    /// Client IP as text, taken from `x-forwarded-for`, `x-real-ip`, or the socket address.
    pub ip_address: Option<String>,
    /// Raw `user-agent` header value.
    pub user_agent: Option<String>,
    /// Device type reported by the user-agent parser.
    pub device_type: Option<String>,
    /// Browser reported by the user-agent parser.
    pub browser: Option<String>,
    /// Operating system reported by the user-agent parser.
    pub os: Option<String>,
    /// Value of the `x-fingerprint` header.
    pub fingerprint_hash: Option<String>,
    /// First language tag listed in `accept-language`.
    pub preferred_locale: Option<String>,
    /// Country ISO code from the GeoIP lookup.
    pub country: Option<String>,
    /// First subdivision ISO code from the GeoIP lookup.
    pub region: Option<String>,
    /// English city name from the GeoIP lookup.
    pub city: Option<String>,
    /// Host of the `referer` URL (IP-literal hosts are discarded).
    pub referrer_source: Option<String>,
    /// Raw `referer` header value.
    pub referrer_url: Option<String>,
    /// Request path, recorded only when the request targets an HTML page.
    pub landing_page: Option<String>,
    /// Full request URI, recorded only when the request targets an HTML page.
    pub entry_url: Option<String>,
    /// `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
    /// `utm_content` query parameter.
    pub utm_content: Option<String>,
    /// `utm_term` query parameter.
    pub utm_term: Option<String>,
}
34
35impl SessionAnalytics {
36 pub fn from_headers(headers: &HeaderMap) -> Self {
37 Self::from_headers_with_geoip(headers, None)
38 }
39
40 pub fn from_headers_with_geoip(
41 headers: &HeaderMap,
42 geoip_reader: Option<&GeoIpReader>,
43 ) -> Self {
44 Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
45 }
46
47 pub fn from_headers_with_geoip_and_socket(
48 headers: &HeaderMap,
49 geoip_reader: Option<&GeoIpReader>,
50 socket_addr: Option<std::net::SocketAddr>,
51 ) -> Self {
52 let user_agent = headers
53 .get("user-agent")
54 .and_then(|v| v.to_str().ok())
55 .map(ToString::to_string);
56
57 let ip_address = headers
58 .get("x-forwarded-for")
59 .and_then(|v| v.to_str().ok())
60 .and_then(|s| s.split(',').next())
61 .map(|s| s.trim().to_string())
62 .or_else(|| {
63 headers
64 .get("x-real-ip")
65 .and_then(|v| v.to_str().ok())
66 .map(ToString::to_string)
67 })
68 .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
69
70 let fingerprint_hash = headers
71 .get("x-fingerprint")
72 .and_then(|v| v.to_str().ok())
73 .map(ToString::to_string);
74
75 let preferred_locale = headers
76 .get("accept-language")
77 .and_then(|v| v.to_str().ok())
78 .and_then(|s| s.split(',').next())
79 .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
80
81 let (device_type, browser, os) = user_agent
82 .as_ref()
83 .map_or((None, None, None), |ua| parse_user_agent(ua));
84
85 let (country, region, city) = ip_address
86 .as_ref()
87 .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
88 .unwrap_or((None, None, None));
89
90 let referrer_url = headers
91 .get("referer")
92 .and_then(|v| v.to_str().ok())
93 .map(ToString::to_string);
94
95 let referrer_source = referrer_url
96 .as_ref()
97 .and_then(|url| Self::parse_referrer_source(url));
98
99 Self {
100 ip_address,
101 user_agent,
102 device_type,
103 browser,
104 os,
105 fingerprint_hash,
106 preferred_locale,
107 country,
108 region,
109 city,
110 referrer_source,
111 referrer_url,
112 landing_page: None,
113 entry_url: None,
114 utm_source: None,
115 utm_medium: None,
116 utm_campaign: None,
117 utm_content: None,
118 utm_term: None,
119 }
120 }
121
122 pub fn from_headers_and_uri(
123 headers: &HeaderMap,
124 uri: Option<&Uri>,
125 geoip_reader: Option<&GeoIpReader>,
126 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
127 ) -> Self {
128 let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
129
130 if let Some(uri) = uri {
131 let query_params = Self::parse_query_params(uri);
132
133 analytics.utm_source = query_params.get("utm_source").cloned();
134 analytics.utm_medium = query_params.get("utm_medium").cloned();
135 analytics.utm_campaign = query_params.get("utm_campaign").cloned();
136 analytics.utm_content = query_params.get("utm_content").cloned();
137 analytics.utm_term = query_params.get("utm_term").cloned();
138
139 let is_html_page =
140 content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
141
142 if is_html_page {
143 analytics.entry_url = Some(uri.to_string());
144 analytics.landing_page = Some(uri.path().to_string());
145 }
146 }
147
148 analytics
149 }
150
151 pub fn from_request(
152 request: &Request,
153 geoip_reader: Option<&GeoIpReader>,
154 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
155 ) -> Self {
156 Self::from_headers_and_uri(
157 request.headers(),
158 Some(request.uri()),
159 geoip_reader,
160 content_routing,
161 )
162 }
163
164 fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
165 uri.query().map_or_else(HashMap::new, |q| {
166 q.split('&')
167 .filter_map(|param| {
168 let mut parts = param.splitn(2, '=');
169 Some((parts.next()?.to_string(), parts.next()?.to_string()))
170 })
171 .collect()
172 })
173 }
174
175 #[cfg(feature = "geolocation")]
176 fn lookup_geoip(
177 ip_str: &str,
178 geoip_reader: Option<&GeoIpReader>,
179 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
180 let Some(reader) = geoip_reader else {
181 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: reader not configured");
182 return None;
183 };
184
185 let ip: std::net::IpAddr = match ip_str.parse() {
186 Ok(ip) => ip,
187 Err(e) => {
188 tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: invalid IP address");
189 return None;
190 },
191 };
192
193 if ip.is_loopback() || ip.is_unspecified() {
194 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: loopback or unspecified address");
195 return None;
196 }
197
198 if let std::net::IpAddr::V4(ipv4) = ip {
199 if ipv4.is_private() || ipv4.is_link_local() {
200 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: private or link-local address");
201 return None;
202 }
203 }
204
205 let lookup_result = match reader.lookup(ip) {
206 Ok(result) => result,
207 Err(e) => {
208 tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: database lookup error");
209 return None;
210 },
211 };
212
213 let city_data: maxminddb::geoip2::City = match lookup_result.decode() {
214 Ok(Some(data)) => data,
215 Ok(None) => {
216 tracing::debug!(ip = %ip_str, "GeoIP lookup returned empty result");
217 return None;
218 },
219 Err(e) => {
220 tracing::debug!(ip = %ip_str, error = %e, "GeoIP decode failed");
221 return None;
222 },
223 };
224
225 let country = city_data.country.iso_code.map(ToString::to_string);
226
227 let region = city_data
228 .subdivisions
229 .first()
230 .and_then(|s| s.iso_code)
231 .map(ToString::to_string);
232
233 let city_name = city_data.city.names.english.map(ToString::to_string);
234
235 Some((country, region, city_name))
236 }
237
238 #[cfg(not(feature = "geolocation"))]
239 const fn lookup_geoip(
240 _ip_str: &str,
241 _geoip_reader: Option<&GeoIpReader>,
242 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
243 None
244 }
245
246 fn parse_referrer_source(url: &str) -> Option<String> {
247 match url::Url::parse(url) {
248 Ok(parsed_url) => parsed_url
249 .host_str()
250 .map(ToString::to_string)
251 .filter(|host| host.parse::<std::net::IpAddr>().is_err()),
252 Err(err) => {
253 tracing::debug!(url = %url, error = %err, "failed to parse referrer URL");
254 None
255 },
256 }
257 }
258
259 pub fn is_bot(&self) -> bool {
260 if self.is_ai_crawler() {
261 return false;
262 }
263 self.user_agent
264 .as_ref()
265 .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
266 }
267
268 pub fn is_ai_crawler(&self) -> bool {
269 self.user_agent
270 .as_ref()
271 .is_some_and(|ua| matches_ai_crawler(ua))
272 }
273
274 pub fn is_bot_ip(&self) -> bool {
275 self.ip_address
276 .as_ref()
277 .is_some_and(|ip| matches_bot_ip_range(ip))
278 }
279
280 pub fn is_spam_referrer(&self) -> bool {
281 self.referrer_url
282 .as_ref()
283 .is_some_and(|url| detection::is_spam_referrer(url))
284 }
285
286 pub fn is_datacenter_ip(&self) -> bool {
287 self.ip_address
288 .as_ref()
289 .is_some_and(|ip| detection::is_datacenter_ip(ip))
290 }
291
292 pub fn is_high_risk_country(&self) -> bool {
293 self.country
294 .as_ref()
295 .is_some_and(|c| detection::is_high_risk_country(c))
296 }
297
298 pub fn should_skip_tracking(&self) -> bool {
299 if self.is_ai_crawler() {
300 return false;
301 }
302 self.is_bot()
303 || self.is_bot_ip()
304 || self.is_datacenter_ip()
305 || self.is_high_risk_country()
306 || self.is_spam_referrer()
307 }
308}