systemprompt_analytics/services/
extractor.rs1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3
4use axum::extract::Request;
5
6use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
7use super::detection;
8use super::user_agent::parse_user_agent;
9use crate::GeoIpReader;
10
/// Per-session analytics attributes collected from an incoming HTTP request.
///
/// Every field is optional; a value is `None` whenever the corresponding
/// header, query parameter, or lookup was unavailable.
#[derive(Clone, Debug, Default)]
pub struct SessionAnalytics {
    /// Client IP address in textual form, if one could be determined.
    pub ip_address: Option<String>,
    /// Raw value of the `user-agent` header.
    pub user_agent: Option<String>,
    /// Device type derived from the user agent.
    pub device_type: Option<String>,
    /// Browser derived from the user agent.
    pub browser: Option<String>,
    /// Operating system derived from the user agent.
    pub os: Option<String>,
    /// Value of the `x-fingerprint` header.
    pub fingerprint_hash: Option<String>,
    /// First language tag listed in the `accept-language` header.
    pub preferred_locale: Option<String>,
    /// ISO country code resolved through GeoIP.
    pub country: Option<String>,
    /// ISO code of the first subdivision resolved through GeoIP.
    pub region: Option<String>,
    /// English city name resolved through GeoIP.
    pub city: Option<String>,
    /// Host name of the referrer URL (IP-address hosts are omitted).
    pub referrer_source: Option<String>,
    /// Raw value of the `referer` header.
    pub referrer_url: Option<String>,
    /// Path of the request URI, recorded only for HTML pages.
    pub landing_page: Option<String>,
    /// Full request URI, recorded only for HTML pages.
    pub entry_url: Option<String>,
    /// Value of the `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// Value of the `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// Value of the `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
}
31
32impl SessionAnalytics {
33 pub fn from_headers(headers: &HeaderMap) -> Self {
34 Self::from_headers_with_geoip(headers, None)
35 }
36
37 pub fn from_headers_with_geoip(
38 headers: &HeaderMap,
39 geoip_reader: Option<&GeoIpReader>,
40 ) -> Self {
41 Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
42 }
43
44 pub fn from_headers_with_geoip_and_socket(
45 headers: &HeaderMap,
46 geoip_reader: Option<&GeoIpReader>,
47 socket_addr: Option<std::net::SocketAddr>,
48 ) -> Self {
49 let user_agent = headers
50 .get("user-agent")
51 .and_then(|v| v.to_str().ok())
52 .map(ToString::to_string);
53
54 let ip_address = headers
55 .get("x-forwarded-for")
56 .and_then(|v| v.to_str().ok())
57 .and_then(|s| s.split(',').next())
58 .map(|s| s.trim().to_string())
59 .or_else(|| {
60 headers
61 .get("x-real-ip")
62 .and_then(|v| v.to_str().ok())
63 .map(ToString::to_string)
64 })
65 .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
66
67 let fingerprint_hash = headers
68 .get("x-fingerprint")
69 .and_then(|v| v.to_str().ok())
70 .map(ToString::to_string);
71
72 let preferred_locale = headers
73 .get("accept-language")
74 .and_then(|v| v.to_str().ok())
75 .and_then(|s| s.split(',').next())
76 .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
77
78 let (device_type, browser, os) = user_agent
79 .as_ref()
80 .map_or((None, None, None), |ua| parse_user_agent(ua));
81
82 let (country, region, city) = ip_address
83 .as_ref()
84 .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
85 .unwrap_or((None, None, None));
86
87 let referrer_url = headers
88 .get("referer")
89 .and_then(|v| v.to_str().ok())
90 .map(ToString::to_string);
91
92 let referrer_source = referrer_url
93 .as_ref()
94 .and_then(|url| Self::parse_referrer_source(url));
95
96 Self {
97 ip_address,
98 user_agent,
99 device_type,
100 browser,
101 os,
102 fingerprint_hash,
103 preferred_locale,
104 country,
105 region,
106 city,
107 referrer_source,
108 referrer_url,
109 landing_page: None,
110 entry_url: None,
111 utm_source: None,
112 utm_medium: None,
113 utm_campaign: None,
114 }
115 }
116
117 pub fn from_headers_and_uri(
118 headers: &HeaderMap,
119 uri: Option<&Uri>,
120 geoip_reader: Option<&GeoIpReader>,
121 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
122 ) -> Self {
123 let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
124
125 if let Some(uri) = uri {
126 let query_params = Self::parse_query_params(uri);
127
128 analytics.utm_source = query_params.get("utm_source").cloned();
129 analytics.utm_medium = query_params.get("utm_medium").cloned();
130 analytics.utm_campaign = query_params.get("utm_campaign").cloned();
131
132 let is_html_page =
133 content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
134
135 if is_html_page {
136 analytics.entry_url = Some(uri.to_string());
137 analytics.landing_page = Some(uri.path().to_string());
138 }
139 }
140
141 analytics
142 }
143
144 pub fn from_request(
145 request: &Request,
146 geoip_reader: Option<&GeoIpReader>,
147 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
148 ) -> Self {
149 Self::from_headers_and_uri(
150 request.headers(),
151 Some(request.uri()),
152 geoip_reader,
153 content_routing,
154 )
155 }
156
157 fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
158 uri.query().map_or_else(HashMap::new, |q| {
159 q.split('&')
160 .filter_map(|param| {
161 let mut parts = param.splitn(2, '=');
162 Some((parts.next()?.to_string(), parts.next()?.to_string()))
163 })
164 .collect()
165 })
166 }
167
168 #[cfg(feature = "geolocation")]
169 fn lookup_geoip(
170 ip_str: &str,
171 geoip_reader: Option<&GeoIpReader>,
172 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
173 let Some(reader) = geoip_reader else {
174 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: reader not configured");
175 return None;
176 };
177
178 let ip: std::net::IpAddr = match ip_str.parse() {
179 Ok(ip) => ip,
180 Err(e) => {
181 tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: invalid IP address");
182 return None;
183 },
184 };
185
186 if ip.is_loopback() || ip.is_unspecified() {
187 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: loopback or unspecified address");
188 return None;
189 }
190
191 if let std::net::IpAddr::V4(ipv4) = ip {
192 if ipv4.is_private() || ipv4.is_link_local() {
193 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: private or link-local address");
194 return None;
195 }
196 }
197
198 let lookup_result = match reader.lookup(ip) {
199 Ok(result) => result,
200 Err(e) => {
201 tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: database lookup error");
202 return None;
203 },
204 };
205
206 let city_data: maxminddb::geoip2::City = match lookup_result.decode() {
207 Ok(Some(data)) => data,
208 Ok(None) => {
209 tracing::debug!(ip = %ip_str, "GeoIP lookup returned empty result");
210 return None;
211 },
212 Err(e) => {
213 tracing::debug!(ip = %ip_str, error = %e, "GeoIP decode failed");
214 return None;
215 },
216 };
217
218 let country = city_data.country.iso_code.map(ToString::to_string);
219
220 let region = city_data
221 .subdivisions
222 .first()
223 .and_then(|s| s.iso_code)
224 .map(ToString::to_string);
225
226 let city_name = city_data.city.names.english.map(ToString::to_string);
227
228 Some((country, region, city_name))
229 }
230
231 #[cfg(not(feature = "geolocation"))]
232 const fn lookup_geoip(
233 _ip_str: &str,
234 _geoip_reader: Option<&GeoIpReader>,
235 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
236 None
237 }
238
239 fn parse_referrer_source(url: &str) -> Option<String> {
240 url::Url::parse(url)
241 .ok()
242 .and_then(|parsed_url| parsed_url.host_str().map(ToString::to_string))
243 .and_then(|host| {
244 if host.parse::<std::net::IpAddr>().is_ok() {
245 None
246 } else {
247 Some(host)
248 }
249 })
250 }
251
252 pub fn is_bot(&self) -> bool {
253 self.user_agent
254 .as_ref()
255 .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
256 }
257
258 pub fn is_bot_ip(&self) -> bool {
259 self.ip_address
260 .as_ref()
261 .is_some_and(|ip| matches_bot_ip_range(ip))
262 }
263
264 pub fn is_spam_referrer(&self) -> bool {
265 self.referrer_url
266 .as_ref()
267 .is_some_and(|url| detection::is_spam_referrer(url))
268 }
269
270 pub fn is_datacenter_ip(&self) -> bool {
271 self.ip_address
272 .as_ref()
273 .is_some_and(|ip| detection::is_datacenter_ip(ip))
274 }
275
276 pub fn is_high_risk_country(&self) -> bool {
277 self.country
278 .as_ref()
279 .is_some_and(|c| detection::is_high_risk_country(c))
280 }
281
282 pub fn should_skip_tracking(&self) -> bool {
283 self.is_bot()
284 || self.is_bot_ip()
285 || self.is_datacenter_ip()
286 || self.is_high_risk_country()
287 || self.is_spam_referrer()
288 }
289}