systemprompt_analytics/services/
extractor.rs1use http::{HeaderMap, Uri};
2use std::collections::HashMap;
3
4#[cfg(feature = "web")]
5use axum::extract::Request;
6
7use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
8use super::detection;
9use super::user_agent::parse_user_agent;
10use crate::GeoIpReader;
11
/// Per-request analytics attributes gathered from HTTP headers, the request
/// URI and (optionally) a GeoIP lookup.
///
/// Every field is optional: a value is only present when the corresponding
/// header, query parameter or lookup produced one.
#[derive(Clone, Debug, Default)]
pub struct SessionAnalytics {
    // --- Client identity -------------------------------------------------
    /// Client IP address as text, taken from the `x-forwarded-for` header,
    /// then `x-real-ip`, then the socket peer address.
    pub ip_address: Option<String>,
    /// Raw value of the `user-agent` header.
    pub user_agent: Option<String>,
    /// Device type reported by `parse_user_agent` for the user agent.
    pub device_type: Option<String>,
    /// Browser reported by `parse_user_agent` for the user agent.
    pub browser: Option<String>,
    /// Operating system reported by `parse_user_agent` for the user agent.
    pub os: Option<String>,
    /// Raw value of the `x-fingerprint` header.
    pub fingerprint_hash: Option<String>,
    /// First language entry of the `accept-language` header, without its
    /// `;q=` parameter.
    pub preferred_locale: Option<String>,

    // --- Geolocation (filled from the GeoIP lookup) -----------------------
    /// Country ISO code.
    pub country: Option<String>,
    /// ISO code of the first subdivision returned by the lookup.
    pub region: Option<String>,
    /// English city name.
    pub city: Option<String>,

    // --- Acquisition ------------------------------------------------------
    /// Host component parsed from `referrer_url`.
    pub referrer_source: Option<String>,
    /// Raw value of the `referer` header.
    pub referrer_url: Option<String>,
    /// Request path; only set when content routing classifies the path as an
    /// HTML page.
    pub landing_page: Option<String>,
    /// Full request URI; set under the same condition as `landing_page`.
    pub entry_url: Option<String>,
    /// Value of the `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// Value of the `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// Value of the `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
}
32
33impl SessionAnalytics {
34 pub fn from_headers(headers: &HeaderMap) -> Self {
35 Self::from_headers_with_geoip(headers, None)
36 }
37
38 pub fn from_headers_with_geoip(
39 headers: &HeaderMap,
40 geoip_reader: Option<&GeoIpReader>,
41 ) -> Self {
42 Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
43 }
44
45 pub fn from_headers_with_geoip_and_socket(
46 headers: &HeaderMap,
47 geoip_reader: Option<&GeoIpReader>,
48 socket_addr: Option<std::net::SocketAddr>,
49 ) -> Self {
50 let user_agent = headers
51 .get("user-agent")
52 .and_then(|v| v.to_str().ok())
53 .map(ToString::to_string);
54
55 let ip_address = headers
56 .get("x-forwarded-for")
57 .and_then(|v| v.to_str().ok())
58 .and_then(|s| s.split(',').next())
59 .map(|s| s.trim().to_string())
60 .or_else(|| {
61 headers
62 .get("x-real-ip")
63 .and_then(|v| v.to_str().ok())
64 .map(ToString::to_string)
65 })
66 .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
67
68 let fingerprint_hash = headers
69 .get("x-fingerprint")
70 .and_then(|v| v.to_str().ok())
71 .map(ToString::to_string);
72
73 let preferred_locale = headers
74 .get("accept-language")
75 .and_then(|v| v.to_str().ok())
76 .and_then(|s| s.split(',').next())
77 .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
78
79 let (device_type, browser, os) = user_agent
80 .as_ref()
81 .map_or((None, None, None), |ua| parse_user_agent(ua));
82
83 let (country, region, city) = ip_address
84 .as_ref()
85 .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
86 .unwrap_or((None, None, None));
87
88 let referrer_url = headers
89 .get("referer")
90 .and_then(|v| v.to_str().ok())
91 .map(ToString::to_string);
92
93 let referrer_source = referrer_url
94 .as_ref()
95 .and_then(|url| Self::parse_referrer_source(url));
96
97 Self {
98 ip_address,
99 user_agent,
100 device_type,
101 browser,
102 os,
103 fingerprint_hash,
104 preferred_locale,
105 country,
106 region,
107 city,
108 referrer_source,
109 referrer_url,
110 landing_page: None,
111 entry_url: None,
112 utm_source: None,
113 utm_medium: None,
114 utm_campaign: None,
115 }
116 }
117
118 pub fn from_headers_and_uri(
119 headers: &HeaderMap,
120 uri: Option<&Uri>,
121 geoip_reader: Option<&GeoIpReader>,
122 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
123 ) -> Self {
124 let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
125
126 if let Some(uri) = uri {
127 let query_params = Self::parse_query_params(uri);
128
129 analytics.utm_source = query_params.get("utm_source").cloned();
130 analytics.utm_medium = query_params.get("utm_medium").cloned();
131 analytics.utm_campaign = query_params.get("utm_campaign").cloned();
132
133 let is_html_page =
134 content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
135
136 if is_html_page {
137 analytics.entry_url = Some(uri.to_string());
138 analytics.landing_page = Some(uri.path().to_string());
139 }
140 }
141
142 analytics
143 }
144
145 #[cfg(feature = "web")]
146 pub fn from_request(
147 request: &Request,
148 geoip_reader: Option<&GeoIpReader>,
149 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
150 ) -> Self {
151 Self::from_headers_and_uri(
152 request.headers(),
153 Some(request.uri()),
154 geoip_reader,
155 content_routing,
156 )
157 }
158
159 fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
160 uri.query().map_or_else(HashMap::new, |q| {
161 q.split('&')
162 .filter_map(|param| {
163 let mut parts = param.splitn(2, '=');
164 Some((parts.next()?.to_string(), parts.next()?.to_string()))
165 })
166 .collect()
167 })
168 }
169
170 #[cfg(feature = "geolocation")]
171 fn lookup_geoip(
172 ip_str: &str,
173 geoip_reader: Option<&GeoIpReader>,
174 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
175 let Some(reader) = geoip_reader else {
176 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: reader not configured");
177 return None;
178 };
179
180 let ip: std::net::IpAddr = match ip_str.parse() {
181 Ok(ip) => ip,
182 Err(e) => {
183 tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: invalid IP address");
184 return None;
185 },
186 };
187
188 if ip.is_loopback() || ip.is_unspecified() {
189 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: loopback or unspecified address");
190 return None;
191 }
192
193 if let std::net::IpAddr::V4(ipv4) = ip {
194 if ipv4.is_private() || ipv4.is_link_local() {
195 tracing::debug!(ip = %ip_str, "GeoIP lookup skipped: private or link-local address");
196 return None;
197 }
198 }
199
200 let lookup_result = match reader.lookup(ip) {
201 Ok(result) => result,
202 Err(e) => {
203 tracing::debug!(ip = %ip_str, error = %e, "GeoIP lookup failed: database lookup error");
204 return None;
205 },
206 };
207
208 let city_data: maxminddb::geoip2::City = match lookup_result.decode() {
209 Ok(Some(data)) => data,
210 Ok(None) => {
211 tracing::debug!(ip = %ip_str, "GeoIP lookup returned empty result");
212 return None;
213 },
214 Err(e) => {
215 tracing::debug!(ip = %ip_str, error = %e, "GeoIP decode failed");
216 return None;
217 },
218 };
219
220 let country = city_data.country.iso_code.map(ToString::to_string);
221
222 let region = city_data
223 .subdivisions
224 .first()
225 .and_then(|s| s.iso_code)
226 .map(ToString::to_string);
227
228 let city_name = city_data.city.names.english.map(ToString::to_string);
229
230 Some((country, region, city_name))
231 }
232
233 #[cfg(not(feature = "geolocation"))]
234 const fn lookup_geoip(
235 _ip_str: &str,
236 _geoip_reader: Option<&GeoIpReader>,
237 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
238 None
239 }
240
241 fn parse_referrer_source(url: &str) -> Option<String> {
242 url::Url::parse(url)
243 .ok()
244 .and_then(|parsed_url| parsed_url.host_str().map(ToString::to_string))
245 .and_then(|host| {
246 if host.parse::<std::net::IpAddr>().is_ok() {
247 None
248 } else {
249 Some(host)
250 }
251 })
252 }
253
254 pub fn is_bot(&self) -> bool {
255 self.user_agent
256 .as_ref()
257 .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
258 }
259
260 pub fn is_bot_ip(&self) -> bool {
261 self.ip_address
262 .as_ref()
263 .is_some_and(|ip| matches_bot_ip_range(ip))
264 }
265
266 pub fn is_spam_referrer(&self) -> bool {
267 self.referrer_url
268 .as_ref()
269 .is_some_and(|url| detection::is_spam_referrer(url))
270 }
271
272 pub fn is_datacenter_ip(&self) -> bool {
273 self.ip_address
274 .as_ref()
275 .is_some_and(|ip| detection::is_datacenter_ip(ip))
276 }
277
278 pub fn is_high_risk_country(&self) -> bool {
279 self.country
280 .as_ref()
281 .is_some_and(|c| detection::is_high_risk_country(c))
282 }
283
284 pub fn should_skip_tracking(&self) -> bool {
285 self.is_bot()
286 || self.is_bot_ip()
287 || self.is_datacenter_ip()
288 || self.is_high_risk_country()
289 || self.is_spam_referrer()
290 }
291}