systemprompt_analytics/services/extractor/
mod.rs1use http::{HeaderMap, Uri};
11use std::collections::HashMap;
12
13use axum::extract::Request;
14
15use super::ai_crawler_keywords::matches_ai_crawler;
16use super::bot_keywords::{matches_bot_ip_range, matches_bot_pattern};
17use super::detection;
18use super::user_agent::parse_user_agent;
19use crate::GeoIpReader;
20
21mod geoip;
22
/// Per-session analytics attributes extracted from an incoming HTTP request.
///
/// Every field is optional: a value is `None` when the corresponding header,
/// lookup, or query parameter was not available while the value was built.
#[derive(Debug, Clone, Default)]
pub struct SessionAnalytics {
    // --- Client identity -------------------------------------------------
    /// Client IP, taken from the first `x-forwarded-for` entry, else
    /// `x-real-ip`, else the socket peer address when one was supplied.
    pub ip_address: Option<String>,
    /// Raw `user-agent` header value.
    pub user_agent: Option<String>,
    /// Device type reported by `parse_user_agent` for the user agent.
    pub device_type: Option<String>,
    /// Browser reported by `parse_user_agent` for the user agent.
    pub browser: Option<String>,
    /// Operating system reported by `parse_user_agent` for the user agent.
    pub os: Option<String>,
    /// Raw `x-fingerprint` header value.
    pub fingerprint_hash: Option<String>,
    /// First language range of `accept-language`, without its `;q=` weight.
    pub preferred_locale: Option<String>,

    // --- Geography (GeoIP lookup on `ip_address`) ------------------------
    /// Country returned by the GeoIP lookup.
    pub country: Option<String>,
    /// Region returned by the GeoIP lookup.
    pub region: Option<String>,
    /// City returned by the GeoIP lookup.
    pub city: Option<String>,

    // --- Traffic origin --------------------------------------------------
    /// Source label derived from `referrer_url` by `parse_referrer_source`.
    pub referrer_source: Option<String>,
    /// Raw `referer` header value.
    pub referrer_url: Option<String>,
    /// Path of the request URI; only set for URIs classified as HTML pages.
    pub landing_page: Option<String>,
    /// Full request URI; only set for URIs classified as HTML pages.
    pub entry_url: Option<String>,

    // --- Campaign tagging (query-string parameters) ----------------------
    /// Value of the `utm_source` query parameter.
    pub utm_source: Option<String>,
    /// Value of the `utm_medium` query parameter.
    pub utm_medium: Option<String>,
    /// Value of the `utm_campaign` query parameter.
    pub utm_campaign: Option<String>,
    /// Value of the `utm_content` query parameter.
    pub utm_content: Option<String>,
    /// Value of the `utm_term` query parameter.
    pub utm_term: Option<String>,
}
45
46impl SessionAnalytics {
47 pub fn from_headers(headers: &HeaderMap) -> Self {
48 Self::from_headers_with_geoip(headers, None)
49 }
50
51 pub fn from_headers_with_geoip(
52 headers: &HeaderMap,
53 geoip_reader: Option<&GeoIpReader>,
54 ) -> Self {
55 Self::from_headers_with_geoip_and_socket(headers, geoip_reader, None)
56 }
57
58 pub fn from_headers_with_geoip_and_socket(
59 headers: &HeaderMap,
60 geoip_reader: Option<&GeoIpReader>,
61 socket_addr: Option<std::net::SocketAddr>,
62 ) -> Self {
63 let user_agent = headers
64 .get("user-agent")
65 .and_then(|v| v.to_str().ok())
66 .map(ToString::to_string);
67
68 let ip_address = headers
69 .get("x-forwarded-for")
70 .and_then(|v| v.to_str().ok())
71 .and_then(|s| s.split(',').next())
72 .map(|s| s.trim().to_string())
73 .or_else(|| {
74 headers
75 .get("x-real-ip")
76 .and_then(|v| v.to_str().ok())
77 .map(ToString::to_string)
78 })
79 .or_else(|| socket_addr.map(|addr| addr.ip().to_string()));
80
81 let fingerprint_hash = headers
82 .get("x-fingerprint")
83 .and_then(|v| v.to_str().ok())
84 .map(ToString::to_string);
85
86 let preferred_locale = headers
87 .get("accept-language")
88 .and_then(|v| v.to_str().ok())
89 .and_then(|s| s.split(',').next())
90 .map(|s| s.trim().split(';').next().unwrap_or(s).to_string());
91
92 let (device_type, browser, os) = user_agent
93 .as_ref()
94 .map_or((None, None, None), |ua| parse_user_agent(ua));
95
96 let (country, region, city) = ip_address
97 .as_ref()
98 .and_then(|ip_str| Self::lookup_geoip(ip_str, geoip_reader))
99 .unwrap_or((None, None, None));
100
101 let referrer_url = headers
102 .get("referer")
103 .and_then(|v| v.to_str().ok())
104 .map(ToString::to_string);
105
106 let referrer_source = referrer_url
107 .as_ref()
108 .and_then(|url| Self::parse_referrer_source(url));
109
110 Self {
111 ip_address,
112 user_agent,
113 device_type,
114 browser,
115 os,
116 fingerprint_hash,
117 preferred_locale,
118 country,
119 region,
120 city,
121 referrer_source,
122 referrer_url,
123 landing_page: None,
124 entry_url: None,
125 utm_source: None,
126 utm_medium: None,
127 utm_campaign: None,
128 utm_content: None,
129 utm_term: None,
130 }
131 }
132
133 pub fn from_headers_and_uri(
134 headers: &HeaderMap,
135 uri: Option<&Uri>,
136 geoip_reader: Option<&GeoIpReader>,
137 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
138 ) -> Self {
139 let mut analytics = Self::from_headers_with_geoip(headers, geoip_reader);
140
141 if let Some(uri) = uri {
142 let query_params = Self::parse_query_params(uri);
143
144 analytics.utm_source = query_params.get("utm_source").cloned();
145 analytics.utm_medium = query_params.get("utm_medium").cloned();
146 analytics.utm_campaign = query_params.get("utm_campaign").cloned();
147 analytics.utm_content = query_params.get("utm_content").cloned();
148 analytics.utm_term = query_params.get("utm_term").cloned();
149
150 let is_html_page =
151 content_routing.is_some_and(|routing| routing.is_html_page(uri.path()));
152
153 if is_html_page {
154 analytics.entry_url = Some(uri.to_string());
155 analytics.landing_page = Some(uri.path().to_string());
156 }
157 }
158
159 analytics
160 }
161
162 pub fn from_request(
163 request: &Request,
164 geoip_reader: Option<&GeoIpReader>,
165 content_routing: Option<&dyn systemprompt_models::ContentRouting>,
166 ) -> Self {
167 Self::from_headers_and_uri(
168 request.headers(),
169 Some(request.uri()),
170 geoip_reader,
171 content_routing,
172 )
173 }
174
175 fn parse_query_params(uri: &Uri) -> HashMap<String, String> {
176 uri.query().map_or_else(HashMap::new, |q| {
177 q.split('&')
178 .filter_map(|param| {
179 let mut parts = param.splitn(2, '=');
180 Some((parts.next()?.to_string(), parts.next()?.to_string()))
181 })
182 .collect()
183 })
184 }
185
186 fn lookup_geoip(
187 ip_str: &str,
188 geoip_reader: Option<&GeoIpReader>,
189 ) -> Option<(Option<String>, Option<String>, Option<String>)> {
190 geoip::lookup_geoip(ip_str, geoip_reader)
191 }
192
193 fn parse_referrer_source(url: &str) -> Option<String> {
194 geoip::parse_referrer_source(url)
195 }
196
197 pub fn is_bot(&self) -> bool {
198 if self.is_ai_crawler() {
199 return false;
200 }
201 self.user_agent
202 .as_ref()
203 .is_none_or(|ua| ua.is_empty() || matches_bot_pattern(ua))
204 }
205
206 pub fn is_ai_crawler(&self) -> bool {
207 self.user_agent
208 .as_ref()
209 .is_some_and(|ua| matches_ai_crawler(ua))
210 }
211
212 pub fn is_bot_ip(&self) -> bool {
213 self.ip_address
214 .as_ref()
215 .is_some_and(|ip| matches_bot_ip_range(ip))
216 }
217
218 pub fn is_spam_referrer(&self) -> bool {
219 self.referrer_url
220 .as_ref()
221 .is_some_and(|url| detection::is_spam_referrer(url))
222 }
223
224 pub fn is_datacenter_ip(&self) -> bool {
225 self.ip_address
226 .as_ref()
227 .is_some_and(|ip| detection::is_datacenter_ip(ip))
228 }
229
230 pub fn is_high_risk_country(&self) -> bool {
231 self.country
232 .as_ref()
233 .is_some_and(|c| detection::is_high_risk_country(c))
234 }
235
236 pub fn should_skip_tracking(&self) -> bool {
237 if self.is_ai_crawler() {
238 return false;
239 }
240 self.is_bot()
241 || self.is_bot_ip()
242 || self.is_datacenter_ip()
243 || self.is_high_risk_country()
244 || self.is_spam_referrer()
245 }
246}