1use async_trait::async_trait;
13use dashmap::DashMap;
14use moka::sync::Cache;
15use reqwest::header::{HeaderValue, USER_AGENT};
16use serde::{Deserialize, Serialize};
17use std::fmt::Debug;
18use std::fs::File;
19use std::io::{BufRead, BufReader};
20use std::path::{Path, PathBuf};
21use std::sync::Arc;
22use std::sync::atomic::{AtomicUsize, Ordering};
23use std::time::Duration;
24use tracing::{debug, info, warn};
25
26use rand::seq::SliceRandom;
27
28use crate::error::SpiderError;
29use crate::middleware::{Middleware, MiddlewareAction};
30use crate::request::Request;
31
32#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
34pub enum UserAgentRotationStrategy {
35 #[default]
37 Random,
38 Sequential,
40}
41
42#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44pub enum BuiltinUserAgentList {
45 Desktop,
47 Mobile,
49 RandomPopular,
51}
52
53#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
55#[serde(untagged)]
56pub enum UserAgentSource {
57 List(Vec<String>),
59 File(PathBuf),
61 Builtin(BuiltinUserAgentList),
63 None,
65}
66
67impl Default for UserAgentSource {
68 fn default() -> Self {
69 UserAgentSource::Builtin(BuiltinUserAgentList::RandomPopular)
70 }
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct UserAgentProfile {
76 pub user_agent: Arc<String>,
78 #[serde(default)]
80 pub headers: DashMap<String, String>,
81}
82
83impl From<String> for UserAgentProfile {
84 fn from(user_agent: String) -> Self {
85 UserAgentProfile {
86 user_agent: Arc::new(user_agent),
87 headers: DashMap::new(),
88 }
89 }
90}
91
92impl From<&str> for UserAgentProfile {
93 fn from(user_agent: &str) -> Self {
94 UserAgentProfile {
95 user_agent: Arc::new(user_agent.to_string()),
96 headers: DashMap::new(),
97 }
98 }
99}
100
101#[derive(Debug, Clone, Default, Serialize, Deserialize)]
103pub struct UserAgentMiddlewareBuilder {
104 source: UserAgentSource,
105 strategy: UserAgentRotationStrategy,
106 fallback_user_agent: Option<String>,
107 per_domain: DashMap<String, UserAgentSource>,
108}
109
110impl UserAgentMiddlewareBuilder {
111 pub fn source(mut self, source: UserAgentSource) -> Self {
113 self.source = source;
114 self
115 }
116
117 pub fn strategy(mut self, strategy: UserAgentRotationStrategy) -> Self {
119 self.strategy = strategy;
120 self
121 }
122
123 pub fn fallback_user_agent(mut self, fallback_user_agent: String) -> Self {
125 self.fallback_user_agent = Some(fallback_user_agent);
126 self
127 }
128
129 pub fn per_domain(self, domain: String, source: UserAgentSource) -> Self {
131 self.per_domain.insert(domain, source);
132 self
133 }
134
135 pub fn build(self) -> Result<UserAgentMiddleware, SpiderError> {
138 let default_pool = Arc::new(UserAgentMiddleware::load_user_agents(&self.source)?);
139
140 let domain_cache = Cache::builder()
141 .time_to_live(Duration::from_secs(30 * 60)) .build();
143
144 for entry in self.per_domain.iter() {
145 let domain = entry.key().clone();
146 let source = entry.value().clone();
147 let pool = Arc::new(UserAgentMiddleware::load_user_agents(&source)?);
148 domain_cache.insert(domain, pool);
149 }
150
151 let middleware = UserAgentMiddleware {
152 strategy: self.strategy,
153 fallback_user_agent: self.fallback_user_agent,
154 domain_cache,
155 default_pool,
156 current_index: AtomicUsize::new(0),
157 };
158
159 info!(
160 "Initializing UserAgentMiddleware with config: {:?}",
161 middleware
162 );
163
164 Ok(middleware)
165 }
166}
167
168pub struct UserAgentMiddleware {
169 strategy: UserAgentRotationStrategy,
170 fallback_user_agent: Option<String>,
171 domain_cache: Cache<String, Arc<Vec<UserAgentProfile>>>,
172 default_pool: Arc<Vec<UserAgentProfile>>,
173 current_index: AtomicUsize,
174}
175
176impl Debug for UserAgentMiddleware {
177 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
178 f.debug_struct("UserAgentMiddleware")
179 .field("strategy", &self.strategy)
180 .field("fallback_user_agent", &self.fallback_user_agent)
181 .field(
182 "domain_cache",
183 &format!("Cache({})", self.domain_cache.weighted_size()),
184 )
185 .field(
186 "default_pool",
187 &format!("Pool({})", self.default_pool.len()),
188 )
189 .field("current_index", &self.current_index)
190 .finish()
191 }
192}
193
194impl UserAgentMiddleware {
195 pub fn builder() -> UserAgentMiddlewareBuilder {
197 UserAgentMiddlewareBuilder::default()
198 }
199
200 fn load_user_agents(source: &UserAgentSource) -> Result<Vec<UserAgentProfile>, SpiderError> {
201 match source {
202 UserAgentSource::List(list) => Ok(list
203 .iter()
204 .map(|ua| UserAgentProfile::from(ua.clone()))
205 .collect()),
206 UserAgentSource::File(path) => Self::load_from_file(path),
207 UserAgentSource::Builtin(builtin_list) => {
208 Ok(Self::load_builtin_user_agents(builtin_list))
209 }
210 UserAgentSource::None => Ok(Vec::new()),
211 }
212 }
213
214 fn load_from_file(path: &Path) -> Result<Vec<UserAgentProfile>, SpiderError> {
215 if !path.exists() {
216 return Err(SpiderError::IoError(
217 std::io::Error::new(
218 std::io::ErrorKind::NotFound,
219 format!("User-agent file not found: {}", path.display()),
220 )
221 .to_string(),
222 ));
223 }
224 let file = File::open(path)?;
225 let reader = BufReader::new(file);
226 let user_agents: Vec<UserAgentProfile> = reader
227 .lines()
228 .map_while(Result::ok)
229 .filter(|line| !line.trim().is_empty())
230 .map(UserAgentProfile::from)
231 .collect();
232
233 if user_agents.is_empty() {
234 warn!(
235 "User-Agent file {:?} is empty or contains no valid User-Agents.",
236 path
237 );
238 }
239 Ok(user_agents)
240 }
241
242 fn load_builtin_user_agents(list_type: &BuiltinUserAgentList) -> Vec<UserAgentProfile> {
244 match list_type {
245 BuiltinUserAgentList::Desktop => vec![
246 UserAgentProfile::from(
247 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
248 ),
249 UserAgentProfile::from(
250 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
251 ),
252 UserAgentProfile::from(
253 "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
254 ),
255 ],
256 BuiltinUserAgentList::Mobile => vec![
257 UserAgentProfile::from(
258 "Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.193 Mobile Safari/537.36",
259 ),
260 UserAgentProfile::from(
261 "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
262 ),
263 ],
264 BuiltinUserAgentList::RandomPopular => vec![
265 UserAgentProfile::from(
266 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
267 ),
268 UserAgentProfile::from(
269 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
270 ),
271 UserAgentProfile::from(
272 "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
273 ),
274 UserAgentProfile::from(
275 "Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.193 Mobile Safari/537.36",
276 ),
277 UserAgentProfile::from(
278 "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
279 ),
280 ],
281 }
282 }
283
284 fn get_user_agent(&self, domain: Option<&str>) -> Option<UserAgentProfile> {
285 let mut rng = rand::thread_rng();
286
287 let pool = domain
288 .and_then(|d| self.domain_cache.get(d))
289 .unwrap_or_else(|| self.default_pool.clone());
290
291 if pool.is_empty() {
292 debug!("User-Agent pool is empty.");
293 return self
294 .fallback_user_agent
295 .as_ref()
296 .map(|ua| UserAgentProfile::from(ua.clone()));
297 }
298
299 match self.strategy {
300 UserAgentRotationStrategy::Random => pool.choose(&mut rng).cloned(),
301 UserAgentRotationStrategy::Sequential => {
302 let current = self.current_index.fetch_add(1, Ordering::SeqCst);
303 let index = current % pool.len();
304 pool.get(index).cloned()
305 }
306 }
307 }
308}
309
310#[async_trait]
311impl<C: Send + Sync> Middleware<C> for UserAgentMiddleware {
312 fn name(&self) -> &str {
313 "UserAgentMiddleware"
314 }
315
316 async fn process_request(
317 &mut self,
318 _client: &C,
319 mut request: Request,
320 ) -> Result<MiddlewareAction<Request>, SpiderError> {
321 let domain = request.url.domain();
322 if let Some(profile) = self.get_user_agent(domain) {
323 debug!("Applying User-Agent: {}", profile.user_agent);
324 request.headers.insert(
325 USER_AGENT,
326 HeaderValue::from_str(&profile.user_agent).map_err(|e| {
327 SpiderError::HeaderValueError(format!(
328 "Invalid User-Agent string '{}': {}",
329 profile.user_agent, e
330 ))
331 })?,
332 );
333 for header in profile.headers.iter() {
334 request.headers.insert(
335 reqwest::header::HeaderName::from_bytes(header.key().as_bytes()).map_err(
336 |e| SpiderError::HeaderValueError(format!("Invalid header name: {}", e)),
337 )?,
338 HeaderValue::from_str(header.value().as_str()).map_err(|e| {
339 SpiderError::HeaderValueError(format!(
340 "Invalid header value for {}: {}",
341 header.key(),
342 e
343 ))
344 })?,
345 );
346 }
347 } else {
348 debug!("No User-Agent applied.");
349 }
350 Ok(MiddlewareAction::Continue(request))
351 }
352}