Skip to main content

spider_lib/middlewares/
user_agent.rs

//! User-Agent middleware for rotating User-Agents during crawling.
//!
//! This module provides the `UserAgentMiddleware`, which manages and rotates the
//! User-Agent strings sent with outgoing requests. It supports several selection
//! strategies (random, sequential) and several sources for User-Agent lists (a direct
//! list, a file, or built-in sets of common User-Agents). It can also be configured to
//! use a different User-Agent source for specific domains.
//!
//! The middleware integrates with `spider-lib`'s request processing pipeline,
//! setting the `User-Agent` header of each request before it is sent.
11
12use async_trait::async_trait;
13use dashmap::DashMap;
14use moka::sync::Cache;
15use reqwest::header::{HeaderValue, USER_AGENT};
16use serde::{Deserialize, Serialize};
17use std::fmt::Debug;
18use std::fs::File;
19use std::io::{BufRead, BufReader};
20use std::path::{Path, PathBuf};
21use std::sync::Arc;
22use std::sync::atomic::{AtomicUsize, Ordering};
23use std::time::Duration;
24use tracing::{debug, info, warn};
25
26use rand::seq::SliceRandom;
27
28use crate::error::SpiderError;
29use crate::middleware::{Middleware, MiddlewareAction};
30use crate::request::Request;
31
32/// Defines the strategy for rotating User-Agents.
33#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
34pub enum UserAgentRotationStrategy {
35    /// Randomly selects a User-Agent from the available pool.
36    #[default]
37    Random,
38    /// Sequentially cycles through the available User-Agents.
39    Sequential,
40}
41
42/// Predefined lists of User-Agents for common scenarios.
43#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44pub enum BuiltinUserAgentList {
45    /// A list of popular desktop browser User-Agents.
46    Desktop,
47    /// A list of popular mobile browser User-Agents.
48    Mobile,
49    /// A mixed list of popular desktop and mobile browser User-Agents.
50    RandomPopular,
51}
52
53/// Defines the source from which User-Agents are loaded.
54#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
55#[serde(untagged)]
56pub enum UserAgentSource {
57    /// A direct list of User-Agent strings.
58    List(Vec<String>),
59    /// Path to a file containing User-Agent strings, one per line.
60    File(PathBuf),
61    /// Use a predefined, built-in list of User-Agents.
62    Builtin(BuiltinUserAgentList),
63    /// No User-Agent source specified, will fallback to a default if available.
64    None,
65}
66
67impl Default for UserAgentSource {
68    fn default() -> Self {
69        UserAgentSource::Builtin(BuiltinUserAgentList::RandomPopular)
70    }
71}
72
73/// Represents a User-Agent profile, including the User-Agent string and other associated headers.
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct UserAgentProfile {
76    /// The User-Agent string.
77    pub user_agent: Arc<String>,
78    /// Additional headers that should be sent with this User-Agent to mimic a real browser.
79    #[serde(default)]
80    pub headers: DashMap<String, String>,
81}
82
83impl From<String> for UserAgentProfile {
84    fn from(user_agent: String) -> Self {
85        UserAgentProfile {
86            user_agent: Arc::new(user_agent),
87            headers: DashMap::new(),
88        }
89    }
90}
91
92impl From<&str> for UserAgentProfile {
93    fn from(user_agent: &str) -> Self {
94        UserAgentProfile {
95            user_agent: Arc::new(user_agent.to_string()),
96            headers: DashMap::new(),
97        }
98    }
99}
100
101/// Builder for creating a `UserAgentMiddleware`.
102#[derive(Debug, Clone, Default, Serialize, Deserialize)]
103pub struct UserAgentMiddlewareBuilder {
104    source: UserAgentSource,
105    strategy: UserAgentRotationStrategy,
106    fallback_user_agent: Option<String>,
107    per_domain: DashMap<String, UserAgentSource>,
108}
109
110impl UserAgentMiddlewareBuilder {
111    /// Sets the primary source for User-Agents.
112    pub fn source(mut self, source: UserAgentSource) -> Self {
113        self.source = source;
114        self
115    }
116
117    /// Sets the strategy to use for rotating User-Agents.
118    pub fn strategy(mut self, strategy: UserAgentRotationStrategy) -> Self {
119        self.strategy = strategy;
120        self
121    }
122
123    /// Sets a fallback User-Agent to use if no other User-Agents are available.
124    pub fn fallback_user_agent(mut self, fallback_user_agent: String) -> Self {
125        self.fallback_user_agent = Some(fallback_user_agent);
126        self
127    }
128
129    /// Adds a domain-specific User-Agent source.
130    pub fn per_domain(self, domain: String, source: UserAgentSource) -> Self {
131        self.per_domain.insert(domain, source);
132        self
133    }
134
135    /// Builds the `UserAgentMiddleware`.
136    /// This can fail if a User-Agent source file is specified but cannot be read.
137    pub fn build(self) -> Result<UserAgentMiddleware, SpiderError> {
138        let default_pool = Arc::new(UserAgentMiddleware::load_user_agents(&self.source)?);
139
140        let domain_cache = Cache::builder()
141            .time_to_live(Duration::from_secs(30 * 60)) // 30 minutes
142            .build();
143
144        for entry in self.per_domain.iter() {
145            let domain = entry.key().clone();
146            let source = entry.value().clone();
147            let pool = Arc::new(UserAgentMiddleware::load_user_agents(&source)?);
148            domain_cache.insert(domain, pool);
149        }
150
151        let middleware = UserAgentMiddleware {
152            strategy: self.strategy,
153            fallback_user_agent: self.fallback_user_agent,
154            domain_cache,
155            default_pool,
156            current_index: AtomicUsize::new(0),
157        };
158
159        info!(
160            "Initializing UserAgentMiddleware with config: {:?}",
161            middleware
162        );
163
164        Ok(middleware)
165    }
166}
167
168pub struct UserAgentMiddleware {
169    strategy: UserAgentRotationStrategy,
170    fallback_user_agent: Option<String>,
171    domain_cache: Cache<String, Arc<Vec<UserAgentProfile>>>,
172    default_pool: Arc<Vec<UserAgentProfile>>,
173    current_index: AtomicUsize,
174}
175
176impl Debug for UserAgentMiddleware {
177    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
178        f.debug_struct("UserAgentMiddleware")
179            .field("strategy", &self.strategy)
180            .field("fallback_user_agent", &self.fallback_user_agent)
181            .field(
182                "domain_cache",
183                &format!("Cache({})", self.domain_cache.weighted_size()),
184            )
185            .field(
186                "default_pool",
187                &format!("Pool({})", self.default_pool.len()),
188            )
189            .field("current_index", &self.current_index)
190            .finish()
191    }
192}
193
194impl UserAgentMiddleware {
195    /// Creates a new `UserAgentMiddlewareBuilder` to start building a `UserAgentMiddleware`.
196    pub fn builder() -> UserAgentMiddlewareBuilder {
197        UserAgentMiddlewareBuilder::default()
198    }
199
200    fn load_user_agents(source: &UserAgentSource) -> Result<Vec<UserAgentProfile>, SpiderError> {
201        match source {
202            UserAgentSource::List(list) => Ok(list
203                .iter()
204                .map(|ua| UserAgentProfile::from(ua.clone()))
205                .collect()),
206            UserAgentSource::File(path) => Self::load_from_file(path),
207            UserAgentSource::Builtin(builtin_list) => {
208                Ok(Self::load_builtin_user_agents(builtin_list))
209            }
210            UserAgentSource::None => Ok(Vec::new()),
211        }
212    }
213
214    fn load_from_file(path: &Path) -> Result<Vec<UserAgentProfile>, SpiderError> {
215        if !path.exists() {
216            return Err(SpiderError::IoError(
217                std::io::Error::new(
218                    std::io::ErrorKind::NotFound,
219                    format!("User-agent file not found: {}", path.display()),
220                )
221                .to_string(),
222            ));
223        }
224        let file = File::open(path)?;
225        let reader = BufReader::new(file);
226        let user_agents: Vec<UserAgentProfile> = reader
227            .lines()
228            .map_while(Result::ok)
229            .filter(|line| !line.trim().is_empty())
230            .map(UserAgentProfile::from)
231            .collect();
232
233        if user_agents.is_empty() {
234            warn!(
235                "User-Agent file {:?} is empty or contains no valid User-Agents.",
236                path
237            );
238        }
239        Ok(user_agents)
240    }
241
242    //TODO: provide a list of user agents from third parties
243    fn load_builtin_user_agents(list_type: &BuiltinUserAgentList) -> Vec<UserAgentProfile> {
244        match list_type {
245            BuiltinUserAgentList::Desktop => vec![
246                UserAgentProfile::from(
247                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
248                ),
249                UserAgentProfile::from(
250                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
251                ),
252                UserAgentProfile::from(
253                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
254                ),
255            ],
256            BuiltinUserAgentList::Mobile => vec![
257                UserAgentProfile::from(
258                    "Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.193 Mobile Safari/537.36",
259                ),
260                UserAgentProfile::from(
261                    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
262                ),
263            ],
264            BuiltinUserAgentList::RandomPopular => vec![
265                UserAgentProfile::from(
266                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
267                ),
268                UserAgentProfile::from(
269                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
270                ),
271                UserAgentProfile::from(
272                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
273                ),
274                UserAgentProfile::from(
275                    "Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.193 Mobile Safari/537.36",
276                ),
277                UserAgentProfile::from(
278                    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
279                ),
280            ],
281        }
282    }
283
284    fn get_user_agent(&self, domain: Option<&str>) -> Option<UserAgentProfile> {
285        let mut rng = rand::thread_rng();
286
287        let pool = domain
288            .and_then(|d| self.domain_cache.get(d))
289            .unwrap_or_else(|| self.default_pool.clone());
290
291        if pool.is_empty() {
292            debug!("User-Agent pool is empty.");
293            return self
294                .fallback_user_agent
295                .as_ref()
296                .map(|ua| UserAgentProfile::from(ua.clone()));
297        }
298
299        match self.strategy {
300            UserAgentRotationStrategy::Random => pool.choose(&mut rng).cloned(),
301            UserAgentRotationStrategy::Sequential => {
302                let current = self.current_index.fetch_add(1, Ordering::SeqCst);
303                let index = current % pool.len();
304                pool.get(index).cloned()
305            }
306        }
307    }
308}
309
310#[async_trait]
311impl<C: Send + Sync> Middleware<C> for UserAgentMiddleware {
312    fn name(&self) -> &str {
313        "UserAgentMiddleware"
314    }
315
316    async fn process_request(
317        &mut self,
318        _client: &C,
319        mut request: Request,
320    ) -> Result<MiddlewareAction<Request>, SpiderError> {
321        let domain = request.url.domain();
322        if let Some(profile) = self.get_user_agent(domain) {
323            debug!("Applying User-Agent: {}", profile.user_agent);
324            request.headers.insert(
325                USER_AGENT,
326                HeaderValue::from_str(&profile.user_agent).map_err(|e| {
327                    SpiderError::HeaderValueError(format!(
328                        "Invalid User-Agent string '{}': {}",
329                        profile.user_agent, e
330                    ))
331                })?,
332            );
333            for header in profile.headers.iter() {
334                request.headers.insert(
335                    reqwest::header::HeaderName::from_bytes(header.key().as_bytes()).map_err(
336                        |e| SpiderError::HeaderValueError(format!("Invalid header name: {}", e)),
337                    )?,
338                    HeaderValue::from_str(header.value().as_str()).map_err(|e| {
339                        SpiderError::HeaderValueError(format!(
340                            "Invalid header value for {}: {}",
341                            header.key(),
342                            e
343                        ))
344                    })?,
345                );
346            }
347        } else {
348            debug!("No User-Agent applied.");
349        }
350        Ok(MiddlewareAction::Continue(request))
351    }
352}