Skip to main content

rust_scraper/
user_agent.rs

1//! User-Agent module with TTL-based caching
2//!
3//! Provides lazy-loaded user agents with 1-year cache validity.
4//! Following rust-skills: perf-cache-with-ttl, err-graceful-degradation, config-externalize
5//!
6//! # Cache Strategy
7//!
8//! 1. Check cache at `~/.cache/rust_scraper/user_agents.json`
//! 2. If the cached list is still inside its one-year validity window → USE cache
10//! 3. If cache is old → download from API → save cache
11//! 4. If download fails → fallback to hardcoded 2026 list
12//!
13//! # Examples
14//!
15//! ```no_run
16//! use rust_scraper::user_agent::UserAgentCache;
17//!
18//! # #[tokio::main]
19//! # async fn main() {
20//! let agents = UserAgentCache::load().await;
21//! assert!(!agents.is_empty());
22//! # }
23//! ```
24
25use chrono::{DateTime, Datelike, Utc};
26use reqwest::Client;
27use serde::{Deserialize, Serialize};
28use std::fs;
29use std::path::PathBuf;
30use std::time::Duration;
31use tracing;
32
33/// API URL for fresh user agents
34const UA_LIST_URL: &str =
35    "https://raw.githubusercontent.com/user-agents-api/data/main/user-agents.json";
36
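/// Minimum acceptable Chrome major version; downloaded user agents advertising an older Chrome are dropped.
/// NOTE(review): the original note dated Chrome 131 to January 2025 and Chrome 132 to March 2026 — verify against the official Chrome release calendar.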
39const MIN_CHROME_VERSION: u32 = 131;
40
41/// Cache metadata
42#[derive(Debug, Deserialize, Serialize)]
43pub struct UserAgentCache {
44    agents: Vec<String>,
45    chrome_version: u32,
46    downloaded_at: DateTime<Utc>,
47}
48
49impl UserAgentCache {
50    /// Get cache file path: ~/.cache/rust_scraper/user_agents.json
51    fn cache_path() -> PathBuf {
52        dirs::cache_dir()
53            .unwrap_or_else(|| PathBuf::from("."))
54            .join("rust_scraper")
55            .join("user_agents.json")
56    }
57
58    /// Load UAs: cache if valid, else fetch fresh
59    ///
60    /// # Returns
61    ///
62    /// Vec<String> - List of user agent strings (Chrome 131+ or fallback)
63    ///
64    /// # Errors
65    ///
66    /// Returns fallback agents if:
67    /// - Cache read fails
68    /// - API download fails
69    /// - Cache is older than 1 year
70    pub async fn load() -> Vec<String> {
71        let current_year = Utc::now().year();
72
73        // Try load from cache
74        if let Ok(cache) = Self::load_from_cache() {
75            // Chrome 120 = 2023, Chrome 131 = 2025, Chrome 132 = 2026
76            // Formula: chrome_year = 2023 + (chrome_version - 120)
77            let cache_chrome_year = 2023 + (cache.chrome_version - 120) as i32;
78
79            // Cache valid if <= 1 year old
80            if cache_chrome_year >= current_year - 1 {
81                tracing::info!("Using cached user agents (Chrome {})", cache.chrome_version);
82                return cache.agents;
83            }
84
85            tracing::warn!(
86                "Cached user agents outdated (Chrome {}), fetching fresh...",
87                cache.chrome_version
88            );
89        }
90
91        // Fetch fresh
92        match Self::fetch_and_cache().await {
93            Ok(agents) => agents,
94            Err(e) => {
95                tracing::warn!("Failed to fetch user agents: {}", e);
96                Self::fallback_agents()
97            }
98        }
99    }
100
101    /// Load user agents from cache file
102    fn load_from_cache() -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
103        let content = fs::read_to_string(Self::cache_path())?;
104        let cache: Self = serde_json::from_str(&content)?;
105        Ok(cache)
106    }
107
108    /// Fetch user agents from API and save to cache
109    async fn fetch_and_cache() -> Result<Vec<String>, Box<dyn std::error::Error + Send + Sync>> {
110        let client = Client::builder().timeout(Duration::from_secs(5)).build()?;
111
112        // Fetch from API
113        let agents = match client.get(UA_LIST_URL).send().await {
114            Ok(resp) if resp.status().is_success() => {
115                // Extract JSON from response
116                let json: serde_json::Value = resp.json().await?;
117
118                // Filter Chrome 131+ UAs
119                json.as_array()
120                    .map(|arr| {
121                        arr.iter()
122                            .filter_map(|v| v.as_str())
123                            .filter(|ua| {
124                                ua.contains("Chrome/") && {
125                                    ua.split("Chrome/")
126                                        .nth(1)
127                                        .and_then(|s| s.split('.').next())
128                                        .and_then(|v| v.parse::<u32>().ok())
129                                        .map(|ver| ver >= MIN_CHROME_VERSION)
130                                        .unwrap_or(false)
131                                }
132                            })
133                            .map(String::from)
134                            .collect()
135                    })
136                    .unwrap_or_else(Self::fallback_agents)
137            }
138            _ => Self::fallback_agents(),
139        };
140
141        // Extract Chrome version from first UA
142        let chrome_version = agents
143            .first()
144            .and_then(|ua| ua.split("Chrome/").nth(1))
145            .and_then(|s| s.split('.').next())
146            .and_then(|v| v.parse::<u32>().ok())
147            .unwrap_or(MIN_CHROME_VERSION);
148
149        // Save cache (ignore errors - read-only FS, containers, etc.)
150        let cache = UserAgentCache {
151            agents: agents.clone(),
152            chrome_version,
153            downloaded_at: Utc::now(),
154        };
155
156        if let Some(parent) = Self::cache_path().parent() {
157            let _ = fs::create_dir_all(parent); // Ignore errors
158        }
159
160        // Silently ignore write errors (read-only FS, containers, etc.)
161        if let Ok(json) = serde_json::to_string_pretty(&cache) {
162            let _ = fs::write(Self::cache_path(), json);
163        }
164
165        tracing::info!(
166            "Cached {} user agents (Chrome {})",
167            agents.len(),
168            chrome_version
169        );
170
171        Ok(agents)
172    }
173
174    /// Fallback: hardcoded list updated 2026
175    /// Chrome 131 (Enero 2025) y Chrome 132 (Marzo 2026)
176    pub fn fallback_agents() -> Vec<String> {
177        vec![
178            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36".to_string(),
179            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36".to_string(),
180            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36".to_string(),
181            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36".to_string(),
182            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36".to_string(),
183            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0".to_string(),
184            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0".to_string(),
185        ]
186    }
187}
188
189/// Get a random user agent from pool
190///
191/// # Arguments
192///
193/// * `pool` - Slice of user agent strings
194///
195/// # Returns
196///
197/// A randomly selected user agent string
198///
199/// # Examples
200///
201/// ```
202/// use rust_scraper::user_agent::get_random_user_agent_from_pool;
203///
204/// let agents = vec!["Chrome/131".to_string(), "Firefox/123".to_string()];
205/// let ua = get_random_user_agent_from_pool(&agents);
206/// assert!(ua == "Chrome/131" || ua == "Firefox/123");
207/// ```
208#[must_use]
209pub fn get_random_user_agent_from_pool(pool: &[String]) -> String {
210    let rand_idx = rand::random::<usize>() % pool.len();
211    pool[rand_idx].clone()
212}
213
214/// Legacy function for backward compatibility (DEPRECATED)
215///
216/// # Deprecated
217///
218/// Since 0.4.0: Use [`UserAgentCache::load()`] instead for TTL-based caching.
219#[deprecated(since = "0.4.0", note = "Use UserAgentCache::load() instead")]
220#[must_use]
221pub fn get_random_user_agent() -> String {
222    // Fallback directly (no cache)
223    let agents = UserAgentCache::fallback_agents();
224    get_random_user_agent_from_pool(&agents)
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    /// `load` yields a non-empty pool with at least one Chrome or Firefox agent.
    #[tokio::test]
    async fn test_user_agent_cache_load() {
        let loaded = UserAgentCache::load().await;
        assert!(!loaded.is_empty());

        let has_known_browser = loaded
            .iter()
            .any(|ua| ua.contains("Chrome/") || ua.contains("Firefox/"));
        assert!(has_known_browser);
    }

    /// Every fallback entry is either a Chrome 13x or a Firefox agent.
    #[test]
    fn test_fallback_agents_chrome_version() {
        let fallback = UserAgentCache::fallback_agents();
        assert!(!fallback.is_empty());

        fallback.iter().for_each(|agent| {
            assert!(
                agent.contains("Chrome/13") || agent.contains("Firefox/"),
                "Agent '{}' should contain Chrome/13x or Firefox/",
                agent
            );
        });
    }

    /// The fallback list contains no duplicate entries.
    #[test]
    fn test_fallback_agents_are_unique() {
        let fallback = UserAgentCache::fallback_agents();
        let distinct: std::collections::HashSet<&String> = fallback.iter().collect();

        assert_eq!(
            fallback.len(),
            distinct.len(),
            "Fallback agents should be unique"
        );
    }

    /// A pick from a two-element pool is one of those two elements.
    #[test]
    fn test_get_random_user_agent_from_pool() {
        let candidates = vec!["Agent1".to_string(), "Agent2".to_string()];
        let picked = get_random_user_agent_from_pool(&candidates);
        assert!(picked == "Agent1" || picked == "Agent2");
    }

    /// The cache path ends in `user_agents.json` under a `rust_scraper` directory.
    #[test]
    fn test_cache_path_construction() {
        let location = UserAgentCache::cache_path();
        assert!(location.ends_with("user_agents.json"));
        assert!(location.to_string_lossy().contains("rust_scraper"));
    }
}