rust_scraper/
user_agent.rs1use chrono::{DateTime, Datelike, Utc};
26use reqwest::Client;
27use serde::{Deserialize, Serialize};
28use std::fs;
29use std::path::PathBuf;
30use std::time::Duration;
31use tracing;
32
/// Remote location of the user-agent list downloaded by
/// `UserAgentCache::fetch_and_cache`.
///
/// The response body is read as JSON and is expected to be an array of
/// user-agent strings; non-string entries are skipped by the consumer.
const UA_LIST_URL: &str = "https://raw.githubusercontent.com/user-agents-api/data/main/user-agents.json";

/// Lowest Chrome major version a downloaded user agent must advertise to be
/// kept in the pool.
///
/// Also recorded as the cache's `chrome_version` when no version can be
/// parsed from the first agent in the pool.
const MIN_CHROME_VERSION: u32 = 131;
40
41#[derive(Debug, Deserialize, Serialize)]
43pub struct UserAgentCache {
44 agents: Vec<String>,
45 chrome_version: u32,
46 downloaded_at: DateTime<Utc>,
47}
48
49impl UserAgentCache {
50 fn cache_path() -> PathBuf {
52 dirs::cache_dir()
53 .unwrap_or_else(|| PathBuf::from("."))
54 .join("rust_scraper")
55 .join("user_agents.json")
56 }
57
58 pub async fn load() -> Vec<String> {
71 let current_year = Utc::now().year();
72
73 if let Ok(cache) = Self::load_from_cache() {
75 let cache_chrome_year = 2023 + (cache.chrome_version - 120) as i32;
78
79 if cache_chrome_year >= current_year - 1 {
81 tracing::info!("Using cached user agents (Chrome {})", cache.chrome_version);
82 return cache.agents;
83 }
84
85 tracing::warn!(
86 "Cached user agents outdated (Chrome {}), fetching fresh...",
87 cache.chrome_version
88 );
89 }
90
91 match Self::fetch_and_cache().await {
93 Ok(agents) => agents,
94 Err(e) => {
95 tracing::warn!("Failed to fetch user agents: {}", e);
96 Self::fallback_agents()
97 }
98 }
99 }
100
101 fn load_from_cache() -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
103 let content = fs::read_to_string(Self::cache_path())?;
104 let cache: Self = serde_json::from_str(&content)?;
105 Ok(cache)
106 }
107
108 async fn fetch_and_cache() -> Result<Vec<String>, Box<dyn std::error::Error + Send + Sync>> {
110 let client = Client::builder().timeout(Duration::from_secs(5)).build()?;
111
112 let agents = match client.get(UA_LIST_URL).send().await {
114 Ok(resp) if resp.status().is_success() => {
115 let json: serde_json::Value = resp.json().await?;
117
118 json.as_array()
120 .map(|arr| {
121 arr.iter()
122 .filter_map(|v| v.as_str())
123 .filter(|ua| {
124 ua.contains("Chrome/") && {
125 ua.split("Chrome/")
126 .nth(1)
127 .and_then(|s| s.split('.').next())
128 .and_then(|v| v.parse::<u32>().ok())
129 .map(|ver| ver >= MIN_CHROME_VERSION)
130 .unwrap_or(false)
131 }
132 })
133 .map(String::from)
134 .collect()
135 })
136 .unwrap_or_else(Self::fallback_agents)
137 }
138 _ => Self::fallback_agents(),
139 };
140
141 let chrome_version = agents
143 .first()
144 .and_then(|ua| ua.split("Chrome/").nth(1))
145 .and_then(|s| s.split('.').next())
146 .and_then(|v| v.parse::<u32>().ok())
147 .unwrap_or(MIN_CHROME_VERSION);
148
149 let cache = UserAgentCache {
151 agents: agents.clone(),
152 chrome_version,
153 downloaded_at: Utc::now(),
154 };
155
156 if let Some(parent) = Self::cache_path().parent() {
157 let _ = fs::create_dir_all(parent); }
159
160 if let Ok(json) = serde_json::to_string_pretty(&cache) {
162 let _ = fs::write(Self::cache_path(), json);
163 }
164
165 tracing::info!(
166 "Cached {} user agents (Chrome {})",
167 agents.len(),
168 chrome_version
169 );
170
171 Ok(agents)
172 }
173
174 pub fn fallback_agents() -> Vec<String> {
177 vec![
178 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36".to_string(),
179 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36".to_string(),
180 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36".to_string(),
181 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36".to_string(),
182 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36".to_string(),
183 "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0".to_string(),
184 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0".to_string(),
185 ]
186 }
187}
188
189#[must_use]
209pub fn get_random_user_agent_from_pool(pool: &[String]) -> String {
210 let rand_idx = rand::random::<usize>() % pool.len();
211 pool[rand_idx].clone()
212}
213
214#[deprecated(since = "0.4.0", note = "Use UserAgentCache::load() instead")]
220#[must_use]
221pub fn get_random_user_agent() -> String {
222 let agents = UserAgentCache::fallback_agents();
224 get_random_user_agent_from_pool(&agents)
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    /// `load` yields a non-empty pool containing at least one Chrome or
    /// Firefox user agent. Goes through the real cache/fetch path.
    #[tokio::test]
    async fn test_user_agent_cache_load() {
        let loaded = UserAgentCache::load().await;
        assert!(!loaded.is_empty());

        let has_known_browser = loaded
            .iter()
            .any(|ua| ua.contains("Chrome/") || ua.contains("Firefox/"));
        assert!(has_known_browser);
    }

    /// Every built-in fallback agent is either Chrome 13x or Firefox.
    #[test]
    fn test_fallback_agents_chrome_version() {
        let fallback = UserAgentCache::fallback_agents();
        assert!(!fallback.is_empty());

        fallback.iter().for_each(|agent| {
            let recognised = agent.contains("Chrome/13") || agent.contains("Firefox/");
            assert!(
                recognised,
                "Agent '{}' should contain Chrome/13x or Firefox/",
                agent
            );
        });
    }

    /// The built-in fallback list contains no duplicates.
    #[test]
    fn test_fallback_agents_are_unique() {
        let fallback = UserAgentCache::fallback_agents();
        let distinct: std::collections::HashSet<&String> = fallback.iter().collect();

        assert_eq!(
            fallback.len(),
            distinct.len(),
            "Fallback agents should be unique"
        );
    }

    /// The random pick is always one of the pool's entries.
    #[test]
    fn test_get_random_user_agent_from_pool() {
        let pool = vec!["Agent1".to_string(), "Agent2".to_string()];
        let picked = get_random_user_agent_from_pool(&pool);
        assert!(matches!(picked.as_str(), "Agent1" | "Agent2"));
    }

    /// The cache path ends in the expected file name and contains the
    /// application directory.
    #[test]
    fn test_cache_path_construction() {
        let cache_file = UserAgentCache::cache_path();
        assert!(cache_file.ends_with("user_agents.json"));
        assert!(cache_file.to_string_lossy().contains("rust_scraper"));
    }
}