stygian_proxy/fetcher.rs
1//! Proxy list fetching — port trait and free-list HTTP adapter.
2//!
3//! [`ProxyFetcher`] is the port trait. Implement it to pull proxies from any
4//! source (remote HTTP list, database, commercial API, etc.) and integrate with
5//! [`ProxyManager`] via [`load_from_fetcher`].
6//!
7//! The built-in [`FreeListFetcher`] downloads plain-text `host:port` proxy
8//! lists from public URLs (e.g. the `TheSpeedX/PROXY-List` feeds on GitHub)
9//! and parses them into [`Proxy`] records. It is suitable for development,
10//! testing, and low-stakes scraping where proxy quality is less critical.
11//!
12//! ## Example — load from a free list and populate the pool
13//!
14//! ```no_run
15//! use std::sync::Arc;
16//! use stygian_proxy::{
17//! ProxyManager,
18//! storage::MemoryProxyStore,
19//! fetcher::{FreeListFetcher, ProxyFetcher, FreeListSource},
20//! };
21//!
22//! # async fn run() -> stygian_proxy::error::ProxyResult<()> {
23//! let fetcher = FreeListFetcher::new(vec![
24//! FreeListSource::TheSpeedXHttp,
25//! ]);
26//!
27//! let manager = ProxyManager::builder()
28//! .storage(Arc::new(MemoryProxyStore::default()))
29//! .build()?;
30//! let loaded = stygian_proxy::fetcher::load_from_fetcher(&manager, &fetcher).await?;
31//! println!("Loaded {loaded} proxies");
32//! # Ok(())
33//! # }
34//! ```
35
36use std::time::Duration;
37
38use async_trait::async_trait;
39use futures::future::join_all;
40use reqwest::Client;
41use tracing::{debug, warn};
42
43use crate::{
44 Proxy, ProxyManager, ProxyType,
45 error::{ProxyError, ProxyResult},
46};
47
48// ─── Port trait ───────────────────────────────────────────────────────────────
49
/// A source that can produce a list of [`Proxy`] records asynchronously.
///
/// This is the port trait of the module: implement it to integrate any proxy
/// source (remote HTTP list, commercial API, database, file) with
/// [`load_from_fetcher`]. Implementors must be `Send + Sync`.
///
/// # Example
///
/// ```
/// use async_trait::async_trait;
/// use stygian_proxy::{Proxy, ProxyType};
/// use stygian_proxy::fetcher::ProxyFetcher;
/// use stygian_proxy::error::ProxyResult;
///
/// struct MyStaticFetcher;
///
/// #[async_trait]
/// impl ProxyFetcher for MyStaticFetcher {
///     async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
///         Ok(vec![Proxy {
///             url: "http://192.168.1.1:8080".into(),
///             proxy_type: ProxyType::Http,
///             username: None,
///             password: None,
///             weight: 1,
///             tags: vec!["static".into()],
///         }])
///     }
/// }
/// ```
#[async_trait]
pub trait ProxyFetcher: Send + Sync {
    /// Fetch the current proxy list.
    ///
    /// # Errors
    ///
    /// Returns [`ProxyError::FetchFailed`] if the source is unreachable or
    /// returns malformed data. Implementations may surface other
    /// [`ProxyError`] variants as well; for example, [`FreeListFetcher`]
    /// returns [`ProxyError::ConfigError`] when it has no sources configured.
    async fn fetch(&self) -> ProxyResult<Vec<Proxy>>;
}
89
90// ─── Free-list sources ────────────────────────────────────────────────────────
91
/// A well-known free/public proxy list feed.
///
/// These lists are community-maintained and quality varies. They are suitable
/// for development and testing. For production use, prefer a commercial
/// provider adapter.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate that matches
/// on it must include a wildcard arm.
///
/// # Example
///
/// ```
/// use stygian_proxy::fetcher::FreeListSource;
/// let _src = FreeListSource::TheSpeedXHttp;
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum FreeListSource {
    /// HTTP proxies from `TheSpeedX/PROXY-List` (GitHub, plain `host:port`).
    TheSpeedXHttp,
    /// SOCKS4 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
    #[cfg(feature = "socks")]
    TheSpeedXSocks4,
    /// SOCKS5 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
    #[cfg(feature = "socks")]
    TheSpeedXSocks5,
    /// HTTP proxies from `clarketm/proxy-list` (GitHub, plain `host:port`).
    ClarketmHttp,
    /// HTTP proxies from `openproxylist.xyz`.
    OpenProxyListHttp,
    /// A custom URL. Content must be one `host:port` entry per line.
    Custom {
        /// The URL to fetch.
        url: String,
        /// The [`ProxyType`] assigned to every entry parsed from `url`.
        proxy_type: ProxyType,
    },
}
127
128impl FreeListSource {
129 fn url(&self) -> &str {
130 match self {
131 Self::TheSpeedXHttp => {
132 "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"
133 }
134 #[cfg(feature = "socks")]
135 Self::TheSpeedXSocks4 => {
136 "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"
137 }
138 #[cfg(feature = "socks")]
139 Self::TheSpeedXSocks5 => {
140 "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"
141 }
142 Self::ClarketmHttp => {
143 "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt"
144 }
145 Self::OpenProxyListHttp => "https://openproxylist.xyz/http.txt",
146 Self::Custom { url, .. } => url.as_str(),
147 }
148 }
149
150 fn proxy_type(&self) -> ProxyType {
151 match self {
152 Self::TheSpeedXHttp | Self::ClarketmHttp | Self::OpenProxyListHttp => ProxyType::Http,
153 #[cfg(feature = "socks")]
154 Self::TheSpeedXSocks4 => ProxyType::Socks4,
155 #[cfg(feature = "socks")]
156 Self::TheSpeedXSocks5 => ProxyType::Socks5,
157 Self::Custom { proxy_type, .. } => *proxy_type,
158 }
159 }
160}
161
162// ─── FreeListFetcher ──────────────────────────────────────────────────────────
163
164/// Fetches plain-text `host:port` proxy lists from one or more public URLs.
165///
166/// Each source is fetched concurrently. Lines that do not parse as valid
167/// `host:port` entries are silently skipped. An empty or unreachable source
168/// logs a warning but does not fail the entire fetch — at least one source
169/// must return results for the call to succeed.
170///
171/// # Example
172///
173/// ```no_run
174/// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource, ProxyFetcher};
175///
176/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
177/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
178/// let proxies = fetcher.fetch().await?;
179/// println!("Got {} proxies", proxies.len());
180/// # Ok(())
181/// # }
182/// ```
183pub struct FreeListFetcher {
184 sources: Vec<FreeListSource>,
185 client: Client,
186 tags: Vec<String>,
187}
188
189impl FreeListFetcher {
190 /// Create a fetcher for the given sources with default HTTP client settings
191 /// (10 s timeout, TLS enabled).
192 ///
193 /// # Example
194 ///
195 /// ```
196 /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
197 /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
198 /// ```
199 pub fn new(sources: Vec<FreeListSource>) -> Self {
200 let client = Client::builder()
201 .timeout(Duration::from_secs(10))
202 .build()
203 .unwrap_or_else(|e| {
204 warn!("Failed to build HTTP client with 10 s timeout (TLS backend issue?): {e}; falling back to default client with per-request timeout enforcement");
205 Client::default()
206 });
207 Self {
208 sources,
209 client,
210 tags: vec!["free-list".into()],
211 }
212 }
213
214 /// Attach extra tags to every proxy produced by this fetcher.
215 ///
216 /// # Example
217 ///
218 /// ```
219 /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
220 /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp])
221 /// .with_tags(vec!["dev".into(), "http".into()]);
222 /// ```
223 #[must_use]
224 pub fn with_tags(mut self, tags: Vec<String>) -> Self {
225 self.tags.extend(tags);
226 self
227 }
228
/// Parse one `host:port` line, including bracketed IPv6 addresses.
///
/// Surrounding whitespace is ignored. Returns `None` (the line is skipped)
/// for:
///
/// - blank lines and `#` comments;
/// - lines without a `:` separator or with an empty host;
/// - unbracketed hosts that contain `:` (bare IPv6) or whitespace;
/// - bracketed hosts whose content is not a valid IPv6 address, or where
///   anything other than `:port` follows the closing `]`;
/// - ports that are not a `u16` or are `0`.
///
/// On success the host is returned exactly as it must appear in a URL
/// authority, i.e. IPv6 hosts keep their surrounding brackets.
fn parse_host_port_line(line: &str) -> Option<(String, u16)> {
    let line = line.trim();
    if line.is_empty() || line.starts_with('#') {
        return None;
    }

    let (host, port_str) = if let Some(rest) = line.strip_prefix('[') {
        let (inner, after) = rest.split_once(']')?;
        // Only a real IPv6 literal may appear inside brackets; anything else
        // would yield an unusable proxy URL.
        inner.parse::<std::net::Ipv6Addr>().ok()?;
        // The port separator must directly follow the bracket, so trailing
        // garbage such as `[::1]junk:80` or `[::1]:80:90` is rejected.
        let port_str = after.trim_start().strip_prefix(':')?;
        // `[` + inner + `]`: keep the brackets for the URL authority.
        (line.get(..inner.len() + 2)?, port_str.trim())
    } else {
        let (host, port_str) = line.rsplit_once(':')?;
        let host = host.trim_end();
        // A second `:` means a bare IPv6 address, which is ambiguous without
        // brackets. Embedded whitespace means the line carries extra columns.
        if host.is_empty() || host.contains(':') || host.contains(char::is_whitespace) {
            return None;
        }
        (host, port_str.trim())
    };

    // Port 0 is not a connectable port.
    let port = port_str.parse::<u16>().ok().filter(|&p| p != 0)?;

    Some((host.to_owned(), port))
}
262
263 /// Fetch a single source, returning parsed proxies (empty on failure).
264 async fn fetch_source(&self, source: &FreeListSource) -> Vec<Proxy> {
265 let url = source.url();
266 let proxy_type = source.proxy_type();
267
268 let body = match self
269 .client
270 .get(url)
271 .timeout(Duration::from_secs(10))
272 .send()
273 .await
274 {
275 Ok(resp) if resp.status().is_success() => match resp.text().await {
276 Ok(t) => t,
277 Err(e) => {
278 warn!("Failed to read body from {url}: {e}");
279 return vec![];
280 }
281 },
282 Ok(resp) => {
283 warn!(
284 "Non-success status {} fetching proxy list from {url}",
285 resp.status()
286 );
287 return vec![];
288 }
289 Err(e) => {
290 warn!("Failed to fetch proxy list from {url}: {e}");
291 return vec![];
292 }
293 };
294
295 let proxies: Vec<Proxy> = body
296 .lines()
297 .filter_map(|line| {
298 let (host, port) = Self::parse_host_port_line(line)?;
299 let scheme = match proxy_type {
300 ProxyType::Http => "http",
301 ProxyType::Https => "https",
302 #[cfg(feature = "socks")]
303 ProxyType::Socks4 => "socks4",
304 #[cfg(feature = "socks")]
305 ProxyType::Socks5 => "socks5",
306 };
307 Some(Proxy {
308 url: format!("{scheme}://{host}:{port}"),
309 proxy_type,
310 username: None,
311 password: None,
312 weight: 1,
313 tags: self.tags.clone(),
314 })
315 })
316 .collect();
317
318 debug!(source = url, count = proxies.len(), "Fetched proxy list");
319 proxies
320 }
321}
322
323#[async_trait]
324impl ProxyFetcher for FreeListFetcher {
325 async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
326 if self.sources.is_empty() {
327 return Err(ProxyError::ConfigError(
328 "no sources configured for FreeListFetcher".into(),
329 ));
330 }
331
332 // Drive all source fetches concurrently.
333 let results = join_all(self.sources.iter().map(|s| self.fetch_source(s))).await;
334 let all: Vec<Proxy> = results.into_iter().flatten().collect();
335
336 if all.is_empty() {
337 return Err(ProxyError::FetchFailed {
338 origin: self
339 .sources
340 .iter()
341 .map(|s| s.url())
342 .collect::<Vec<_>>()
343 .join(", "),
344 message: "all sources returned empty or failed".into(),
345 });
346 }
347
348 Ok(all)
349 }
350}
351
352// ─── Helper ───────────────────────────────────────────────────────────────────
353
354/// Fetch proxies from `fetcher` and add them all to `manager`.
355///
356/// Returns the number of proxies successfully added. Individual `add_proxy`
357/// failures (e.g. duplicate URL) are logged as warnings and do not abort the
358/// load.
359///
360/// # Errors
361///
362/// Returns any [`ProxyError`] emitted by `fetcher.fetch()` if the fetcher
363/// itself fails.
364///
365/// # Example
366///
367/// ```no_run
368/// use std::sync::Arc;
369/// use stygian_proxy::{ProxyManager, storage::MemoryProxyStore, fetcher::{FreeListFetcher, FreeListSource, load_from_fetcher}};
370///
371/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
372/// let manager = ProxyManager::builder()
373/// .storage(Arc::new(MemoryProxyStore::default()))
374/// .build()?;
375/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
376/// let n = load_from_fetcher(&manager, &fetcher).await?;
377/// println!("Loaded {n} proxies");
378/// # Ok(())
379/// # }
380/// ```
381pub async fn load_from_fetcher(
382 manager: &ProxyManager,
383 fetcher: &dyn ProxyFetcher,
384) -> ProxyResult<usize> {
385 let proxies = fetcher.fetch().await?;
386 let total = proxies.len();
387 let mut loaded = 0usize;
388
389 for proxy in proxies {
390 match manager.add_proxy(proxy).await {
391 Ok(_) => loaded += 1,
392 Err(e) => warn!("Skipped proxy during load: {e}"),
393 }
394 }
395
396 debug!(total, loaded, "Proxy list loaded into manager");
397 Ok(loaded)
398}
399
400// ─── Tests ────────────────────────────────────────────────────────────────────
401
#[cfg(test)]
mod tests {
    use super::*;

    /// Every source, built-in or custom, must resolve to a non-empty URL.
    #[test]
    fn free_list_source_url_is_nonempty() {
        // `mut` is only needed when the `socks` feature adds its variants.
        #[allow(unused_mut)]
        let mut sources = vec![
            FreeListSource::TheSpeedXHttp,
            FreeListSource::ClarketmHttp,
            FreeListSource::OpenProxyListHttp,
            FreeListSource::Custom {
                url: "https://example.com/proxies.txt".into(),
                proxy_type: ProxyType::Http,
            },
        ];
        #[cfg(feature = "socks")]
        sources.extend([
            FreeListSource::TheSpeedXSocks4,
            FreeListSource::TheSpeedXSocks5,
        ]);

        for src in &sources {
            assert!(
                !src.url().is_empty(),
                "FreeListSource::{src:?} has empty URL"
            );
        }
    }

    /// Built-in sources report the protocol their name advertises.
    #[test]
    fn free_list_source_proxy_types() {
        let http = ProxyType::Http;
        assert_eq!(FreeListSource::TheSpeedXHttp.proxy_type(), http);
        assert_eq!(FreeListSource::ClarketmHttp.proxy_type(), http);

        #[cfg(feature = "socks")]
        {
            assert_eq!(
                FreeListSource::TheSpeedXSocks4.proxy_type(),
                ProxyType::Socks4
            );
            assert_eq!(
                FreeListSource::TheSpeedXSocks5.proxy_type(),
                ProxyType::Socks5
            );
        }
    }

    /// Valid lines become proxies; comments, blanks and junk are dropped.
    #[test]
    fn free_list_fetcher_parse_valid_lines() {
        let fetcher = FreeListFetcher::new(vec![]);
        // Exercise the parsing logic directly on synthetic text.
        let text = "1.2.3.4:8080\n# comment\n\nbad-line\n5.6.7.8:3128\n[2001:db8::1]:8081\n";

        let mut parsed: Vec<Proxy> = Vec::new();
        for line in text.lines() {
            if let Some((host, port)) = FreeListFetcher::parse_host_port_line(line) {
                parsed.push(Proxy {
                    url: format!("http://{host}:{port}"),
                    proxy_type: ProxyType::Http,
                    username: None,
                    password: None,
                    weight: 1,
                    tags: fetcher.tags.clone(),
                });
            }
        }

        let urls: Vec<&str> = parsed.iter().map(|p| p.url.as_str()).collect();
        assert_eq!(
            urls,
            [
                "http://1.2.3.4:8080",
                "http://5.6.7.8:3128",
                "http://[2001:db8::1]:8081",
            ]
        );
    }

    /// `with_tags` appends to the default tag instead of replacing it.
    #[test]
    fn free_list_fetcher_with_tags_extends() {
        let f = FreeListFetcher::new(vec![]).with_tags(vec!["custom".into()]);
        for expected in ["free-list", "custom"] {
            assert!(f.tags.iter().any(|tag| tag == expected));
        }
    }

    /// Bad ports, empty hosts and bare IPv6 addresses are all rejected.
    #[test]
    fn free_list_fetcher_skips_invalid_port() {
        for line in [
            "1.2.3.4:notaport",
            "1.2.3.4:0",
            ":8080",
            "2001:db8::1:8080",
        ] {
            assert!(FreeListFetcher::parse_host_port_line(line).is_none());
        }
    }

    /// Fetching with no sources fails fast with a configuration error.
    #[test]
    fn free_list_fetcher_empty_sources_is_config_error() {
        let fetcher = FreeListFetcher::new(vec![]);
        let rt = match tokio::runtime::Builder::new_current_thread()
            .enable_time()
            .build()
        {
            Ok(rt) => rt,
            Err(e) => panic!("failed to build runtime for test: {e}"),
        };

        let err = rt
            .block_on(fetcher.fetch())
            .expect_err("empty sources should fail");

        if let ProxyError::ConfigError(msg) = &err {
            assert!(msg.contains("no sources configured"));
        } else {
            panic!("unexpected error variant: {err}");
        }
    }

    /// The `FetchFailed` display output includes both origin and message.
    #[test]
    fn proxy_error_fetch_failed_display() {
        let rendered = ProxyError::FetchFailed {
            origin: "https://example.com".into(),
            message: "timed out".into(),
        }
        .to_string();

        for needle in ["https://example.com", "timed out"] {
            assert!(rendered.contains(needle));
        }
    }
}
527}