stygian_proxy/fetcher.rs
1//! Proxy list fetching — port trait and free-list HTTP adapter.
2//!
3//! [`ProxyFetcher`] is the port trait. Implement it to pull proxies from any
4//! source (remote HTTP list, database, commercial API, etc.) and integrate with
5//! [`ProxyManager`] via [`load_from_fetcher`].
6//!
7//! The built-in [`FreeListFetcher`] downloads plain-text `host:port` proxy
8//! lists from public URLs (e.g. the `TheSpeedX/PROXY-List` feeds on GitHub)
9//! and parses them into [`Proxy`] records. It is suitable for development,
10//! testing, and low-stakes scraping where proxy quality is less critical.
11//!
12//! ## Example — load from a free list and populate the pool
13//!
//! ```no_run
//! use std::sync::Arc;
//! use stygian_proxy::{
//!     ProxyManager,
//!     storage::MemoryProxyStore,
//!     fetcher::{FreeListFetcher, ProxyFetcher, FreeListSource},
//! };
//!
//! # async fn run() -> stygian_proxy::error::ProxyResult<()> {
//! let fetcher = FreeListFetcher::new(vec![
//!     FreeListSource::TheSpeedXHttp,
//! ]);
//!
//! let manager = ProxyManager::builder()
//!     .storage(Arc::new(MemoryProxyStore::default()))
//!     .build()?;
//! let loaded = stygian_proxy::fetcher::load_from_fetcher(&manager, &fetcher).await?;
//! println!("Loaded {loaded} proxies");
//! # Ok(())
//! # }
//! ```
35
36use std::time::Duration;
37
38use async_trait::async_trait;
39use futures::future::join_all;
40use reqwest::Client;
41use tracing::{debug, warn};
42
43use crate::{
44 Proxy, ProxyManager, ProxyType,
45 error::{ProxyError, ProxyResult},
46};
47
48// ─── Port trait ───────────────────────────────────────────────────────────────
49
50/// A source that can produce a list of [`Proxy`] records asynchronously.
51///
52/// Implement this trait to integrate any proxy source (remote HTTP list,
53/// commercial API, database, file) with [`load_from_fetcher`].
54///
55/// # Example
56///
57/// ```
58/// use async_trait::async_trait;
59/// use stygian_proxy::{Proxy, ProxyType};
60/// use stygian_proxy::fetcher::ProxyFetcher;
61/// use stygian_proxy::error::ProxyResult;
62///
63/// struct MyStaticFetcher;
64///
65/// #[async_trait]
66/// impl ProxyFetcher for MyStaticFetcher {
67/// async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
68/// Ok(vec![Proxy {
69/// url: "http://192.168.1.1:8080".into(),
70/// proxy_type: ProxyType::Http,
71/// username: None,
72/// password: None,
73/// weight: 1,
74/// tags: vec!["static".into()],
75/// }])
76/// }
77/// }
78/// ```
79#[async_trait]
80pub trait ProxyFetcher: Send + Sync {
81 /// Fetch the current proxy list.
82 ///
83 /// # Errors
84 ///
85 /// Returns [`ProxyError::FetchFailed`] if the source is unreachable or
86 /// returns malformed data.
87 async fn fetch(&self) -> ProxyResult<Vec<Proxy>>;
88}
89
90// ─── Free-list sources ────────────────────────────────────────────────────────
91
92/// A well-known free/public proxy list feed.
93///
94/// These lists are community-maintained and quality varies. They are suitable
95/// for development and testing. For production use, prefer a commercial
96/// provider adapter.
97///
98/// # Example
99///
100/// ```
101/// use stygian_proxy::fetcher::FreeListSource;
102/// let _src = FreeListSource::TheSpeedXHttp;
103/// ```
104#[derive(Debug, Clone, PartialEq, Eq)]
105#[non_exhaustive]
106pub enum FreeListSource {
107 /// HTTP proxies from `TheSpeedX/PROXY-List` (GitHub, plain `host:port`).
108 TheSpeedXHttp,
109 #[cfg(feature = "socks")]
110 /// SOCKS4 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
111 TheSpeedXSocks4,
112 #[cfg(feature = "socks")]
113 /// SOCKS5 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
114 TheSpeedXSocks5,
115 /// HTTP proxies from `clarketm/proxy-list` (GitHub, plain `host:port`).
116 ClarketmHttp,
117 /// Mixed HTTP proxies from `openproxylist.xyz`.
118 OpenProxyListHttp,
119 /// A custom URL. Content must be one `host:port` entry per line.
120 Custom {
121 /// The URL to fetch.
122 url: String,
123 /// The [`ProxyType`] to assign all parsed entries.
124 proxy_type: ProxyType,
125 },
126}
127
128impl FreeListSource {
129 const fn url(&self) -> &str {
130 match self {
131 Self::TheSpeedXHttp => {
132 "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"
133 }
134 #[cfg(feature = "socks")]
135 Self::TheSpeedXSocks4 => {
136 "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"
137 }
138 #[cfg(feature = "socks")]
139 Self::TheSpeedXSocks5 => {
140 "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"
141 }
142 Self::ClarketmHttp => {
143 "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt"
144 }
145 Self::OpenProxyListHttp => "https://openproxylist.xyz/http.txt",
146 Self::Custom { url, .. } => url.as_str(),
147 }
148 }
149
150 const fn proxy_type(&self) -> ProxyType {
151 match self {
152 Self::TheSpeedXHttp | Self::ClarketmHttp | Self::OpenProxyListHttp => ProxyType::Http,
153 #[cfg(feature = "socks")]
154 Self::TheSpeedXSocks4 => ProxyType::Socks4,
155 #[cfg(feature = "socks")]
156 Self::TheSpeedXSocks5 => ProxyType::Socks5,
157 Self::Custom { proxy_type, .. } => *proxy_type,
158 }
159 }
160}
161
162// ─── FreeListFetcher ──────────────────────────────────────────────────────────
163
164/// Fetches plain-text `host:port` proxy lists from one or more public URLs.
165///
166/// Each source is fetched concurrently. Lines that do not parse as valid
167/// `host:port` entries are silently skipped. An empty or unreachable source
168/// logs a warning but does not fail the entire fetch — at least one source
169/// must return results for the call to succeed.
170///
171/// # Example
172///
173/// ```no_run
174/// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource, ProxyFetcher};
175///
176/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
177/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
178/// let proxies = fetcher.fetch().await?;
179/// println!("Got {} proxies", proxies.len());
180/// # Ok(())
181/// # }
182/// ```
183pub struct FreeListFetcher {
184 sources: Vec<FreeListSource>,
185 client: Client,
186 tags: Vec<String>,
187}
188
189impl FreeListFetcher {
190 /// Create a fetcher for the given sources with default HTTP client settings
191 /// (10 s timeout, TLS enabled).
192 ///
193 /// # Example
194 ///
195 /// ```
196 /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
197 /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
198 /// ```
199 pub fn new(sources: Vec<FreeListSource>) -> Self {
200 let client = Client::builder()
201 .timeout(Duration::from_secs(10))
202 .build()
203 .unwrap_or_else(|e| {
204 warn!("Failed to build HTTP client with 10 s timeout (TLS backend issue?): {e}; falling back to default client with per-request timeout enforcement");
205 Client::default()
206 });
207 Self {
208 sources,
209 client,
210 tags: vec!["free-list".into()],
211 }
212 }
213
214 /// Replace the internal HTTP client with a TLS-profiled one.
215 ///
216 /// Proxy-list fetch requests will carry a browser TLS fingerprint and
217 /// matching `Accept` / `Sec-CH-UA` headers.
218 ///
219 /// Only available with the `tls-profiled` feature.
220 ///
221 /// # Example
222 ///
223 /// ```no_run
224 /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
225 /// use stygian_proxy::http_client::{ProfiledRequestMode, ProfiledRequester};
226 ///
227 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
228 /// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp])
229 /// .with_profiled_client(ProfiledRequester::chrome_mode(ProfiledRequestMode::Preset)?);
230 /// # Ok(())
231 /// # }
232 /// ```
233 #[cfg(feature = "tls-profiled")]
234 #[must_use]
235 pub fn with_profiled_client(
236 mut self,
237 requester: crate::http_client::ProfiledRequester,
238 ) -> Self {
239 self.client = requester.client().clone();
240 drop(requester);
241 self
242 }
243
244 /// Build and attach a profile-mode-based requester.
245 ///
246 /// Uses Chrome 131 as the baseline browser identity and applies `mode`
247 /// to TLS control mapping.
248 ///
249 /// Only available when the `tls-profiled` feature is enabled.
250 ///
251 /// # Errors
252 ///
253 /// Returns [`crate::error::ProxyError::ConfigError`] if the profiled
254 /// requester cannot be constructed.
255 #[cfg(feature = "tls-profiled")]
256 pub fn with_profiled_mode(
257 self,
258 mode: crate::types::ProfiledRequestMode,
259 ) -> crate::error::ProxyResult<Self> {
260 let requester = crate::http_client::ProfiledRequester::chrome_mode(mode)
261 .map_err(|e| crate::error::ProxyError::ConfigError(e.to_string()))?;
262 Ok(self.with_profiled_client(requester))
263 }
264
265 /// Attach extra tags to every proxy produced by this fetcher.
266 ///
267 /// # Example
268 ///
269 /// ```
270 /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
271 /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp])
272 /// .with_tags(vec!["dev".into(), "http".into()]);
273 /// ```
274 #[must_use]
275 pub fn with_tags(mut self, tags: Vec<String>) -> Self {
276 self.tags.extend(tags);
277 self
278 }
279
280 /// Parse one `host:port` line, including bracketed IPv6 addresses.
281 fn parse_host_port_line(line: &str) -> Option<(String, u16)> {
282 let line = line.trim();
283 if line.is_empty() || line.starts_with('#') {
284 return None;
285 }
286
287 let (host, port_str) = if line.starts_with('[') {
288 let end = line.find(']')?;
289 let host = line.get(..=end)?.trim();
290 let remainder = line.get(end + 1..)?.trim();
291 let (_, port_str) = remainder.rsplit_once(':')?;
292 (host, port_str.trim())
293 } else {
294 let (host, port_str) = line.rsplit_once(':')?;
295 let host = host.trim();
296 if host.contains(':') {
297 return None;
298 }
299 (host, port_str.trim())
300 };
301
302 if host.is_empty() || host == "[]" {
303 return None;
304 }
305
306 let port = port_str.parse::<u16>().ok()?;
307 if port == 0 {
308 return None;
309 }
310
311 Some((host.to_string(), port))
312 }
313
314 /// Fetch a single source, returning parsed proxies (empty on failure).
315 async fn fetch_source(&self, source: &FreeListSource) -> Vec<Proxy> {
316 let url = source.url();
317 let proxy_type = source.proxy_type();
318
319 let body = match self
320 .client
321 .get(url)
322 .timeout(Duration::from_secs(10))
323 .send()
324 .await
325 {
326 Ok(resp) if resp.status().is_success() => match resp.text().await {
327 Ok(t) => t,
328 Err(e) => {
329 warn!("Failed to read body from {url}: {e}");
330 return vec![];
331 }
332 },
333 Ok(resp) => {
334 warn!(
335 "Non-success status {} fetching proxy list from {url}",
336 resp.status()
337 );
338 return vec![];
339 }
340 Err(e) => {
341 warn!("Failed to fetch proxy list from {url}: {e}");
342 return vec![];
343 }
344 };
345
346 let proxies: Vec<Proxy> = body
347 .lines()
348 .filter_map(|line| {
349 let (host, port) = Self::parse_host_port_line(line)?;
350 let scheme = match proxy_type {
351 ProxyType::Http => "http",
352 ProxyType::Https => "https",
353 #[cfg(feature = "socks")]
354 ProxyType::Socks4 => "socks4",
355 #[cfg(feature = "socks")]
356 ProxyType::Socks5 => "socks5",
357 };
358 Some(Proxy {
359 url: format!("{scheme}://{host}:{port}"),
360 proxy_type,
361 username: None,
362 password: None,
363 weight: 1,
364 tags: self.tags.clone(),
365 })
366 })
367 .collect();
368
369 debug!(source = url, count = proxies.len(), "Fetched proxy list");
370 proxies
371 }
372}
373
374#[async_trait]
375impl ProxyFetcher for FreeListFetcher {
376 async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
377 if self.sources.is_empty() {
378 return Err(ProxyError::ConfigError(
379 "no sources configured for FreeListFetcher".into(),
380 ));
381 }
382
383 // Drive all source fetches concurrently.
384 let results = join_all(self.sources.iter().map(|s| self.fetch_source(s))).await;
385 let all: Vec<Proxy> = results.into_iter().flatten().collect();
386
387 if all.is_empty() {
388 return Err(ProxyError::FetchFailed {
389 origin: self
390 .sources
391 .iter()
392 .map(FreeListSource::url)
393 .collect::<Vec<_>>()
394 .join(", "),
395 message: "all sources returned empty or failed".into(),
396 });
397 }
398
399 Ok(all)
400 }
401}
402
403// ─── Helper ───────────────────────────────────────────────────────────────────
404
405/// Fetch proxies from `fetcher` and add them all to `manager`.
406///
407/// Returns the number of proxies successfully added. Individual `add_proxy`
408/// failures (e.g. duplicate URL) are logged as warnings and do not abort the
409/// load.
410///
411/// # Errors
412///
413/// Returns any [`ProxyError`] emitted by `fetcher.fetch()` if the fetcher
414/// itself fails.
415///
416/// # Example
417///
418/// ```no_run
419/// use std::sync::Arc;
420/// use stygian_proxy::{ProxyManager, storage::MemoryProxyStore, fetcher::{FreeListFetcher, FreeListSource, load_from_fetcher}};
421///
422/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
423/// let manager = ProxyManager::builder()
424/// .storage(Arc::new(MemoryProxyStore::default()))
425/// .build()?;
426/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
427/// let n = load_from_fetcher(&manager, &fetcher).await?;
428/// println!("Loaded {n} proxies");
429/// # Ok(())
430/// # }
431/// ```
432pub async fn load_from_fetcher(
433 manager: &ProxyManager,
434 fetcher: &dyn ProxyFetcher,
435) -> ProxyResult<usize> {
436 let proxies = fetcher.fetch().await?;
437 let total = proxies.len();
438 let mut loaded = 0usize;
439
440 for proxy in proxies {
441 match manager.add_proxy(proxy).await {
442 Ok(_) => loaded += 1,
443 Err(e) => warn!("Skipped proxy during load: {e}"),
444 }
445 }
446
447 debug!(total, loaded, "Proxy list loaded into manager");
448 Ok(loaded)
449}
450
451// ─── Tests ────────────────────────────────────────────────────────────────────
452
#[cfg(test)]
mod tests {
    use super::*;

    /// Every built-in source, plus a custom one, must expose a non-empty URL.
    #[test]
    fn free_list_source_url_is_nonempty() {
        let always = [
            FreeListSource::TheSpeedXHttp,
            FreeListSource::ClarketmHttp,
            FreeListSource::OpenProxyListHttp,
            FreeListSource::Custom {
                url: "https://example.com/proxies.txt".into(),
                proxy_type: ProxyType::Http,
            },
        ];
        // The SOCKS feeds only exist when the `socks` feature is compiled in.
        #[cfg(feature = "socks")]
        let socks = [
            FreeListSource::TheSpeedXSocks4,
            FreeListSource::TheSpeedXSocks5,
        ];
        #[cfg(not(feature = "socks"))]
        let socks: [FreeListSource; 0] = [];

        for src in always.iter().chain(socks.iter()) {
            assert!(
                !src.url().is_empty(),
                "FreeListSource::{src:?} has empty URL"
            );
        }
    }

    /// Built-in sources report the protocol their name advertises.
    #[test]
    fn free_list_source_proxy_types() {
        for http_source in [FreeListSource::TheSpeedXHttp, FreeListSource::ClarketmHttp] {
            assert_eq!(http_source.proxy_type(), ProxyType::Http);
        }

        #[cfg(feature = "socks")]
        {
            assert_eq!(
                FreeListSource::TheSpeedXSocks4.proxy_type(),
                ProxyType::Socks4
            );
            assert_eq!(
                FreeListSource::TheSpeedXSocks5.proxy_type(),
                ProxyType::Socks5
            );
        }
    }

    /// Comments, blanks and malformed lines are dropped; valid IPv4 and
    /// bracketed IPv6 entries survive, in input order.
    #[test]
    fn free_list_fetcher_parse_valid_lines() {
        let fetcher = FreeListFetcher::new(vec![]);
        // Exercise the line parser directly on synthetic feed text.
        let text = "1.2.3.4:8080\n# comment\n\nbad-line\n5.6.7.8:3128\n[2001:db8::1]:8081\n";

        let mut parsed: Vec<Proxy> = Vec::new();
        for line in text.lines() {
            if let Some((host, port)) = FreeListFetcher::parse_host_port_line(line) {
                parsed.push(Proxy {
                    url: format!("http://{host}:{port}"),
                    proxy_type: ProxyType::Http,
                    username: None,
                    password: None,
                    weight: 1,
                    tags: fetcher.tags.clone(),
                });
            }
        }

        let urls: Vec<&str> = parsed.iter().map(|proxy| proxy.url.as_str()).collect();
        assert_eq!(
            urls,
            [
                "http://1.2.3.4:8080",
                "http://5.6.7.8:3128",
                "http://[2001:db8::1]:8081",
            ]
        );
    }

    /// `with_tags` appends to the default tag instead of replacing it.
    #[test]
    fn free_list_fetcher_with_tags_extends() {
        let f = FreeListFetcher::new(vec![]).with_tags(vec!["custom".into()]);
        for expected in ["free-list", "custom"] {
            assert!(f.tags.contains(&expected.to_string()));
        }
    }

    /// Bad ports, an empty host and bare IPv6 are all rejected.
    #[test]
    fn free_list_fetcher_skips_invalid_port() {
        let rejected = [
            "1.2.3.4:notaport",
            "1.2.3.4:0",
            ":8080",
            "2001:db8::1:8080",
        ];
        for line in rejected {
            assert!(FreeListFetcher::parse_host_port_line(line).is_none());
        }
    }

    /// Fetching with no sources is a configuration error, not a fetch failure.
    #[test]
    fn free_list_fetcher_empty_sources_is_config_error()
    -> std::result::Result<(), Box<dyn std::error::Error>> {
        let fetcher = FreeListFetcher::new(vec![]);
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_time()
            .build()
            .map_err(|e| std::io::Error::other(format!("failed to build runtime for test: {e}")))?;

        let Err(err) = rt.block_on(fetcher.fetch()) else {
            return Err(std::io::Error::other("empty sources should fail").into());
        };

        if let ProxyError::ConfigError(msg) = &err {
            assert!(msg.contains("no sources configured"));
            Ok(())
        } else {
            Err(std::io::Error::other(format!("unexpected error variant: {err}")).into())
        }
    }

    /// The `FetchFailed` display output includes both origin and message.
    #[test]
    fn proxy_error_fetch_failed_display() {
        let e = ProxyError::FetchFailed {
            origin: "https://example.com".into(),
            message: "timed out".into(),
        };
        let rendered = e.to_string();
        for needle in ["https://example.com", "timed out"] {
            assert!(rendered.contains(needle));
        }
    }
}