Skip to main content

stygian_proxy/
fetcher.rs

1//! Proxy list fetching — port trait and free-list HTTP adapter.
2//!
3//! [`ProxyFetcher`] is the port trait.  Implement it to pull proxies from any
4//! source (remote HTTP list, database, commercial API, etc.) and integrate with
5//! [`ProxyManager`] via [`load_from_fetcher`].
6//!
7//! The built-in [`FreeListFetcher`] downloads plain-text `host:port` proxy
8//! lists from public URLs (e.g. the `TheSpeedX/PROXY-List` feeds on GitHub)
9//! and parses them into [`Proxy`] records.  It is suitable for development,
10//! testing, and low-stakes scraping where proxy quality is less critical.
11//!
12//! ## Example — load from a free list and populate the pool
13//!
14//! ```no_run
15//! use std::sync::Arc;
16//! use stygian_proxy::{
17//!     ProxyManager,
18//!     storage::MemoryProxyStore,
19//!     fetcher::{FreeListFetcher, ProxyFetcher, FreeListSource},
20//! };
21//!
22//! # async fn run() -> stygian_proxy::error::ProxyResult<()> {
23//! let fetcher = FreeListFetcher::new(vec![
24//!     FreeListSource::TheSpeedXHttp,
25//! ]);
26//!
27//! let manager = ProxyManager::builder()
28//!     .storage(Arc::new(MemoryProxyStore::default()))
29//!     .build()?;
30//! let loaded = stygian_proxy::fetcher::load_from_fetcher(&manager, &fetcher).await?;
31//! println!("Loaded {loaded} proxies");
32//! # Ok(())
33//! # }
34//! ```
35
36use std::time::Duration;
37
38use async_trait::async_trait;
39use futures::future::join_all;
40use reqwest::Client;
41use tracing::{debug, warn};
42
43use crate::{
44    Proxy, ProxyManager, ProxyType,
45    error::{ProxyError, ProxyResult},
46};
47
48// ─── Port trait ───────────────────────────────────────────────────────────────
49
50/// A source that can produce a list of [`Proxy`] records asynchronously.
51///
52/// Implement this trait to integrate any proxy source (remote HTTP list,
53/// commercial API, database, file) with [`load_from_fetcher`].
54///
55/// # Example
56///
57/// ```
58/// use async_trait::async_trait;
59/// use stygian_proxy::{Proxy, ProxyType};
60/// use stygian_proxy::fetcher::ProxyFetcher;
61/// use stygian_proxy::error::ProxyResult;
62///
63/// struct MyStaticFetcher;
64///
65/// #[async_trait]
66/// impl ProxyFetcher for MyStaticFetcher {
67///     async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
68///         Ok(vec![Proxy {
69///             url: "http://192.168.1.1:8080".into(),
70///             proxy_type: ProxyType::Http,
71///             username: None,
72///             password: None,
73///             weight: 1,
74///             tags: vec!["static".into()],
75///         }])
76///     }
77/// }
78/// ```
79#[async_trait]
80pub trait ProxyFetcher: Send + Sync {
81    /// Fetch the current proxy list.
82    ///
83    /// # Errors
84    ///
85    /// Returns [`ProxyError::FetchFailed`] if the source is unreachable or
86    /// returns malformed data.
87    async fn fetch(&self) -> ProxyResult<Vec<Proxy>>;
88}
89
90// ─── Free-list sources ────────────────────────────────────────────────────────
91
92/// A well-known free/public proxy list feed.
93///
94/// These lists are community-maintained and quality varies.  They are suitable
95/// for development and testing.  For production use, prefer a commercial
96/// provider adapter.
97///
98/// # Example
99///
100/// ```
101/// use stygian_proxy::fetcher::FreeListSource;
102/// let _src = FreeListSource::TheSpeedXHttp;
103/// ```
104#[derive(Debug, Clone, PartialEq, Eq)]
105#[non_exhaustive]
106pub enum FreeListSource {
107    /// HTTP proxies from `TheSpeedX/PROXY-List` (GitHub, plain `host:port`).
108    TheSpeedXHttp,
109    #[cfg(feature = "socks")]
110    /// SOCKS4 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
111    TheSpeedXSocks4,
112    #[cfg(feature = "socks")]
113    /// SOCKS5 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
114    TheSpeedXSocks5,
115    /// HTTP proxies from `clarketm/proxy-list` (GitHub, plain `host:port`).
116    ClarketmHttp,
117    /// Mixed HTTP proxies from `openproxylist.xyz`.
118    OpenProxyListHttp,
119    /// A custom URL.  Content must be one `host:port` entry per line.
120    Custom {
121        /// The URL to fetch.
122        url: String,
123        /// The [`ProxyType`] to assign all parsed entries.
124        proxy_type: ProxyType,
125    },
126}
127
128impl FreeListSource {
129    const fn url(&self) -> &str {
130        match self {
131            Self::TheSpeedXHttp => {
132                "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"
133            }
134            #[cfg(feature = "socks")]
135            Self::TheSpeedXSocks4 => {
136                "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"
137            }
138            #[cfg(feature = "socks")]
139            Self::TheSpeedXSocks5 => {
140                "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"
141            }
142            Self::ClarketmHttp => {
143                "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt"
144            }
145            Self::OpenProxyListHttp => "https://openproxylist.xyz/http.txt",
146            Self::Custom { url, .. } => url.as_str(),
147        }
148    }
149
150    const fn proxy_type(&self) -> ProxyType {
151        match self {
152            Self::TheSpeedXHttp | Self::ClarketmHttp | Self::OpenProxyListHttp => ProxyType::Http,
153            #[cfg(feature = "socks")]
154            Self::TheSpeedXSocks4 => ProxyType::Socks4,
155            #[cfg(feature = "socks")]
156            Self::TheSpeedXSocks5 => ProxyType::Socks5,
157            Self::Custom { proxy_type, .. } => *proxy_type,
158        }
159    }
160}
161
162// ─── FreeListFetcher ──────────────────────────────────────────────────────────
163
164/// Fetches plain-text `host:port` proxy lists from one or more public URLs.
165///
166/// Each source is fetched concurrently.  Lines that do not parse as valid
167/// `host:port` entries are silently skipped.  An empty or unreachable source
168/// logs a warning but does not fail the entire fetch — at least one source
169/// must return results for the call to succeed.
170///
171/// # Example
172///
173/// ```no_run
174/// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource, ProxyFetcher};
175///
176/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
177/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
178/// let proxies = fetcher.fetch().await?;
179/// println!("Got {} proxies", proxies.len());
180/// # Ok(())
181/// # }
182/// ```
183pub struct FreeListFetcher {
184    sources: Vec<FreeListSource>,
185    client: Client,
186    tags: Vec<String>,
187}
188
189impl FreeListFetcher {
190    /// Create a fetcher for the given sources with default HTTP client settings
191    /// (10 s timeout, TLS enabled).
192    ///
193    /// # Example
194    ///
195    /// ```
196    /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
197    /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
198    /// ```
199    pub fn new(sources: Vec<FreeListSource>) -> Self {
200        let client = Client::builder()
201            .timeout(Duration::from_secs(10))
202            .build()
203            .unwrap_or_else(|e| {
204                warn!("Failed to build HTTP client with 10 s timeout (TLS backend issue?): {e}; falling back to default client with per-request timeout enforcement");
205                Client::default()
206            });
207        Self {
208            sources,
209            client,
210            tags: vec!["free-list".into()],
211        }
212    }
213
214    /// Replace the internal HTTP client with a TLS-profiled one.
215    ///
216    /// Proxy-list fetch requests will carry a browser TLS fingerprint and
217    /// matching `Accept` / `Sec-CH-UA` headers.
218    ///
219    /// Only available with the `tls-profiled` feature.
220    ///
221    /// # Example
222    ///
223    /// ```no_run
224    /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
225    /// use stygian_proxy::http_client::{ProfiledRequestMode, ProfiledRequester};
226    ///
227    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
228    /// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp])
229    ///     .with_profiled_client(ProfiledRequester::chrome_mode(ProfiledRequestMode::Preset)?);
230    /// # Ok(())
231    /// # }
232    /// ```
233    #[cfg(feature = "tls-profiled")]
234    #[must_use]
235    pub fn with_profiled_client(
236        mut self,
237        requester: crate::http_client::ProfiledRequester,
238    ) -> Self {
239        self.client = requester.client().clone();
240        drop(requester);
241        self
242    }
243
244    /// Build and attach a profile-mode-based requester.
245    ///
246    /// Uses Chrome 131 as the baseline browser identity and applies `mode`
247    /// to TLS control mapping.
248    ///
249    /// Only available when the `tls-profiled` feature is enabled.
250    ///
251    /// # Errors
252    ///
253    /// Returns [`crate::error::ProxyError::ConfigError`] if the profiled
254    /// requester cannot be constructed.
255    #[cfg(feature = "tls-profiled")]
256    pub fn with_profiled_mode(
257        self,
258        mode: crate::types::ProfiledRequestMode,
259    ) -> crate::error::ProxyResult<Self> {
260        let requester = crate::http_client::ProfiledRequester::chrome_mode(mode)
261            .map_err(|e| crate::error::ProxyError::ConfigError(e.to_string()))?;
262        Ok(self.with_profiled_client(requester))
263    }
264
265    /// Attach extra tags to every proxy produced by this fetcher.
266    ///
267    /// # Example
268    ///
269    /// ```
270    /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
271    /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp])
272    ///     .with_tags(vec!["dev".into(), "http".into()]);
273    /// ```
274    #[must_use]
275    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
276        self.tags.extend(tags);
277        self
278    }
279
280    /// Parse one `host:port` line, including bracketed IPv6 addresses.
281    fn parse_host_port_line(line: &str) -> Option<(String, u16)> {
282        let line = line.trim();
283        if line.is_empty() || line.starts_with('#') {
284            return None;
285        }
286
287        let (host, port_str) = if line.starts_with('[') {
288            let end = line.find(']')?;
289            let host = line.get(..=end)?.trim();
290            let remainder = line.get(end + 1..)?.trim();
291            let (_, port_str) = remainder.rsplit_once(':')?;
292            (host, port_str.trim())
293        } else {
294            let (host, port_str) = line.rsplit_once(':')?;
295            let host = host.trim();
296            if host.contains(':') {
297                return None;
298            }
299            (host, port_str.trim())
300        };
301
302        if host.is_empty() || host == "[]" {
303            return None;
304        }
305
306        let port = port_str.parse::<u16>().ok()?;
307        if port == 0 {
308            return None;
309        }
310
311        Some((host.to_string(), port))
312    }
313
314    /// Fetch a single source, returning parsed proxies (empty on failure).
315    async fn fetch_source(&self, source: &FreeListSource) -> Vec<Proxy> {
316        let url = source.url();
317        let proxy_type = source.proxy_type();
318
319        let body = match self
320            .client
321            .get(url)
322            .timeout(Duration::from_secs(10))
323            .send()
324            .await
325        {
326            Ok(resp) if resp.status().is_success() => match resp.text().await {
327                Ok(t) => t,
328                Err(e) => {
329                    warn!("Failed to read body from {url}: {e}");
330                    return vec![];
331                }
332            },
333            Ok(resp) => {
334                warn!(
335                    "Non-success status {} fetching proxy list from {url}",
336                    resp.status()
337                );
338                return vec![];
339            }
340            Err(e) => {
341                warn!("Failed to fetch proxy list from {url}: {e}");
342                return vec![];
343            }
344        };
345
346        let proxies: Vec<Proxy> = body
347            .lines()
348            .filter_map(|line| {
349                let (host, port) = Self::parse_host_port_line(line)?;
350                let scheme = match proxy_type {
351                    ProxyType::Http => "http",
352                    ProxyType::Https => "https",
353                    #[cfg(feature = "socks")]
354                    ProxyType::Socks4 => "socks4",
355                    #[cfg(feature = "socks")]
356                    ProxyType::Socks5 => "socks5",
357                };
358                Some(Proxy {
359                    url: format!("{scheme}://{host}:{port}"),
360                    proxy_type,
361                    username: None,
362                    password: None,
363                    weight: 1,
364                    tags: self.tags.clone(),
365                })
366            })
367            .collect();
368
369        debug!(source = url, count = proxies.len(), "Fetched proxy list");
370        proxies
371    }
372}
373
374#[async_trait]
375impl ProxyFetcher for FreeListFetcher {
376    async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
377        if self.sources.is_empty() {
378            return Err(ProxyError::ConfigError(
379                "no sources configured for FreeListFetcher".into(),
380            ));
381        }
382
383        // Drive all source fetches concurrently.
384        let results = join_all(self.sources.iter().map(|s| self.fetch_source(s))).await;
385        let all: Vec<Proxy> = results.into_iter().flatten().collect();
386
387        if all.is_empty() {
388            return Err(ProxyError::FetchFailed {
389                origin: self
390                    .sources
391                    .iter()
392                    .map(FreeListSource::url)
393                    .collect::<Vec<_>>()
394                    .join(", "),
395                message: "all sources returned empty or failed".into(),
396            });
397        }
398
399        Ok(all)
400    }
401}
402
403// ─── Helper ───────────────────────────────────────────────────────────────────
404
405/// Fetch proxies from `fetcher` and add them all to `manager`.
406///
407/// Returns the number of proxies successfully added.  Individual `add_proxy`
408/// failures (e.g. duplicate URL) are logged as warnings and do not abort the
409/// load.
410///
411/// # Errors
412///
413/// Returns any [`ProxyError`] emitted by `fetcher.fetch()` if the fetcher
414/// itself fails.
415///
416/// # Example
417///
418/// ```no_run
419/// use std::sync::Arc;
420/// use stygian_proxy::{ProxyManager, storage::MemoryProxyStore, fetcher::{FreeListFetcher, FreeListSource, load_from_fetcher}};
421///
422/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
423/// let manager = ProxyManager::builder()
424///     .storage(Arc::new(MemoryProxyStore::default()))
425///     .build()?;
426/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
427/// let n = load_from_fetcher(&manager, &fetcher).await?;
428/// println!("Loaded {n} proxies");
429/// # Ok(())
430/// # }
431/// ```
432pub async fn load_from_fetcher(
433    manager: &ProxyManager,
434    fetcher: &dyn ProxyFetcher,
435) -> ProxyResult<usize> {
436    let proxies = fetcher.fetch().await?;
437    let total = proxies.len();
438    let mut loaded = 0usize;
439
440    for proxy in proxies {
441        match manager.add_proxy(proxy).await {
442            Ok(_) => loaded += 1,
443            Err(e) => warn!("Skipped proxy during load: {e}"),
444        }
445    }
446
447    debug!(total, loaded, "Proxy list loaded into manager");
448    Ok(loaded)
449}
450
451// ─── Tests ────────────────────────────────────────────────────────────────────
452
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the parser under test.
    fn parse(line: &str) -> Option<(String, u16)> {
        FreeListFetcher::parse_host_port_line(line)
    }

    /// Every source variant must resolve to a non-empty URL.
    #[test]
    fn free_list_source_url_is_nonempty() {
        // `mut` is only needed when the `socks` feature adds its variants.
        #[allow(unused_mut)]
        let mut sources = vec![
            FreeListSource::TheSpeedXHttp,
            FreeListSource::ClarketmHttp,
            FreeListSource::OpenProxyListHttp,
            FreeListSource::Custom {
                url: "https://example.com/proxies.txt".into(),
                proxy_type: ProxyType::Http,
            },
        ];
        #[cfg(feature = "socks")]
        sources.extend([
            FreeListSource::TheSpeedXSocks4,
            FreeListSource::TheSpeedXSocks5,
        ]);

        for src in &sources {
            assert!(
                !src.url().is_empty(),
                "FreeListSource::{src:?} has empty URL"
            );
        }
    }

    /// Built-in sources report the proxy type matching their feed.
    #[test]
    fn free_list_source_proxy_types() {
        let http_sources = [FreeListSource::TheSpeedXHttp, FreeListSource::ClarketmHttp];
        for source in &http_sources {
            assert_eq!(source.proxy_type(), ProxyType::Http);
        }

        #[cfg(feature = "socks")]
        {
            assert_eq!(
                FreeListSource::TheSpeedXSocks4.proxy_type(),
                ProxyType::Socks4
            );
            assert_eq!(
                FreeListSource::TheSpeedXSocks5.proxy_type(),
                ProxyType::Socks5
            );
        }
    }

    /// Comments, blank lines and junk are skipped; valid entries survive in
    /// order, including a bracketed IPv6 address.
    #[test]
    fn free_list_fetcher_parse_valid_lines() {
        let fetcher = FreeListFetcher::new(vec![]);
        // Run the per-line parsing over synthetic list content.
        let text = "1.2.3.4:8080\n# comment\n\nbad-line\n5.6.7.8:3128\n[2001:db8::1]:8081\n";

        let mut parsed: Vec<Proxy> = Vec::new();
        for line in text.lines() {
            if let Some((host, port)) = parse(line) {
                parsed.push(Proxy {
                    url: format!("http://{host}:{port}"),
                    proxy_type: ProxyType::Http,
                    username: None,
                    password: None,
                    weight: 1,
                    tags: fetcher.tags.clone(),
                });
            }
        }

        let urls: Vec<&str> = parsed.iter().map(|proxy| proxy.url.as_str()).collect();
        assert_eq!(
            urls,
            [
                "http://1.2.3.4:8080",
                "http://5.6.7.8:3128",
                "http://[2001:db8::1]:8081",
            ]
        );
    }

    /// `with_tags` keeps the default tag and adds the custom one.
    #[test]
    fn free_list_fetcher_with_tags_extends() {
        let f = FreeListFetcher::new(vec![]).with_tags(vec!["custom".into()]);
        for expected in ["free-list", "custom"] {
            assert!(f.tags.iter().any(|tag| tag == expected));
        }
    }

    /// Bad ports, empty hosts and unbracketed IPv6 are all rejected.
    #[test]
    fn free_list_fetcher_skips_invalid_port() {
        let rejected = [
            "1.2.3.4:notaport",
            "1.2.3.4:0",
            ":8080",
            "2001:db8::1:8080",
        ];
        for line in rejected {
            assert!(parse(line).is_none());
        }
    }

    /// Fetching with no sources must fail with `ConfigError`.
    #[test]
    fn free_list_fetcher_empty_sources_is_config_error()
    -> std::result::Result<(), Box<dyn std::error::Error>> {
        let fetcher = FreeListFetcher::new(vec![]);
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_time()
            .build()
            .map_err(|e| std::io::Error::other(format!("failed to build runtime for test: {e}")))?;

        let Err(err) = rt.block_on(fetcher.fetch()) else {
            return Err(std::io::Error::other("empty sources should fail").into());
        };

        match err {
            ProxyError::ConfigError(msg) => {
                assert!(msg.contains("no sources configured"));
                Ok(())
            }
            other => {
                Err(std::io::Error::other(format!("unexpected error variant: {other}")).into())
            }
        }
    }

    /// The `FetchFailed` display output mentions both origin and message.
    #[test]
    fn proxy_error_fetch_failed_display() {
        let e = ProxyError::FetchFailed {
            origin: "https://example.com".into(),
            message: "timed out".into(),
        };
        let rendered = e.to_string();
        assert!(rendered.contains("https://example.com"));
        assert!(rendered.contains("timed out"));
    }
}