Skip to main content

stygian_proxy/
fetcher.rs

1//! Proxy list fetching — port trait and free-list HTTP adapter.
2//!
3//! [`ProxyFetcher`] is the port trait.  Implement it to pull proxies from any
4//! source (remote HTTP list, database, commercial API, etc.) and integrate with
5//! [`ProxyManager`] via [`load_from_fetcher`].
6//!
7//! The built-in [`FreeListFetcher`] downloads plain-text `host:port` proxy
8//! lists from public URLs (e.g. the `TheSpeedX/PROXY-List` feeds on GitHub)
9//! and parses them into [`Proxy`] records.  It is suitable for development,
10//! testing, and low-stakes scraping where proxy quality is less critical.
11//!
12//! ## Example — load from a free list and populate the pool
13//!
14//! ```no_run
15//! use std::sync::Arc;
16//! use stygian_proxy::{
17//!     ProxyManager,
18//!     storage::MemoryProxyStore,
19//!     fetcher::{FreeListFetcher, ProxyFetcher, FreeListSource},
20//! };
21//!
22//! # async fn run() -> stygian_proxy::error::ProxyResult<()> {
23//! let fetcher = FreeListFetcher::new(vec![
24//!     FreeListSource::TheSpeedXHttp,
25//! ]);
26//!
27//! let manager = ProxyManager::builder()
28//!     .storage(Arc::new(MemoryProxyStore::default()))
29//!     .build()?;
30//! let loaded = stygian_proxy::fetcher::load_from_fetcher(&manager, &fetcher).await?;
31//! println!("Loaded {loaded} proxies");
32//! # Ok(())
33//! # }
34//! ```
35
36use std::time::Duration;
37
38use async_trait::async_trait;
39use futures::future::join_all;
40use reqwest::Client;
41use tracing::{debug, warn};
42
43use crate::{
44    Proxy, ProxyManager, ProxyType,
45    error::{ProxyError, ProxyResult},
46};
47
48// ─── Port trait ───────────────────────────────────────────────────────────────
49
50/// A source that can produce a list of [`Proxy`] records asynchronously.
51///
52/// Implement this trait to integrate any proxy source (remote HTTP list,
53/// commercial API, database, file) with [`load_from_fetcher`].
54///
55/// # Example
56///
57/// ```
58/// use async_trait::async_trait;
59/// use stygian_proxy::{Proxy, ProxyType};
60/// use stygian_proxy::fetcher::ProxyFetcher;
61/// use stygian_proxy::error::ProxyResult;
62///
63/// struct MyStaticFetcher;
64///
65/// #[async_trait]
66/// impl ProxyFetcher for MyStaticFetcher {
67///     async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
68///         Ok(vec![Proxy {
69///             url: "http://192.168.1.1:8080".into(),
70///             proxy_type: ProxyType::Http,
71///             username: None,
72///             password: None,
73///             weight: 1,
74///             tags: vec!["static".into()],
75///         }])
76///     }
77/// }
78/// ```
79#[async_trait]
80pub trait ProxyFetcher: Send + Sync {
81    /// Fetch the current proxy list.
82    ///
83    /// # Errors
84    ///
85    /// Returns [`ProxyError::FetchFailed`] if the source is unreachable or
86    /// returns malformed data.
87    async fn fetch(&self) -> ProxyResult<Vec<Proxy>>;
88}
89
90// ─── Free-list sources ────────────────────────────────────────────────────────
91
92/// A well-known free/public proxy list feed.
93///
94/// These lists are community-maintained and quality varies.  They are suitable
95/// for development and testing.  For production use, prefer a commercial
96/// provider adapter.
97///
98/// # Example
99///
100/// ```
101/// use stygian_proxy::fetcher::FreeListSource;
102/// let _src = FreeListSource::TheSpeedXHttp;
103/// ```
104#[derive(Debug, Clone, PartialEq, Eq)]
105#[non_exhaustive]
106pub enum FreeListSource {
107    /// HTTP proxies from `TheSpeedX/PROXY-List` (GitHub, plain `host:port`).
108    TheSpeedXHttp,
109    #[cfg(feature = "socks")]
110    /// SOCKS4 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
111    TheSpeedXSocks4,
112    #[cfg(feature = "socks")]
113    /// SOCKS5 proxies from `TheSpeedX/PROXY-List` (requires the `socks` feature).
114    TheSpeedXSocks5,
115    /// HTTP proxies from `clarketm/proxy-list` (GitHub, plain `host:port`).
116    ClarketmHttp,
117    /// Mixed HTTP proxies from `openproxylist.xyz`.
118    OpenProxyListHttp,
119    /// A custom URL.  Content must be one `host:port` entry per line.
120    Custom {
121        /// The URL to fetch.
122        url: String,
123        /// The [`ProxyType`] to assign all parsed entries.
124        proxy_type: ProxyType,
125    },
126}
127
128impl FreeListSource {
129    fn url(&self) -> &str {
130        match self {
131            Self::TheSpeedXHttp => {
132                "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"
133            }
134            #[cfg(feature = "socks")]
135            Self::TheSpeedXSocks4 => {
136                "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"
137            }
138            #[cfg(feature = "socks")]
139            Self::TheSpeedXSocks5 => {
140                "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"
141            }
142            Self::ClarketmHttp => {
143                "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt"
144            }
145            Self::OpenProxyListHttp => "https://openproxylist.xyz/http.txt",
146            Self::Custom { url, .. } => url.as_str(),
147        }
148    }
149
150    fn proxy_type(&self) -> ProxyType {
151        match self {
152            Self::TheSpeedXHttp | Self::ClarketmHttp | Self::OpenProxyListHttp => ProxyType::Http,
153            #[cfg(feature = "socks")]
154            Self::TheSpeedXSocks4 => ProxyType::Socks4,
155            #[cfg(feature = "socks")]
156            Self::TheSpeedXSocks5 => ProxyType::Socks5,
157            Self::Custom { proxy_type, .. } => *proxy_type,
158        }
159    }
160}
161
162// ─── FreeListFetcher ──────────────────────────────────────────────────────────
163
164/// Fetches plain-text `host:port` proxy lists from one or more public URLs.
165///
166/// Each source is fetched concurrently.  Lines that do not parse as valid
167/// `host:port` entries are silently skipped.  An empty or unreachable source
168/// logs a warning but does not fail the entire fetch — at least one source
169/// must return results for the call to succeed.
170///
171/// # Example
172///
173/// ```no_run
174/// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource, ProxyFetcher};
175///
176/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
177/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
178/// let proxies = fetcher.fetch().await?;
179/// println!("Got {} proxies", proxies.len());
180/// # Ok(())
181/// # }
182/// ```
183pub struct FreeListFetcher {
184    sources: Vec<FreeListSource>,
185    client: Client,
186    tags: Vec<String>,
187}
188
189impl FreeListFetcher {
190    /// Create a fetcher for the given sources with default HTTP client settings
191    /// (10 s timeout, TLS enabled).
192    ///
193    /// # Example
194    ///
195    /// ```
196    /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
197    /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
198    /// ```
199    pub fn new(sources: Vec<FreeListSource>) -> Self {
200        let client = Client::builder()
201            .timeout(Duration::from_secs(10))
202            .build()
203            .unwrap_or_else(|e| {
204                warn!("Failed to build HTTP client with 10 s timeout (TLS backend issue?): {e}; falling back to default client with per-request timeout enforcement");
205                Client::default()
206            });
207        Self {
208            sources,
209            client,
210            tags: vec!["free-list".into()],
211        }
212    }
213
214    /// Attach extra tags to every proxy produced by this fetcher.
215    ///
216    /// # Example
217    ///
218    /// ```
219    /// use stygian_proxy::fetcher::{FreeListFetcher, FreeListSource};
220    /// let _f = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp])
221    ///     .with_tags(vec!["dev".into(), "http".into()]);
222    /// ```
223    #[must_use]
224    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
225        self.tags.extend(tags);
226        self
227    }
228
229    /// Parse one `host:port` line, including bracketed IPv6 addresses.
230    fn parse_host_port_line(line: &str) -> Option<(String, u16)> {
231        let line = line.trim();
232        if line.is_empty() || line.starts_with('#') {
233            return None;
234        }
235
236        let (host, port_str) = if line.starts_with('[') {
237            let end = line.find(']')?;
238            let host = line.get(..=end)?.trim();
239            let remainder = line.get(end + 1..)?.trim();
240            let (_, port_str) = remainder.rsplit_once(':')?;
241            (host, port_str.trim())
242        } else {
243            let (host, port_str) = line.rsplit_once(':')?;
244            let host = host.trim();
245            if host.contains(':') {
246                return None;
247            }
248            (host, port_str.trim())
249        };
250
251        if host.is_empty() || host == "[]" {
252            return None;
253        }
254
255        let port = port_str.parse::<u16>().ok()?;
256        if port == 0 {
257            return None;
258        }
259
260        Some((host.to_string(), port))
261    }
262
263    /// Fetch a single source, returning parsed proxies (empty on failure).
264    async fn fetch_source(&self, source: &FreeListSource) -> Vec<Proxy> {
265        let url = source.url();
266        let proxy_type = source.proxy_type();
267
268        let body = match self
269            .client
270            .get(url)
271            .timeout(Duration::from_secs(10))
272            .send()
273            .await
274        {
275            Ok(resp) if resp.status().is_success() => match resp.text().await {
276                Ok(t) => t,
277                Err(e) => {
278                    warn!("Failed to read body from {url}: {e}");
279                    return vec![];
280                }
281            },
282            Ok(resp) => {
283                warn!(
284                    "Non-success status {} fetching proxy list from {url}",
285                    resp.status()
286                );
287                return vec![];
288            }
289            Err(e) => {
290                warn!("Failed to fetch proxy list from {url}: {e}");
291                return vec![];
292            }
293        };
294
295        let proxies: Vec<Proxy> = body
296            .lines()
297            .filter_map(|line| {
298                let (host, port) = Self::parse_host_port_line(line)?;
299                let scheme = match proxy_type {
300                    ProxyType::Http => "http",
301                    ProxyType::Https => "https",
302                    #[cfg(feature = "socks")]
303                    ProxyType::Socks4 => "socks4",
304                    #[cfg(feature = "socks")]
305                    ProxyType::Socks5 => "socks5",
306                };
307                Some(Proxy {
308                    url: format!("{scheme}://{host}:{port}"),
309                    proxy_type,
310                    username: None,
311                    password: None,
312                    weight: 1,
313                    tags: self.tags.clone(),
314                })
315            })
316            .collect();
317
318        debug!(source = url, count = proxies.len(), "Fetched proxy list");
319        proxies
320    }
321}
322
323#[async_trait]
324impl ProxyFetcher for FreeListFetcher {
325    async fn fetch(&self) -> ProxyResult<Vec<Proxy>> {
326        if self.sources.is_empty() {
327            return Err(ProxyError::ConfigError(
328                "no sources configured for FreeListFetcher".into(),
329            ));
330        }
331
332        // Drive all source fetches concurrently.
333        let results = join_all(self.sources.iter().map(|s| self.fetch_source(s))).await;
334        let all: Vec<Proxy> = results.into_iter().flatten().collect();
335
336        if all.is_empty() {
337            return Err(ProxyError::FetchFailed {
338                origin: self
339                    .sources
340                    .iter()
341                    .map(|s| s.url())
342                    .collect::<Vec<_>>()
343                    .join(", "),
344                message: "all sources returned empty or failed".into(),
345            });
346        }
347
348        Ok(all)
349    }
350}
351
352// ─── Helper ───────────────────────────────────────────────────────────────────
353
354/// Fetch proxies from `fetcher` and add them all to `manager`.
355///
356/// Returns the number of proxies successfully added.  Individual `add_proxy`
357/// failures (e.g. duplicate URL) are logged as warnings and do not abort the
358/// load.
359///
360/// # Errors
361///
362/// Returns any [`ProxyError`] emitted by `fetcher.fetch()` if the fetcher
363/// itself fails.
364///
365/// # Example
366///
367/// ```no_run
368/// use std::sync::Arc;
369/// use stygian_proxy::{ProxyManager, storage::MemoryProxyStore, fetcher::{FreeListFetcher, FreeListSource, load_from_fetcher}};
370///
371/// # async fn run() -> stygian_proxy::error::ProxyResult<()> {
372/// let manager = ProxyManager::builder()
373///     .storage(Arc::new(MemoryProxyStore::default()))
374///     .build()?;
375/// let fetcher = FreeListFetcher::new(vec![FreeListSource::TheSpeedXHttp]);
376/// let n = load_from_fetcher(&manager, &fetcher).await?;
377/// println!("Loaded {n} proxies");
378/// # Ok(())
379/// # }
380/// ```
381pub async fn load_from_fetcher(
382    manager: &ProxyManager,
383    fetcher: &dyn ProxyFetcher,
384) -> ProxyResult<usize> {
385    let proxies = fetcher.fetch().await?;
386    let total = proxies.len();
387    let mut loaded = 0usize;
388
389    for proxy in proxies {
390        match manager.add_proxy(proxy).await {
391            Ok(_) => loaded += 1,
392            Err(e) => warn!("Skipped proxy during load: {e}"),
393        }
394    }
395
396    debug!(total, loaded, "Proxy list loaded into manager");
397    Ok(loaded)
398}
399
400// ─── Tests ────────────────────────────────────────────────────────────────────
401
#[cfg(test)]
mod tests {
    use super::*;

    /// Every built-in feed, and a custom one, must resolve to a non-empty URL.
    #[test]
    fn free_list_source_url_is_nonempty() {
        // `mut` is only exercised when the SOCKS feeds are compiled in.
        #[cfg_attr(not(feature = "socks"), allow(unused_mut))]
        let mut sources = vec![
            FreeListSource::TheSpeedXHttp,
            FreeListSource::ClarketmHttp,
            FreeListSource::OpenProxyListHttp,
            FreeListSource::Custom {
                url: "https://example.com/proxies.txt".into(),
                proxy_type: ProxyType::Http,
            },
        ];
        #[cfg(feature = "socks")]
        {
            sources.push(FreeListSource::TheSpeedXSocks4);
            sources.push(FreeListSource::TheSpeedXSocks5);
        }

        for src in &sources {
            assert!(
                !src.url().is_empty(),
                "FreeListSource::{src:?} has empty URL"
            );
        }
    }

    /// Built-in feeds report the proxy type their name advertises.
    #[test]
    fn free_list_source_proxy_types() {
        for http_source in [FreeListSource::TheSpeedXHttp, FreeListSource::ClarketmHttp] {
            assert_eq!(http_source.proxy_type(), ProxyType::Http);
        }
        #[cfg(feature = "socks")]
        {
            assert_eq!(
                FreeListSource::TheSpeedXSocks4.proxy_type(),
                ProxyType::Socks4
            );
            assert_eq!(
                FreeListSource::TheSpeedXSocks5.proxy_type(),
                ProxyType::Socks5
            );
        }
    }

    /// Comments, blank lines and junk are dropped; valid entries keep order.
    #[test]
    fn free_list_fetcher_parse_valid_lines() {
        let fetcher = FreeListFetcher::new(vec![]);
        // Run the same line-to-proxy mapping the fetcher uses, on synthetic text.
        let text = "1.2.3.4:8080\n# comment\n\nbad-line\n5.6.7.8:3128\n[2001:db8::1]:8081\n";
        let parsed: Vec<Proxy> = text
            .lines()
            .filter_map(FreeListFetcher::parse_host_port_line)
            .map(|(host, port)| Proxy {
                url: format!("http://{host}:{port}"),
                proxy_type: ProxyType::Http,
                username: None,
                password: None,
                weight: 1,
                tags: fetcher.tags.clone(),
            })
            .collect();

        let urls: Vec<&str> = parsed.iter().map(|proxy| proxy.url.as_str()).collect();
        assert_eq!(
            urls,
            [
                "http://1.2.3.4:8080",
                "http://5.6.7.8:3128",
                "http://[2001:db8::1]:8081",
            ]
        );
    }

    /// `with_tags` adds to the default tag instead of replacing it.
    #[test]
    fn free_list_fetcher_with_tags_extends() {
        let f = FreeListFetcher::new(vec![]).with_tags(vec!["custom".into()]);
        for expected in ["free-list", "custom"] {
            assert!(f.tags.iter().any(|tag| tag == expected));
        }
    }

    /// Bad ports, empty hosts and bare IPv6 addresses are all rejected.
    #[test]
    fn free_list_fetcher_skips_invalid_port() {
        let rejected = [
            "1.2.3.4:notaport",
            "1.2.3.4:0",
            ":8080",
            "2001:db8::1:8080",
        ];
        for line in rejected {
            assert!(FreeListFetcher::parse_host_port_line(line).is_none());
        }
    }

    /// Fetching with no sources is a configuration error, not a fetch failure.
    #[test]
    fn free_list_fetcher_empty_sources_is_config_error() {
        let fetcher = FreeListFetcher::new(vec![]);
        let rt = match tokio::runtime::Builder::new_current_thread()
            .enable_time()
            .build()
        {
            Ok(rt) => rt,
            Err(e) => panic!("failed to build runtime for test: {e}"),
        };

        let err = rt
            .block_on(fetcher.fetch())
            .expect_err("empty sources should fail");
        if let ProxyError::ConfigError(msg) = &err {
            assert!(msg.contains("no sources configured"));
        } else {
            panic!("unexpected error variant: {err}");
        }
    }

    /// The `FetchFailed` display output carries both origin and message.
    #[test]
    fn proxy_error_fetch_failed_display() {
        let e = ProxyError::FetchFailed {
            origin: "https://example.com".into(),
            message: "timed out".into(),
        };
        let rendered = e.to_string();
        assert!(rendered.contains("https://example.com"));
        assert!(rendered.contains("timed out"));
    }
}
527}