Skip to main content

crw_core/
proxy.rs

1//! Proxy list + rotation primitives shared across the HTTP, crawl, and CDP
2//! paths.
3//!
4//! A [`ProxyRotator`] holds a set of validated [`ProxyEntry`] and selects one
5//! per request according to a [`ProxyRotation`] strategy. The rotator is built
6//! once (from config) or per request (BYOP) and is cheap to share behind an
7//! `Arc`.
8//!
9//! # Safety
10//!
11//! Proxy URLs are validated up front via [`ProxyEntry::parse`]. A malformed
12//! entry is a hard error — we never silently fall back to a direct (no-proxy)
13//! connection, which would leak the host's real IP. Callers map the returned
14//! error string to the appropriate [`crate::CrwError`] variant
15//! (`ConfigError` at startup, `InvalidRequest` for per-request BYOP).
16
17use std::sync::atomic::{AtomicUsize, Ordering};
18
19use serde::{Deserialize, Serialize};
20
/// Strategy for selecting a proxy from a [`ProxyRotator`]'s pool.
///
/// Serialized/deserialized with `snake_case` variant names
/// (`round_robin`, `random`, `sticky_per_host`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProxyRotation {
    /// Cycle through the pool in order, one step per pick. The cursor lives in
    /// the rotator, so every user of the same [`ProxyRotator`] shares one
    /// sequence.
    RoundRobin,
    /// Pick a uniformly random entry per request.
    Random,
    /// Pin each target host to a single proxy for the rotator's lifetime.
    /// Default: keeps cookie/TLS sessions coherent per host (anti-bot systems
    /// flag mid-session IP changes), while still spreading load across hosts.
    /// When no host key is supplied, selection falls back to the round-robin
    /// cursor.
    #[default]
    StickyPerHost,
}
35
/// A single validated proxy endpoint.
///
/// `raw` carries the full URL (including any `user:pass`) for `reqwest`, which
/// honours embedded credentials. `chrome_proxy_server` is the scheme-qualified
/// `host[:port]` **without** credentials, suitable for Chrome's
/// `Target.createBrowserContext { proxyServer }` (Chrome takes creds via the
/// `Fetch.authRequired` auth pump, not in the URL).
///
/// NOTE: the derived `Debug` output includes `raw` and `auth`, i.e. any proxy
/// credentials — avoid logging entries with `{:?}`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProxyEntry {
    /// Trimmed input URL exactly as supplied, credentials included.
    raw: String,
    /// Lowercased URL scheme; always one of `ALLOWED_SCHEMES`.
    scheme: String,
    /// `scheme://host[:port]` with no userinfo; `socks5h` is rewritten to
    /// `socks5` here because that is the form handed to Chrome.
    chrome_proxy_server: String,
    /// Percent-decoded `(username, password)` taken from the URL's userinfo
    /// during parsing, or `None` when no credentials were extracted.
    auth: Option<(String, String)>,
}
50
/// Proxy URL schemes accepted by [`ProxyEntry::parse`]; any other scheme is
/// rejected. The "unsupported proxy scheme" error message spells this list out
/// by hand, so keep the two in sync.
const ALLOWED_SCHEMES: [&str; 4] = ["http", "https", "socks5", "socks5h"];
52
53impl ProxyEntry {
54    /// Parse and validate a proxy URL. Returns an error string (no silent
55    /// fallback) when the scheme is unsupported or the host is missing.
56    pub fn parse(raw: &str) -> Result<Self, String> {
57        let trimmed = raw.trim();
58        if trimmed.is_empty() {
59            return Err("empty proxy URL".to_string());
60        }
61        let url =
62            url::Url::parse(trimmed).map_err(|e| format!("invalid proxy URL '{trimmed}': {e}"))?;
63
64        let scheme = url.scheme().to_ascii_lowercase();
65        if !ALLOWED_SCHEMES.contains(&scheme.as_str()) {
66            return Err(format!(
67                "unsupported proxy scheme '{scheme}' in '{trimmed}' (allowed: http, https, socks5, socks5h)"
68            ));
69        }
70
71        let host = url
72            .host_str()
73            .ok_or_else(|| format!("proxy URL '{trimmed}' has no host"))?;
74
75        // Chrome's `proxyServer` only understands `socks5` (which already does
76        // remote DNS) — it does not recognize the `socks5h` scheme. Normalize so
77        // the CDP path passes a scheme Chrome accepts. (`reqwest`/`raw` keeps the
78        // original scheme for the HTTP path.)
79        let chrome_scheme = if scheme == "socks5h" {
80            "socks5"
81        } else {
82            &scheme
83        };
84        let chrome_proxy_server = match url.port() {
85            Some(port) => format!("{chrome_scheme}://{host}:{port}"),
86            None => format!("{chrome_scheme}://{host}"),
87        };
88
89        let auth = match (url.username(), url.password()) {
90            ("", _) => None,
91            (user, Some(pass)) => Some((percent_decode(user), percent_decode(pass))),
92            (user, None) => Some((percent_decode(user), String::new())),
93        };
94
95        Ok(Self {
96            raw: trimmed.to_string(),
97            scheme,
98            chrome_proxy_server,
99            auth,
100        })
101    }
102
103    /// Full proxy URL (with credentials) for `reqwest::Proxy::all`.
104    pub fn raw(&self) -> &str {
105        &self.raw
106    }
107
108    /// URL scheme (lowercased): `http`, `https`, `socks5`, or `socks5h`.
109    pub fn scheme(&self) -> &str {
110        &self.scheme
111    }
112
113    /// Scheme-qualified `host:port` (no credentials) for Chrome `proxyServer`.
114    pub fn chrome_proxy_server(&self) -> &str {
115        &self.chrome_proxy_server
116    }
117
118    /// Optional `(username, password)` for the CDP auth pump.
119    pub fn auth(&self) -> Option<&(String, String)> {
120        self.auth.as_ref()
121    }
122
123    /// Whether this proxy can authenticate on the Chrome/CDP path. Chrome's
124    /// network stack never emits `Fetch.authRequired` for SOCKS proxies, so a
125    /// `socks5`/`socks5h` proxy that carries credentials cannot authenticate via
126    /// the CDP auth pump (it would hang/fail). HTTP/HTTPS proxies and
127    /// credential-less SOCKS proxies are fine. The HTTP (reqwest) path is
128    /// unaffected — it authenticates SOCKS natively via [`Self::raw`].
129    pub fn supports_cdp_auth(&self) -> bool {
130        !(self.scheme.starts_with("socks") && self.auth.is_some())
131    }
132}
133
134/// Minimal percent-decoding for proxy userinfo (handles `%XX`). Credentials
135/// frequently contain URL-encoded characters; `url` exposes them encoded.
136fn percent_decode(s: &str) -> String {
137    let bytes = s.as_bytes();
138    let mut out = Vec::with_capacity(bytes.len());
139    let mut i = 0;
140    while i < bytes.len() {
141        if bytes[i] == b'%'
142            && i + 2 < bytes.len()
143            && let (Some(h), Some(l)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2]))
144        {
145            out.push(h << 4 | l);
146            i += 3;
147            continue;
148        }
149        out.push(bytes[i]);
150        i += 1;
151    }
152    String::from_utf8_lossy(&out).into_owned()
153}
154
/// Numeric value (0–15) of an ASCII hex digit, accepting both cases; `None`
/// for every other byte.
fn hex_val(b: u8) -> Option<u8> {
    // `to_digit(16)` accepts exactly `0-9`, `a-f` and `A-F`; the result is
    // always below 16, so narrowing to `u8` cannot truncate.
    (b as char).to_digit(16).map(|digit| digit as u8)
}
163
/// A pool of validated proxies plus a selection strategy.
///
/// Construct via [`ProxyRotator::build`], which returns `Ok(None)` when there
/// are no proxies (caller then connects directly, preserving today's
/// behaviour).
#[derive(Debug)]
pub struct ProxyRotator {
    /// Validated pool; non-empty for every rotator produced by `build`.
    entries: Vec<ProxyEntry>,
    /// How `pick`/`pick_index` choose an entry from `entries`.
    strategy: ProxyRotation,
    /// Monotonic pick counter, reduced modulo the pool size on use. Advanced
    /// by `RoundRobin` picks and by `StickyPerHost` picks that have no host key.
    rr_cursor: AtomicUsize,
}
174
175impl ProxyRotator {
176    /// Build a rotator with precedence: a non-empty `list` wins; otherwise the
177    /// single `single` proxy becomes a pool of one; otherwise `Ok(None)`.
178    ///
179    /// Every entry is validated — any malformed URL is a hard error (no silent
180    /// no-proxy fallback). The error is a human-readable string the caller maps
181    /// to a [`crate::CrwError`].
182    pub fn build(
183        list: &[String],
184        single: Option<&str>,
185        strategy: ProxyRotation,
186    ) -> Result<Option<Self>, String> {
187        let raws: Vec<&str> = if !list.is_empty() {
188            list.iter().map(String::as_str).collect()
189        } else if let Some(s) = single.map(str::trim).filter(|s| !s.is_empty()) {
190            vec![s]
191        } else {
192            return Ok(None);
193        };
194
195        let mut entries = Vec::with_capacity(raws.len());
196        for raw in raws {
197            entries.push(ProxyEntry::parse(raw)?);
198        }
199        if entries.is_empty() {
200            return Ok(None);
201        }
202
203        Ok(Some(Self {
204            entries,
205            strategy,
206            rr_cursor: AtomicUsize::new(0),
207        }))
208    }
209
210    /// Number of proxies in the pool.
211    pub fn len(&self) -> usize {
212        self.entries.len()
213    }
214
215    /// Always false — `build` returns `None` for empty pools.
216    pub fn is_empty(&self) -> bool {
217        self.entries.is_empty()
218    }
219
220    /// Select a proxy for a request. `host_key` is used only by
221    /// [`ProxyRotation::StickyPerHost`]; pass the normalized target host.
222    pub fn pick(&self, host_key: Option<&str>) -> &ProxyEntry {
223        &self.entries[self.pick_index(host_key)]
224    }
225
226    /// Index into the validated pool for this request, applying the strategy.
227    ///
228    /// `StickyPerHost` is **stateless**: the index is a deterministic hash of the
229    /// host modulo the pool size. This keeps a host pinned to one proxy for the
230    /// rotator's lifetime with no per-host map (no unbounded growth, no lock, and
231    /// — crucially — no cursor side-effect, so repeated picks for the same host
232    /// are idempotent and HTTP + CDP always agree).
233    pub fn pick_index(&self, host_key: Option<&str>) -> usize {
234        let len = self.entries.len();
235        if len == 0 {
236            return 0; // unreachable: `build` never yields an empty rotator.
237        }
238        match self.strategy {
239            ProxyRotation::RoundRobin => self.next_rr() % len,
240            ProxyRotation::Random => rand::random_range(0..len),
241            ProxyRotation::StickyPerHost => match host_key {
242                Some(host) => (fnv1a(host) % len as u64) as usize,
243                None => self.next_rr() % len,
244            },
245        }
246    }
247
248    fn next_rr(&self) -> usize {
249        self.rr_cursor.fetch_add(1, Ordering::Relaxed)
250    }
251}
252
/// 64-bit FNV-1a over the UTF-8 bytes of `s`.
///
/// Chosen for stateless sticky-per-host assignment because it is tiny, has no
/// dependencies, and produces the same value in every process.
fn fnv1a(s: &str) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a order: xor the byte in first, then multiply by the prime.
    s.bytes().fold(OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
263
#[cfg(test)]
mod tests {
    use super::*;

    /// Owned URL list in the shape `ProxyRotator::build` expects.
    fn urls(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Build a list-only rotator, panicking if the pool is rejected or empty.
    fn rotator(items: &[&str], strategy: ProxyRotation) -> ProxyRotator {
        ProxyRotator::build(&urls(items), None, strategy)
            .expect("pool should validate")
            .expect("pool should be non-empty")
    }

    const ABC: [&str; 3] = ["http://a:1", "http://b:2", "http://c:3"];
    const AB: [&str; 2] = ["http://a:1", "http://b:2"];

    #[test]
    fn parse_http_with_auth() {
        let input = "http://user:pass@host.example:8080";
        let entry = ProxyEntry::parse(input).unwrap();
        let creds = ("user".to_string(), "pass".to_string());

        assert_eq!(entry.scheme(), "http");
        assert_eq!(entry.chrome_proxy_server(), "http://host.example:8080");
        assert_eq!(entry.auth(), Some(&creds));
        assert_eq!(entry.raw(), input);
    }

    #[test]
    fn parse_socks5_no_auth() {
        let input = "socks5://1.2.3.4:1080";
        let entry = ProxyEntry::parse(input).unwrap();

        assert_eq!(entry.scheme(), "socks5");
        assert_eq!(entry.chrome_proxy_server(), input);
        assert!(entry.auth().is_none());
    }

    #[test]
    fn parse_percent_encoded_auth() {
        let entry = ProxyEntry::parse("http://u%40b:p%3Aw@h:8080").unwrap();
        let creds = ("u@b".to_string(), "p:w".to_string());
        assert_eq!(entry.auth(), Some(&creds));
    }

    #[test]
    fn parse_rejects_bad_scheme() {
        // Unsupported scheme, not a URL at all, and blank input.
        for input in ["ftp://h:21", "not a url", ""] {
            assert!(ProxyEntry::parse(input).is_err(), "accepted {input:?}");
        }
    }

    #[test]
    fn build_empty_is_none() {
        // Neither an absent nor a blank single proxy produces a rotator.
        for single in [None, Some("  ")] {
            let built = ProxyRotator::build(&[], single, ProxyRotation::RoundRobin).unwrap();
            assert!(built.is_none(), "single = {single:?}");
        }
    }

    #[test]
    fn build_single_is_pool_of_one() {
        let only = "http://h:8080";
        let r = ProxyRotator::build(&[], Some(only), ProxyRotation::RoundRobin)
            .unwrap()
            .unwrap();

        assert_eq!(r.len(), 1);
        assert_eq!(r.pick(None).chrome_proxy_server(), only);
    }

    #[test]
    fn build_list_wins_over_single() {
        let list = urls(&AB);
        let r = ProxyRotator::build(&list, Some("http://single:9"), ProxyRotation::RoundRobin)
            .unwrap()
            .unwrap();
        assert_eq!(r.len(), 2);
    }

    #[test]
    fn build_bad_entry_is_hard_error() {
        let list = urls(&["http://ok:1", "ftp://bad:2"]);
        let built = ProxyRotator::build(&list, None, ProxyRotation::RoundRobin);
        assert!(built.is_err());
    }

    #[test]
    fn round_robin_cycles_in_order() {
        let r = rotator(&ABC, ProxyRotation::RoundRobin);
        // Three picks walk the pool; the fourth wraps back to the start.
        let seq: Vec<&str> = std::iter::repeat_with(|| r.pick(None).raw())
            .take(4)
            .collect();
        assert_eq!(seq, ["http://a:1", "http://b:2", "http://c:3", "http://a:1"]);
    }

    #[test]
    fn random_stays_in_bounds() {
        let r = rotator(&AB, ProxyRotation::Random);
        for _ in 0..100 {
            let raw = r.pick(None).raw();
            assert!(matches!(raw, "http://a:1" | "http://b:2"));
        }
    }

    #[test]
    fn sticky_pins_host_to_one_proxy() {
        let r = rotator(&ABC, ProxyRotation::StickyPerHost);
        // Different hosts may land on different proxies, but each host is
        // itself stable across repeated picks.
        for host in ["example.com", "other.com"] {
            let pinned = r.pick(Some(host)).raw().to_string();
            for _ in 0..50 {
                assert_eq!(r.pick(Some(host)).raw(), pinned);
            }
        }
    }

    #[test]
    fn default_strategy_is_sticky() {
        assert_eq!(ProxyRotation::default(), ProxyRotation::StickyPerHost);
    }

    #[test]
    fn socks5h_maps_to_socks5_for_chrome() {
        let entry = ProxyEntry::parse("socks5h://host:1080").unwrap();
        // The reqwest side keeps the original scheme; Chrome gets `socks5`.
        assert_eq!(entry.scheme(), "socks5h");
        assert_eq!(entry.chrome_proxy_server(), "socks5://host:1080");
    }

    #[test]
    fn socks_with_auth_unsupported_on_cdp() {
        // Only SOCKS + credentials is unusable on CDP; credential-less SOCKS
        // and HTTP (with auth) are fine.
        let cases = [
            ("socks5://user:pass@host:1080", false),
            ("socks5h://user:pass@host:1080", false),
            ("socks5://host:1080", true),
            ("http://user:pass@host:8080", true),
        ];
        for (input, expected) in cases {
            let entry = ProxyEntry::parse(input).unwrap();
            assert_eq!(entry.supports_cdp_auth(), expected, "{input}");
        }
    }

    #[test]
    fn sticky_is_stateless_and_deterministic() {
        // Two independent rotators over the same pool must map a host to the
        // same proxy (stickiness is a pure hash, not per-instance state).
        let host = Some("example.com");
        let r1 = rotator(&ABC, ProxyRotation::StickyPerHost);
        let r2 = rotator(&ABC, ProxyRotation::StickyPerHost);
        assert_eq!(r1.pick(host).raw(), r2.pick(host).raw());

        // Repeated picks on one rotator keep returning the same entry.
        let pinned = r1.pick(host).raw().to_string();
        for _ in 0..10 {
            assert_eq!(r1.pick(host).raw(), pinned);
        }
    }

    #[test]
    fn round_robin_advances_exactly_once_per_pick() {
        let r = rotator(&AB, ProxyRotation::RoundRobin);
        // Each pick moves exactly one step (a→b→a→b).
        let seen: Vec<usize> = (0..4).map(|_| r.pick_index(None)).collect();
        assert_eq!(seen, [0, 1, 0, 1]);
    }
}
455}