Skip to main content

adler_core/
access.rs

1//! Per-site access policy and the egress (proxy) model.
2//!
3//! Access-engine phase 3: route the raw-HTTP probe path through a
4//! geo / IP-type-appropriate egress. A site declares what it needs via
5//! [`AccessPolicy`] (e.g. "only reachable from a Polish residential
6//! IP"); the client matches that against a configured pool of
7//! [`EgressSpec`]s. If the policy is unconstrained the request uses the
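//! client's default egress (direct, or the global `--proxy`); if it
//! carries a hard constraint (`geo` / `ip_type`) but nothing in the pool
//! fits, the probe is reported as `Uncertain(GeoUnavailable)` — **never**
//! a false `NotFound`, since "couldn't reach from the required location"
//! is not "account absent". A soft `prefer_geo` preference on its own
//! never blocks a probe: with no match it falls back to the default
//! egress.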
12//!
13//! The browser transport keeps its backend's own egress; this phase
14//! routes the HTTP path only.
15
16use std::collections::{BTreeMap, HashMap};
17use std::fmt;
18use std::sync::Arc;
19
20use serde::{Deserialize, Serialize};
21
22use crate::transport::HttpFetcher;
23
24/// ISO-3166-1 alpha-2 country code, stored lowercased (e.g. `pl`, `de`).
25/// A newtype so a geo requirement can't be confused with an arbitrary
26/// string and is validated at the boundary.
27#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
28#[serde(try_from = "String", into = "String")]
29pub struct CountryCode([u8; 2]);
30
31impl CountryCode {
32    /// Parse a two-letter code, lowercasing ASCII. `None` for anything
33    /// that isn't exactly two ASCII letters.
34    #[must_use]
35    pub fn new(s: &str) -> Option<Self> {
36        let b = s.as_bytes();
37        if b.len() == 2 && b[0].is_ascii_alphabetic() && b[1].is_ascii_alphabetic() {
38            Some(Self([b[0].to_ascii_lowercase(), b[1].to_ascii_lowercase()]))
39        } else {
40            None
41        }
42    }
43
44    /// The lowercased two-letter code.
45    #[must_use]
46    pub fn as_str(&self) -> &str {
47        // Constructed only from ASCII letters, so this is always valid.
48        std::str::from_utf8(&self.0).unwrap_or("??")
49    }
50}
51
52impl TryFrom<String> for CountryCode {
53    type Error = String;
54    fn try_from(s: String) -> Result<Self, Self::Error> {
55        Self::new(&s).ok_or_else(|| format!("invalid country code: {s:?}"))
56    }
57}
58
59impl From<CountryCode> for String {
60    fn from(c: CountryCode) -> Self {
61        c.as_str().to_owned()
62    }
63}
64
65/// The kind of network an egress exits from.
66///
67/// A site's `ip_type` requirement is matched against this. (`Direct`
68/// isn't a kind here — the unproxied default egress is selected by an
69/// *unconstrained* policy, not by requesting a kind.)
70#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
71#[serde(rename_all = "kebab-case")]
72#[non_exhaustive]
73pub enum EgressKind {
74    /// A datacenter / hosting-provider IP (cheap, easily fingerprinted
75    /// and blocked). The default when a config entry omits `kind`.
76    #[default]
77    Datacenter,
78    /// A residential ISP IP (harder to block; what most "real users"
79    /// look like).
80    Residential,
81    /// A mobile-carrier IP (shared CGNAT ranges; highest trust on many
82    /// sites).
83    Mobile,
84    /// A Tor exit node.
85    Tor,
86}
87
88/// A configured egress (proxy) the client can route through.
89///
90/// Produced from CLI / config; the live client pairs each spec with its
91/// own HTTP client (reqwest bakes the proxy in at build time).
92/// Deserialises from the `[[egress]]` entries of a proxy-pool config
93/// file.
94#[derive(Debug, Clone, Deserialize)]
95pub struct EgressSpec {
96    /// Proxy URL — `http://`, `https://`, `socks5://`, or `socks5h://`.
97    pub url: String,
98    /// Country this egress exits from, if known.
99    #[serde(default)]
100    pub country: Option<CountryCode>,
101    /// Network kind this egress exits from (defaults to `datacenter`).
102    #[serde(default)]
103    pub kind: EgressKind,
104    /// Operator-supplied identifier for this egress — used by the web
105    /// UI's per-scan egress subset selection (and by any other call
106    /// site that needs to refer to a specific egress by stable name).
107    /// Optional: an unnamed egress still participates in policy-based
108    /// matching, it just can't be selected by name.
109    #[serde(default)]
110    pub name: Option<String>,
111}
112
113/// What a site needs from its egress. The default (empty) means "no
114/// special routing" — the request uses the client's default egress.
115///
116/// Two flavours of geo constraint co-exist:
117///
118/// - [`geo`](Self::geo) — **hard**. A site that won't answer from
119///   anywhere else (e.g. a country-locked profile). No matching egress
120///   in the pool → `Uncertain(GeoUnavailable)`, never a false `NotFound`.
121/// - [`prefer_geo`](Self::prefer_geo) — **soft**. A site that *prefers*
122///   a local egress (better recall, less aggressive bot filtering) but
123///   still works from anywhere. No matching egress → fall back to the
124///   default egress and probe normally. Auto-populated at registry-load
125///   time from `region:XX` tags when the site doesn't already declare
126///   a hard `geo` constraint.
127#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
128pub struct AccessPolicy {
129    /// Require an egress in one of these countries.
130    #[serde(default, skip_serializing_if = "Vec::is_empty")]
131    pub geo: Vec<CountryCode>,
132    /// Prefer an egress in one of these countries — fall back to the
133    /// default if the pool has no match. Soft counterpart to
134    /// [`AccessPolicy::geo`].
135    #[serde(default, skip_serializing_if = "Vec::is_empty")]
136    pub prefer_geo: Vec<CountryCode>,
137    /// Require an egress of this network kind.
138    #[serde(default, skip_serializing_if = "Option::is_none")]
139    pub ip_type: Option<EgressKind>,
140    /// Name of an operator-supplied session (see `--sessions`) whose
141    /// headers (cookies / auth tokens) this site's probes must carry.
142    /// The site is unreachable without it, so a missing session yields
143    /// `Uncertain(SessionRequired)` rather than a login-wall false
144    /// `NotFound`.
145    #[serde(default, skip_serializing_if = "Option::is_none")]
146    pub session: Option<String>,
147}
148
149impl AccessPolicy {
150    /// True when the policy imposes no constraint at all (the common
151    /// case). Drives `skip_serializing_if` so existing `sites.json`
152    /// entries serialise unchanged.
153    #[must_use]
154    pub fn is_default(&self) -> bool {
155        self.geo.is_empty()
156            && self.prefer_geo.is_empty()
157            && self.ip_type.is_none()
158            && self.session.is_none()
159    }
160}
161
/// An authenticated session supplied by the operator for a site: a set
/// of HTTP headers (typically `Cookie`, sometimes `Authorization` /
/// CSRF tokens) that is applied to probes for sites whose
/// `access.session` names it.
///
/// This is "use a real account", not evasion — the operator brings a
/// session they're entitled to. Header *values* are secrets: the type
/// derives no `Serialize`, and its `Debug` output shows header names
/// only.
#[derive(Clone, Default)]
pub struct Session {
    /// Header name → value. Private so the values can't be read back
    /// out through the public API.
    headers: BTreeMap<String, String>,
}
173
174impl Session {
175    /// Build a session from plain header name→value pairs (e.g. parsed
176    /// from a `--sessions` config file).
177    #[must_use]
178    pub fn from_headers(headers: BTreeMap<String, String>) -> Self {
179        Self { headers }
180    }
181
182    /// Merge this session's headers over `base` (the session wins on
183    /// conflict), producing the header set for the outgoing request.
184    pub(crate) fn apply(&self, base: &BTreeMap<String, String>) -> BTreeMap<String, String> {
185        let mut out = base.clone();
186        for (k, v) in &self.headers {
187            out.insert(k.clone(), v.clone());
188        }
189        out
190    }
191}
192
193impl fmt::Debug for Session {
194    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
195        // Redact values — session headers carry cookies / tokens.
196        f.debug_struct("Session")
197            .field("headers", &self.headers.keys().collect::<Vec<_>>())
198            .finish_non_exhaustive()
199    }
200}
201
202/// Named-session store, indexed by the name a site references via
203/// `access.session`. Empty by default → a no-op.
204#[derive(Clone, Default, Debug)]
205pub struct SessionStore {
206    sessions: HashMap<String, Session>,
207}
208
209impl SessionStore {
210    /// An empty store.
211    #[must_use]
212    pub fn new() -> Self {
213        Self::default()
214    }
215
216    /// Insert (or replace) a named session.
217    pub fn insert(&mut self, name: impl Into<String>, session: Session) {
218        self.sessions.insert(name.into(), session);
219    }
220
221    /// True when no session is configured.
222    #[must_use]
223    pub fn is_empty(&self) -> bool {
224        self.sessions.is_empty()
225    }
226
227    /// Number of configured sessions.
228    #[must_use]
229    pub fn len(&self) -> usize {
230        self.sessions.len()
231    }
232
233    pub(crate) fn get(&self, name: &str) -> Option<&Session> {
234        self.sessions.get(name)
235    }
236
237    /// True when a named session is configured.
238    #[must_use]
239    pub fn contains(&self, name: &str) -> bool {
240        self.sessions.contains_key(name)
241    }
242
243    /// Names of the configured sessions, sorted lexicographically for a
244    /// stable display order. Values stay private — by design the public
245    /// surface only ever leaks the keys an operator referenced via
246    /// `access.session`, never the cookie/token bytes themselves.
247    #[must_use]
248    pub fn names(&self) -> Vec<String> {
249        let mut names: Vec<String> = self.sessions.keys().cloned().collect();
250        names.sort();
251        names
252    }
253}
254
255/// Read-only metadata for one configured egress, surfaced via
256/// [`Client::egress_summary`](crate::Client::egress_summary).
257///
258/// Carries only the match-relevant facets (name + country + kind); the
259/// proxy URL is *deliberately omitted* — those typically embed
260/// credentials (`socks5://user:pass@host:1080`) that have no business
261/// landing in a JSON response served to a browser.
262#[derive(Debug, Clone, Serialize)]
263pub struct EgressSummary {
264    /// Operator-supplied name, if any. Used by per-scan egress subset
265    /// selection (`POST /api/scan` with `egress_names`).
266    #[serde(skip_serializing_if = "Option::is_none")]
267    pub name: Option<String>,
268    /// Country this egress exits from, if declared.
269    #[serde(skip_serializing_if = "Option::is_none")]
270    pub country: Option<CountryCode>,
271    /// Network kind (`datacenter` / `residential` / `mobile` / `tor`).
272    pub kind: EgressKind,
273}
274
275/// One built egress: its match metadata plus the HTTP client that
276/// routes through it.
277struct EgressEntry {
278    name: Option<String>,
279    country: Option<CountryCode>,
280    kind: EgressKind,
281    fetcher: Arc<HttpFetcher>,
282}
283
284/// Runtime pool of built egresses. Empty by default → every site uses
285/// the client's default egress, so an empty pool is a no-op.
286pub(crate) struct EgressPool {
287    entries: Vec<EgressEntry>,
288}
289
290/// Result of matching a site's [`AccessPolicy`] against the pool.
291pub(crate) enum EgressChoice {
292    /// Unconstrained policy → use the client's default egress.
293    Default,
294    /// Route through this egress's HTTP client.
295    Use(Arc<HttpFetcher>),
296    /// Constrained policy with no matching egress → honest
297    /// `Uncertain(GeoUnavailable)` rather than a false `NotFound`.
298    Unavailable,
299}
300
301/// Constructor tuple for [`EgressPool`]: one row per configured proxy
302/// carries its operator-supplied `name` (if any), its country and
303/// kind, and the already-built `reqwest`-backed fetcher.
304pub(crate) type EgressEntryTuple = (
305    Option<String>,
306    Option<CountryCode>,
307    EgressKind,
308    Arc<HttpFetcher>,
309);
310
311impl EgressPool {
312    pub(crate) fn new(entries: Vec<EgressEntryTuple>) -> Self {
313        Self {
314            entries: entries
315                .into_iter()
316                .map(|(name, country, kind, fetcher)| EgressEntry {
317                    name,
318                    country,
319                    kind,
320                    fetcher,
321                })
322                .collect(),
323        }
324    }
325
326    /// Read-only view of the pool — `(name, country, kind)` for every
327    /// configured egress, in the order they were registered. Used by the
328    /// `GET /api/access` endpoint so the SPA can show what's configured
329    /// without ever touching proxy URLs.
330    pub(crate) fn summary(&self) -> Vec<EgressSummary> {
331        self.entries
332            .iter()
333            .map(|e| EgressSummary {
334                name: e.name.clone(),
335                country: e.country.clone(),
336                kind: e.kind,
337            })
338            .collect()
339    }
340
341    /// Return a new pool containing only entries whose `name` matches
342    /// one of `names`. Entries without a name are excluded (they can't
343    /// be referenced by name). `names` being empty is treated as "no
344    /// filter" and a clone of the full pool is returned — that
345    /// preserves the policy-driven default for callers who didn't ask
346    /// for an explicit subset.
347    pub(crate) fn subset(&self, names: &[String]) -> Self {
348        if names.is_empty() {
349            return Self {
350                entries: self
351                    .entries
352                    .iter()
353                    .map(|e| EgressEntry {
354                        name: e.name.clone(),
355                        country: e.country.clone(),
356                        kind: e.kind,
357                        fetcher: Arc::clone(&e.fetcher),
358                    })
359                    .collect(),
360            };
361        }
362        let wanted: std::collections::HashSet<&str> = names.iter().map(String::as_str).collect();
363        Self {
364            entries: self
365                .entries
366                .iter()
367                .filter(|e| e.name.as_deref().is_some_and(|n| wanted.contains(n)))
368                .map(|e| EgressEntry {
369                    name: e.name.clone(),
370                    country: e.country.clone(),
371                    kind: e.kind,
372                    fetcher: Arc::clone(&e.fetcher),
373                })
374                .collect(),
375        }
376    }
377
378    /// Names of egresses configured in this pool, in registration
379    /// order. Used by the server to validate `egress_names` on
380    /// `POST /api/scan`.
381    pub(crate) fn names(&self) -> Vec<String> {
382        self.entries.iter().filter_map(|e| e.name.clone()).collect()
383    }
384
385    /// Pick an egress for `policy`. Three outcomes:
386    ///
387    /// - Unconstrained policy (no hard `geo`, no `prefer_geo`, no
388    ///   `ip_type`) → [`EgressChoice::Default`].
389    /// - Hard constraint with no match → [`EgressChoice::Unavailable`].
390    /// - Soft `prefer_geo` with no match → falls back to
391    ///   [`EgressChoice::Default`] (the probe still happens, just via
392    ///   the unproxied / default egress).
393    pub(crate) fn select(&self, policy: &AccessPolicy) -> EgressChoice {
394        // Session-only policy (no geo / no ip_type / no prefer_geo) →
395        // default egress.
396        if policy.geo.is_empty() && policy.prefer_geo.is_empty() && policy.ip_type.is_none() {
397            return EgressChoice::Default;
398        }
399
400        // Hard path: explicit `geo` (and optional `ip_type`) — when
401        // present, this is authoritative and prefer_geo is ignored.
402        if !policy.geo.is_empty() {
403            return self
404                .pick_matching(&policy.geo, policy.ip_type)
405                .map_or(EgressChoice::Unavailable, EgressChoice::Use);
406        }
407
408        // Soft path: only `prefer_geo` (and optional `ip_type`). Match
409        // → route through it; no match → fall back to the default
410        // egress rather than emit Unavailable. The site is *expected*
411        // to be reachable from anywhere; the egress preference is a
412        // recall optimisation, not a correctness constraint.
413        if !policy.prefer_geo.is_empty() {
414            return self
415                .pick_matching(&policy.prefer_geo, policy.ip_type)
416                .map_or(EgressChoice::Default, EgressChoice::Use);
417        }
418
419        // Only `ip_type` constrained — keep the hard semantics: a site
420        // that asks for a residential IP and the pool has none is
421        // Unavailable, not silently downgraded to datacenter.
422        self.pick_matching(&[], policy.ip_type)
423            .map_or(EgressChoice::Unavailable, EgressChoice::Use)
424    }
425
426    /// Internal: pick a random matching entry for the given geo and
427    /// optional `ip_type`. `geo` empty means "any country". Returns
428    /// `None` when nothing fits.
429    fn pick_matching(
430        &self,
431        geo: &[CountryCode],
432        ip_type: Option<EgressKind>,
433    ) -> Option<Arc<HttpFetcher>> {
434        let matches: Vec<&EgressEntry> = self
435            .entries
436            .iter()
437            .filter(|e| {
438                let geo_ok = geo.is_empty() || e.country.as_ref().is_some_and(|c| geo.contains(c));
439                let kind_ok = ip_type.is_none_or(|k| e.kind == k);
440                geo_ok && kind_ok
441            })
442            .collect();
443        match matches.len() {
444            0 => None,
445            n => Some(Arc::clone(&matches[fastrand::usize(0..n)].fetcher)),
446        }
447    }
448}
449
#[cfg(test)]
mod tests {
    use super::*;
    use crate::transport::HttpFetcher;

    /// Parse a country code that is known to be valid.
    fn cc(s: &str) -> CountryCode {
        CountryCode::new(s).expect("valid country code")
    }

    /// A fetcher that is never actually used to send anything.
    fn dummy_fetcher() -> Arc<HttpFetcher> {
        Arc::new(HttpFetcher::new(reqwest::Client::new()))
    }

    /// An unnamed pool row with a throwaway fetcher.
    fn row(country: Option<&str>, kind: EgressKind) -> EgressEntryTuple {
        (None, country.map(cc), kind, dummy_fetcher())
    }

    /// Standard fixture: a PL residential and a DE datacenter egress.
    fn pool() -> EgressPool {
        EgressPool::new(vec![
            row(Some("pl"), EgressKind::Residential),
            row(Some("de"), EgressKind::Datacenter),
        ])
    }

    /// Policy with only a hard geo constraint.
    fn hard_geo(code: &str) -> AccessPolicy {
        AccessPolicy {
            geo: vec![cc(code)],
            ..AccessPolicy::default()
        }
    }

    /// Policy with only a soft geo preference.
    fn soft_geo(code: &str) -> AccessPolicy {
        AccessPolicy {
            prefer_geo: vec![cc(code)],
            ..AccessPolicy::default()
        }
    }

    /// Policy with only a network-kind requirement.
    fn kind_only(kind: EgressKind) -> AccessPolicy {
        AccessPolicy {
            ip_type: Some(kind),
            ..AccessPolicy::default()
        }
    }

    #[test]
    fn country_code_normalises_and_rejects() {
        assert_eq!(CountryCode::new("PL").unwrap().as_str(), "pl");
        for bad in ["p", "pol", "p1"] {
            assert!(CountryCode::new(bad).is_none());
        }
    }

    #[test]
    fn unconstrained_policy_uses_default_egress() {
        let unconstrained = AccessPolicy::default();
        assert!(matches!(
            pool().select(&unconstrained),
            EgressChoice::Default
        ));
    }

    #[test]
    fn geo_match_picks_an_egress() {
        assert!(matches!(
            pool().select(&hard_geo("pl")),
            EgressChoice::Use(_)
        ));
    }

    #[test]
    fn ip_type_match_picks_an_egress() {
        let policy = kind_only(EgressKind::Datacenter);
        assert!(matches!(pool().select(&policy), EgressChoice::Use(_)));
    }

    #[test]
    fn geo_present_but_wrong_kind_is_unavailable() {
        // The pool's only PL egress is Residential, so a request for a
        // PL *Mobile* egress has to fail instead of falling back.
        let policy = AccessPolicy {
            ip_type: Some(EgressKind::Mobile),
            ..hard_geo("pl")
        };
        assert!(matches!(pool().select(&policy), EgressChoice::Unavailable));
    }

    #[test]
    fn unknown_geo_is_unavailable() {
        assert!(matches!(
            pool().select(&hard_geo("jp")),
            EgressChoice::Unavailable
        ));
    }

    #[test]
    fn empty_pool_with_constraint_is_unavailable() {
        let no_egresses = EgressPool::new(Vec::new());
        assert!(matches!(
            no_egresses.select(&hard_geo("pl")),
            EgressChoice::Unavailable
        ));
    }

    #[test]
    fn soft_prefer_match_routes_through_it() {
        // The pool contains a PL residential egress, so a PL
        // preference is honoured.
        assert!(matches!(
            pool().select(&soft_geo("pl")),
            EgressChoice::Use(_)
        ));
    }

    #[test]
    fn soft_prefer_no_match_falls_back_to_default() {
        // No JP egress exists, and the preference is soft: the outcome
        // is Default, NOT Unavailable. The probe still goes out via the
        // default egress, because the site is reachable from anywhere
        // and the preference was only a recall optimisation.
        assert!(matches!(
            pool().select(&soft_geo("jp")),
            EgressChoice::Default
        ));
    }

    #[test]
    fn hard_geo_wins_over_soft_prefer() {
        // With both set, hard `geo` is authoritative and prefer_geo is
        // ignored: the pool can satisfy the JP preference but not the
        // hard PL requirement, so the result is still Unavailable.
        let jp_only = EgressPool::new(vec![row(Some("jp"), EgressKind::Datacenter)]);
        let policy = AccessPolicy {
            prefer_geo: vec![cc("jp")],
            ..hard_geo("pl")
        };
        assert!(matches!(
            jp_only.select(&policy),
            EgressChoice::Unavailable
        ));
    }

    #[test]
    fn ip_type_only_is_still_hard() {
        // Only geo is softened (via prefer_geo); a kind requirement
        // stays load-bearing, so asking for residential from a
        // datacenter-only pool is Unavailable.
        let dc_only = EgressPool::new(vec![row(None, EgressKind::Datacenter)]);
        let policy = kind_only(EgressKind::Residential);
        assert!(matches!(dc_only.select(&policy), EgressChoice::Unavailable));
    }

    #[test]
    fn session_apply_overrides_base_headers() {
        let base: BTreeMap<String, String> = [("X-IG-App-ID", "936"), ("Cookie", "old")]
            .into_iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect();
        let session_headers: BTreeMap<String, String> =
            std::iter::once(("Cookie".to_string(), "sessionid=real".to_string())).collect();
        let merged = Session::from_headers(session_headers).apply(&base);
        // The session's value replaces the conflicting one; the
        // unrelated base header is carried over.
        assert_eq!(merged.get("Cookie").unwrap(), "sessionid=real");
        assert_eq!(merged.get("X-IG-App-ID").unwrap(), "936");
    }

    #[test]
    fn session_store_insert_and_lookup() {
        let mut store = SessionStore::new();
        assert!(store.is_empty());

        store.insert("ig", Session::from_headers(BTreeMap::new()));
        assert!(!store.is_empty());

        // Present name.
        assert!(store.contains("ig"));
        assert!(store.get("ig").is_some());

        // Absent name.
        assert!(!store.contains("missing"));
        assert!(store.get("missing").is_none());
    }
}
625}