adler_core/access.rs
1//! Per-site access policy and the egress (proxy) model.
2//!
3//! Access-engine phase 3: route the raw-HTTP probe path through a
4//! geo / IP-type-appropriate egress. A site declares what it needs via
5//! [`AccessPolicy`] (e.g. "only reachable from a Polish residential
6//! IP"); the client matches that against a configured pool of
7//! [`EgressSpec`]s. If the policy is unconstrained the request uses the
8//! client's default egress (direct, or the global `--proxy`); if it's
9//! constrained but nothing in the pool fits, the probe is reported as
10//! `Uncertain(GeoUnavailable)` — **never** a false `NotFound`, since
11//! "couldn't reach from the required location" is not "account absent".
12//!
13//! The browser transport keeps its backend's own egress; this phase
14//! routes the HTTP path only.
15
16use std::collections::{BTreeMap, HashMap};
17use std::fmt;
18use std::sync::Arc;
19
20use serde::{Deserialize, Serialize};
21
22use crate::transport::HttpFetcher;
23
/// ISO-3166-1 alpha-2 country code, stored lowercased (e.g. `pl`, `de`).
/// A newtype so a geo requirement can't be confused with an arbitrary
/// string and is validated at the boundary.
///
/// Serde treats the code as a plain string: deserialisation goes
/// through `TryFrom<String>` (so malformed codes are rejected rather
/// than stored) and serialisation through `Into<String>`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(try_from = "String", into = "String")]
pub struct CountryCode([u8; 2]);
30
31impl CountryCode {
32 /// Parse a two-letter code, lowercasing ASCII. `None` for anything
33 /// that isn't exactly two ASCII letters.
34 #[must_use]
35 pub fn new(s: &str) -> Option<Self> {
36 let b = s.as_bytes();
37 if b.len() == 2 && b[0].is_ascii_alphabetic() && b[1].is_ascii_alphabetic() {
38 Some(Self([b[0].to_ascii_lowercase(), b[1].to_ascii_lowercase()]))
39 } else {
40 None
41 }
42 }
43
44 /// The lowercased two-letter code.
45 #[must_use]
46 pub fn as_str(&self) -> &str {
47 // Constructed only from ASCII letters, so this is always valid.
48 std::str::from_utf8(&self.0).unwrap_or("??")
49 }
50}
51
52impl TryFrom<String> for CountryCode {
53 type Error = String;
54 fn try_from(s: String) -> Result<Self, Self::Error> {
55 Self::new(&s).ok_or_else(|| format!("invalid country code: {s:?}"))
56 }
57}
58
59impl From<CountryCode> for String {
60 fn from(c: CountryCode) -> Self {
61 c.as_str().to_owned()
62 }
63}
64
/// The kind of network an egress exits from.
///
/// A site's `ip_type` requirement is matched against this. (`Direct`
/// isn't a kind here — the unproxied default egress is selected by an
/// *unconstrained* policy, not by requesting a kind.)
///
/// Variants (de)serialise under their kebab-case names (`datacenter`,
/// `residential`, `mobile`, `tor`). The enum is `#[non_exhaustive]`, so
/// code outside this crate must keep a wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[non_exhaustive]
pub enum EgressKind {
    /// A datacenter / hosting-provider IP (cheap, easily fingerprinted
    /// and blocked). The default when a config entry omits `kind`.
    #[default]
    Datacenter,
    /// A residential ISP IP (harder to block; what most "real users"
    /// look like).
    Residential,
    /// A mobile-carrier IP (shared CGNAT ranges; highest trust on many
    /// sites).
    Mobile,
    /// A Tor exit node.
    Tor,
}
87
88/// A configured egress (proxy) the client can route through.
89///
90/// Produced from CLI / config; the live client pairs each spec with its
91/// own HTTP client (reqwest bakes the proxy in at build time).
92/// Deserialises from the `[[egress]]` entries of a proxy-pool config
93/// file.
94#[derive(Debug, Clone, Deserialize)]
95pub struct EgressSpec {
96 /// Proxy URL — `http://`, `https://`, `socks5://`, or `socks5h://`.
97 pub url: String,
98 /// Country this egress exits from, if known.
99 #[serde(default)]
100 pub country: Option<CountryCode>,
101 /// Network kind this egress exits from (defaults to `datacenter`).
102 #[serde(default)]
103 pub kind: EgressKind,
104 /// Operator-supplied identifier for this egress — used by the web
105 /// UI's per-scan egress subset selection (and by any other call
106 /// site that needs to refer to a specific egress by stable name).
107 /// Optional: an unnamed egress still participates in policy-based
108 /// matching, it just can't be selected by name.
109 #[serde(default)]
110 pub name: Option<String>,
111}
112
/// What a site needs from its egress. The default (empty) means "no
/// special routing" — the request uses the client's default egress.
///
/// Two flavours of geo constraint co-exist:
///
/// - [`geo`](Self::geo) — **hard**. A site that won't answer from
///   anywhere else (e.g. a country-locked profile). No matching egress
///   in the pool → `Uncertain(GeoUnavailable)`, never a false `NotFound`.
/// - [`prefer_geo`](Self::prefer_geo) — **soft**. A site that *prefers*
///   a local egress (better recall, less aggressive bot filtering) but
///   still works from anywhere. No matching egress → fall back to the
///   default egress and probe normally. Auto-populated at registry-load
///   time from `region:XX` tags when the site doesn't already declare
///   a hard `geo` constraint.
///
/// Every field is optional on the wire: each one defaults when absent
/// and is skipped on serialisation when empty / `None`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct AccessPolicy {
    /// Require an egress in one of these countries.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub geo: Vec<CountryCode>,
    /// Prefer an egress in one of these countries — fall back to the
    /// default if the pool has no match. Soft counterpart to
    /// [`AccessPolicy::geo`]; ignored when `geo` is also set.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub prefer_geo: Vec<CountryCode>,
    /// Require an egress of this network kind.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ip_type: Option<EgressKind>,
    /// Name of an operator-supplied session (see `--sessions`) whose
    /// headers (cookies / auth tokens) this site's probes must carry.
    /// The site is unreachable without it, so a missing session yields
    /// `Uncertain(SessionRequired)` rather than a login-wall false
    /// `NotFound`. Has no influence on which egress is selected.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub session: Option<String>,
}
148
149impl AccessPolicy {
150 /// True when the policy imposes no constraint at all (the common
151 /// case). Drives `skip_serializing_if` so existing `sites.json`
152 /// entries serialise unchanged.
153 #[must_use]
154 pub fn is_default(&self) -> bool {
155 self.geo.is_empty()
156 && self.prefer_geo.is_empty()
157 && self.ip_type.is_none()
158 && self.session.is_none()
159 }
160}
161
/// An operator-supplied authenticated session for a site: a bag of HTTP
/// headers (typically `Cookie`, sometimes `Authorization` / CSRF
/// tokens) applied to probes for sites whose `access.session` names it.
///
/// This is "use a real account", not evasion — the operator brings a
/// session they're entitled to. Header *values* are secrets: they're
/// redacted from `Debug` and are never logged or serialised.
// `Debug` is deliberately not derived — the hand-written impl prints
// header names only. The type also has no serde derives.
#[derive(Clone, Default)]
pub struct Session {
    // Header name → header value. Private so values can only leave
    // through the crate-internal merge into an outgoing request.
    headers: BTreeMap<String, String>,
}
173
174impl Session {
175 /// Build a session from plain header name→value pairs (e.g. parsed
176 /// from a `--sessions` config file).
177 #[must_use]
178 pub fn from_headers(headers: BTreeMap<String, String>) -> Self {
179 Self { headers }
180 }
181
182 /// Merge this session's headers over `base` (the session wins on
183 /// conflict), producing the header set for the outgoing request.
184 pub(crate) fn apply(&self, base: &BTreeMap<String, String>) -> BTreeMap<String, String> {
185 let mut out = base.clone();
186 for (k, v) in &self.headers {
187 out.insert(k.clone(), v.clone());
188 }
189 out
190 }
191}
192
193impl fmt::Debug for Session {
194 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
195 // Redact values — session headers carry cookies / tokens.
196 f.debug_struct("Session")
197 .field("headers", &self.headers.keys().collect::<Vec<_>>())
198 .finish_non_exhaustive()
199 }
200}
201
/// Named-session store, indexed by the name a site references via
/// `access.session`. Empty by default → a no-op.
///
/// Deriving `Debug` here is safe because [`Session`]'s own `Debug`
/// prints header names only.
#[derive(Clone, Default, Debug)]
pub struct SessionStore {
    // Session name → session. A `HashMap`, so iteration order is
    // unspecified; `names()` sorts before returning.
    sessions: HashMap<String, Session>,
}
208
209impl SessionStore {
210 /// An empty store.
211 #[must_use]
212 pub fn new() -> Self {
213 Self::default()
214 }
215
216 /// Insert (or replace) a named session.
217 pub fn insert(&mut self, name: impl Into<String>, session: Session) {
218 self.sessions.insert(name.into(), session);
219 }
220
221 /// True when no session is configured.
222 #[must_use]
223 pub fn is_empty(&self) -> bool {
224 self.sessions.is_empty()
225 }
226
227 /// Number of configured sessions.
228 #[must_use]
229 pub fn len(&self) -> usize {
230 self.sessions.len()
231 }
232
233 pub(crate) fn get(&self, name: &str) -> Option<&Session> {
234 self.sessions.get(name)
235 }
236
237 /// Names of the configured sessions, sorted lexicographically for a
238 /// stable display order. Values stay private — by design the public
239 /// surface only ever leaks the keys an operator referenced via
240 /// `access.session`, never the cookie/token bytes themselves.
241 #[must_use]
242 pub fn names(&self) -> Vec<String> {
243 let mut names: Vec<String> = self.sessions.keys().cloned().collect();
244 names.sort();
245 names
246 }
247}
248
/// Read-only metadata for one configured egress, surfaced via
/// [`Client::egress_summary`](crate::Client::egress_summary).
///
/// Carries only the match-relevant facets (name + country + kind); the
/// proxy URL is *deliberately omitted* — those typically embed
/// credentials (`socks5://user:pass@host:1080`) that have no business
/// landing in a JSON response served to a browser.
///
/// Serialise-only: `name` and `country` are left out of the output
/// entirely when `None`, while `kind` is always present.
#[derive(Debug, Clone, Serialize)]
pub struct EgressSummary {
    /// Operator-supplied name, if any. Used by per-scan egress subset
    /// selection (`POST /api/scan` with `egress_names`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Country this egress exits from, if declared.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub country: Option<CountryCode>,
    /// Network kind (`datacenter` / `residential` / `mobile` / `tor`).
    pub kind: EgressKind,
}
268
/// One built egress: its match metadata plus the HTTP client that
/// routes through it.
struct EgressEntry {
    // Operator-supplied name; `None` means the entry cannot be picked
    // by name (it is filtered out of any non-empty `subset`).
    name: Option<String>,
    // Exit country, if declared. An entry without one never satisfies a
    // non-empty geo list.
    country: Option<CountryCode>,
    // Network kind compared against a policy's `ip_type`.
    kind: EgressKind,
    // Shared handle to the fetcher for this egress; cloning the `Arc`
    // is how pool subsets and selections hand it out.
    fetcher: Arc<HttpFetcher>,
}
277
/// Runtime pool of built egresses. Empty by default → every site uses
/// the client's default egress, so an empty pool is a no-op.
pub(crate) struct EgressPool {
    // Entries in registration order; `summary()` and `names()` report
    // them in this order.
    entries: Vec<EgressEntry>,
}
283
/// Result of matching a site's [`AccessPolicy`] against the pool.
pub(crate) enum EgressChoice {
    /// Use the client's default egress. Chosen for a policy with no
    /// routing constraint, and also as the fallback when a soft
    /// `prefer_geo` finds no matching egress.
    Default,
    /// Route through this egress's HTTP client.
    Use(Arc<HttpFetcher>),
    /// Hard-constrained policy with no matching egress → honest
    /// `Uncertain(GeoUnavailable)` rather than a false `NotFound`.
    Unavailable,
}
294
/// Constructor tuple for [`EgressPool`]: one row per configured proxy
/// carries its operator-supplied `name` (if any), its country and
/// kind, and the already-built `reqwest`-backed fetcher.
pub(crate) type EgressEntryTuple = (
    // Operator-supplied name, if any.
    Option<String>,
    // Exit country, if declared.
    Option<CountryCode>,
    // Network kind.
    EgressKind,
    // Fetcher that routes through this egress.
    Arc<HttpFetcher>,
);
304
305impl EgressPool {
306 pub(crate) fn new(entries: Vec<EgressEntryTuple>) -> Self {
307 Self {
308 entries: entries
309 .into_iter()
310 .map(|(name, country, kind, fetcher)| EgressEntry {
311 name,
312 country,
313 kind,
314 fetcher,
315 })
316 .collect(),
317 }
318 }
319
320 /// Read-only view of the pool — `(name, country, kind)` for every
321 /// configured egress, in the order they were registered. Used by the
322 /// `GET /api/access` endpoint so the SPA can show what's configured
323 /// without ever touching proxy URLs.
324 pub(crate) fn summary(&self) -> Vec<EgressSummary> {
325 self.entries
326 .iter()
327 .map(|e| EgressSummary {
328 name: e.name.clone(),
329 country: e.country.clone(),
330 kind: e.kind,
331 })
332 .collect()
333 }
334
335 /// Return a new pool containing only entries whose `name` matches
336 /// one of `names`. Entries without a name are excluded (they can't
337 /// be referenced by name). `names` being empty is treated as "no
338 /// filter" and a clone of the full pool is returned — that
339 /// preserves the policy-driven default for callers who didn't ask
340 /// for an explicit subset.
341 pub(crate) fn subset(&self, names: &[String]) -> Self {
342 if names.is_empty() {
343 return Self {
344 entries: self
345 .entries
346 .iter()
347 .map(|e| EgressEntry {
348 name: e.name.clone(),
349 country: e.country.clone(),
350 kind: e.kind,
351 fetcher: Arc::clone(&e.fetcher),
352 })
353 .collect(),
354 };
355 }
356 let wanted: std::collections::HashSet<&str> = names.iter().map(String::as_str).collect();
357 Self {
358 entries: self
359 .entries
360 .iter()
361 .filter(|e| e.name.as_deref().is_some_and(|n| wanted.contains(n)))
362 .map(|e| EgressEntry {
363 name: e.name.clone(),
364 country: e.country.clone(),
365 kind: e.kind,
366 fetcher: Arc::clone(&e.fetcher),
367 })
368 .collect(),
369 }
370 }
371
372 /// Names of egresses configured in this pool, in registration
373 /// order. Used by the server to validate `egress_names` on
374 /// `POST /api/scan`.
375 pub(crate) fn names(&self) -> Vec<String> {
376 self.entries.iter().filter_map(|e| e.name.clone()).collect()
377 }
378
379 /// Pick an egress for `policy`. Three outcomes:
380 ///
381 /// - Unconstrained policy (no hard `geo`, no `prefer_geo`, no
382 /// `ip_type`) → [`EgressChoice::Default`].
383 /// - Hard constraint with no match → [`EgressChoice::Unavailable`].
384 /// - Soft `prefer_geo` with no match → falls back to
385 /// [`EgressChoice::Default`] (the probe still happens, just via
386 /// the unproxied / default egress).
387 pub(crate) fn select(&self, policy: &AccessPolicy) -> EgressChoice {
388 // Session-only policy (no geo / no ip_type / no prefer_geo) →
389 // default egress.
390 if policy.geo.is_empty() && policy.prefer_geo.is_empty() && policy.ip_type.is_none() {
391 return EgressChoice::Default;
392 }
393
394 // Hard path: explicit `geo` (and optional `ip_type`) — when
395 // present, this is authoritative and prefer_geo is ignored.
396 if !policy.geo.is_empty() {
397 return self
398 .pick_matching(&policy.geo, policy.ip_type)
399 .map_or(EgressChoice::Unavailable, EgressChoice::Use);
400 }
401
402 // Soft path: only `prefer_geo` (and optional `ip_type`). Match
403 // → route through it; no match → fall back to the default
404 // egress rather than emit Unavailable. The site is *expected*
405 // to be reachable from anywhere; the egress preference is a
406 // recall optimisation, not a correctness constraint.
407 if !policy.prefer_geo.is_empty() {
408 return self
409 .pick_matching(&policy.prefer_geo, policy.ip_type)
410 .map_or(EgressChoice::Default, EgressChoice::Use);
411 }
412
413 // Only `ip_type` constrained — keep the hard semantics: a site
414 // that asks for a residential IP and the pool has none is
415 // Unavailable, not silently downgraded to datacenter.
416 self.pick_matching(&[], policy.ip_type)
417 .map_or(EgressChoice::Unavailable, EgressChoice::Use)
418 }
419
420 /// Internal: pick a random matching entry for the given geo and
421 /// optional `ip_type`. `geo` empty means "any country". Returns
422 /// `None` when nothing fits.
423 fn pick_matching(
424 &self,
425 geo: &[CountryCode],
426 ip_type: Option<EgressKind>,
427 ) -> Option<Arc<HttpFetcher>> {
428 let matches: Vec<&EgressEntry> = self
429 .entries
430 .iter()
431 .filter(|e| {
432 let geo_ok = geo.is_empty() || e.country.as_ref().is_some_and(|c| geo.contains(c));
433 let kind_ok = ip_type.is_none_or(|k| e.kind == k);
434 geo_ok && kind_ok
435 })
436 .collect();
437 match matches.len() {
438 0 => None,
439 n => Some(Arc::clone(&matches[fastrand::usize(0..n)].fetcher)),
440 }
441 }
442}
443
#[cfg(test)]
mod tests {
    use super::*;
    use crate::transport::HttpFetcher;

    /// Shorthand for a country code the test knows to be valid.
    fn code(s: &str) -> CountryCode {
        CountryCode::new(s).expect("valid country code")
    }

    /// A fetcher that is never asked to fetch anything — selection
    /// tests only care about which entry gets picked.
    fn stub_fetcher() -> Arc<HttpFetcher> {
        Arc::new(HttpFetcher::new(reqwest::Client::new()))
    }

    /// An unnamed pool row with the given (optional) country and kind.
    fn unnamed(country: Option<&str>, kind: EgressKind) -> EgressEntryTuple {
        (None, country.map(code), kind, stub_fetcher())
    }

    /// Two-entry pool: PL residential + DE datacenter.
    fn sample_pool() -> EgressPool {
        EgressPool::new(vec![
            unnamed(Some("pl"), EgressKind::Residential),
            unnamed(Some("de"), EgressKind::Datacenter),
        ])
    }

    /// Policy with only a hard geo requirement.
    fn hard_geo(country: &str) -> AccessPolicy {
        AccessPolicy {
            geo: vec![code(country)],
            ..AccessPolicy::default()
        }
    }

    /// Policy with only a soft geo preference.
    fn soft_geo(country: &str) -> AccessPolicy {
        AccessPolicy {
            prefer_geo: vec![code(country)],
            ..AccessPolicy::default()
        }
    }

    /// Policy with only an IP-kind requirement.
    fn kind_only(kind: EgressKind) -> AccessPolicy {
        AccessPolicy {
            ip_type: Some(kind),
            ..AccessPolicy::default()
        }
    }

    #[test]
    fn country_code_normalises_and_rejects() {
        let upper = CountryCode::new("PL").unwrap();
        assert_eq!(upper.as_str(), "pl");
        for bad in &["p", "pol", "p1"] {
            assert!(CountryCode::new(bad).is_none());
        }
    }

    #[test]
    fn unconstrained_policy_uses_default_egress() {
        let unconstrained = AccessPolicy::default();
        assert!(matches!(
            sample_pool().select(&unconstrained),
            EgressChoice::Default
        ));
    }

    #[test]
    fn geo_match_picks_an_egress() {
        let choice = sample_pool().select(&hard_geo("pl"));
        assert!(matches!(choice, EgressChoice::Use(_)));
    }

    #[test]
    fn ip_type_match_picks_an_egress() {
        let choice = sample_pool().select(&kind_only(EgressKind::Datacenter));
        assert!(matches!(choice, EgressChoice::Use(_)));
    }

    #[test]
    fn geo_present_but_wrong_kind_is_unavailable() {
        // The pool's only PL egress is Residential, so a request for a
        // PL *Mobile* egress has to fail instead of falling back.
        let policy = AccessPolicy {
            ip_type: Some(EgressKind::Mobile),
            ..hard_geo("pl")
        };
        let choice = sample_pool().select(&policy);
        assert!(matches!(choice, EgressChoice::Unavailable));
    }

    #[test]
    fn unknown_geo_is_unavailable() {
        let choice = sample_pool().select(&hard_geo("jp"));
        assert!(matches!(choice, EgressChoice::Unavailable));
    }

    #[test]
    fn empty_pool_with_constraint_is_unavailable() {
        let no_egresses = EgressPool::new(Vec::new());
        let choice = no_egresses.select(&hard_geo("pl"));
        assert!(matches!(choice, EgressChoice::Unavailable));
    }

    #[test]
    fn soft_prefer_match_routes_through_it() {
        // The pool has a PL residential egress, so preferring PL uses it.
        let choice = sample_pool().select(&soft_geo("pl"));
        assert!(matches!(choice, EgressChoice::Use(_)));
    }

    #[test]
    fn soft_prefer_no_match_falls_back_to_default() {
        // No JP egress in the pool → Default, NOT Unavailable. Soft
        // routing exists precisely so the probe still goes out via the
        // default egress: the site answers from anywhere and the
        // preference was only a recall optimisation.
        let choice = sample_pool().select(&soft_geo("jp"));
        assert!(matches!(choice, EgressChoice::Default));
    }

    #[test]
    fn hard_geo_wins_over_soft_prefer() {
        // With both set, hard `geo` is authoritative and prefer_geo is
        // ignored: the pool satisfies the JP preference but not the PL
        // requirement, so the answer is still Unavailable.
        let jp_only = EgressPool::new(vec![unnamed(Some("jp"), EgressKind::Datacenter)]);
        let policy = AccessPolicy {
            prefer_geo: vec![code("jp")],
            ..hard_geo("pl")
        };
        let choice = jp_only.select(&policy);
        assert!(matches!(choice, EgressChoice::Unavailable));
    }

    #[test]
    fn ip_type_only_is_still_hard() {
        // A residential request against a datacenter-only pool stays
        // Unavailable. Only geo is softened (via prefer_geo) — kind
        // requirements remain load-bearing.
        let dc_only = EgressPool::new(vec![unnamed(None, EgressKind::Datacenter)]);
        let choice = dc_only.select(&kind_only(EgressKind::Residential));
        assert!(matches!(choice, EgressChoice::Unavailable));
    }

    #[test]
    fn session_apply_overrides_base_headers() {
        let base: BTreeMap<String, String> = [("X-IG-App-ID", "936"), ("Cookie", "old")]
            .iter()
            .map(|(name, value)| (name.to_string(), value.to_string()))
            .collect();
        let mut session_headers = BTreeMap::new();
        session_headers.insert("Cookie".to_string(), "sessionid=real".to_string());

        let merged = Session::from_headers(session_headers).apply(&base);

        // The session's value replaces the base one; an unrelated base
        // header passes through untouched.
        assert_eq!(merged.get("Cookie").unwrap(), "sessionid=real");
        assert_eq!(merged.get("X-IG-App-ID").unwrap(), "936");
    }

    #[test]
    fn session_store_insert_and_lookup() {
        let mut store = SessionStore::new();
        assert!(store.is_empty());

        store.insert("ig", Session::from_headers(BTreeMap::new()));

        assert!(!store.is_empty());
        assert!(store.get("ig").is_some());
        assert!(store.get("missing").is_none());
    }
}
617}