// adler_core/check.rs

1//! Verdict types produced when a site is probed.
2
3use std::collections::BTreeMap;
4use std::fmt;
5
6use serde::{Deserialize, Serialize};
7
8use crate::confidence::{ConfidenceScore, ConfidenceSignals};
9use crate::profile::{ProfileEvidence, ProfileEvidenceKind};
10
11/// Outcome of a single site probe.
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum MatchKind {
15    /// The account exists on this site.
16    Found,
17    /// The account does not exist on this site.
18    NotFound,
19    /// The response was inconclusive (network error, unexpected status,
20    /// ambiguous content). Reported separately so the user can review them
21    /// rather than silently dropping signal.
22    Uncertain,
23}
24
25impl MatchKind {
26    /// True if the verdict represents a positive (existing) account.
27    pub const fn is_found(self) -> bool {
28        matches!(self, Self::Found)
29    }
30}
31
32/// Why a probe was inconclusive.
33///
34/// `Uncertain` outcomes carry a typed reason rather than a free-form string,
35/// so logic that reacts to specific cases (e.g. retry on a transient ban)
36/// matches an enum variant instead of a fragile string. The [`fmt::Display`]
37/// rendering is what the CLI prints; serialization is the externally-tagged
38/// default (unit variants → a `snake_case` string, detail-carrying variants →
39/// `{ "network": "…" }`).
40#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
41#[serde(rename_all = "snake_case")]
42pub enum UncertainReason {
43    /// HTTP 429, or 503 with a `Retry-After` header.
44    RateLimited,
45    /// A Cloudflare interstitial / "checking your browser" page.
46    CloudflareChallenge,
47    /// A captcha gate.
48    Captcha,
49    /// The path is disallowed by the host's `robots.txt` (`--respect-robots`).
50    RobotsDisallowed,
51    /// The scan deadline elapsed before this site finished.
52    Deadline,
53    /// The executor's scheduler was closed (does not happen in practice).
54    SchedulerClosed,
55    /// A transport/network error while issuing the request.
56    Network(String),
57    /// An error reading the response body.
58    BodyRead(String),
59    /// A `bot-protected` site needed the browser backend but the per-scan
60    /// `--browser-budget` cap was already spent on earlier sites.
61    BrowserBudget,
62    /// The username doesn't satisfy the site's `regex_check`
63    /// (e.g. too short, contains forbidden characters). Reported
64    /// without issuing any HTTP request — saves both network and the
65    /// false-positive class where the site 404s on illegal usernames
66    /// in ways our signal can't tell apart from a missing account.
67    UsernameNotAllowed,
68    /// The browser backend itself failed (timeout, navigation error,
69    /// session drop, …) for a `bot-protected` site.
70    BrowserFailed(String),
71    /// The site's [`AccessPolicy`](crate::AccessPolicy) requires an
72    /// egress (country / IP type) that no configured proxy in the pool
73    /// satisfies, so the probe was skipped rather than fetched from the
74    /// wrong location. "Couldn't reach from the required geo" is not
75    /// "account absent" — hence `Uncertain`, never `NotFound`.
76    GeoUnavailable,
77    /// The site's [`AccessPolicy`](crate::AccessPolicy) names a session
78    /// (`access.session`) that wasn't supplied, so the probe was skipped
79    /// rather than sent unauthenticated into a login wall — which reads
80    /// the same for an existing and a missing account.
81    SessionRequired,
82    /// Any other reason (e.g. a `doctor` pre-flight skip).
83    Other(String),
84}
85
86impl fmt::Display for UncertainReason {
87    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
88        match self {
89            Self::RateLimited => f.write_str("rate_limited"),
90            Self::CloudflareChallenge => f.write_str("cloudflare_challenge"),
91            Self::Captcha => f.write_str("captcha"),
92            Self::RobotsDisallowed => f.write_str("robots_disallowed"),
93            Self::Deadline => f.write_str("deadline reached"),
94            Self::SchedulerClosed => f.write_str("scheduler closed"),
95            Self::Network(detail) => write!(f, "request: {detail}"),
96            Self::BodyRead(detail) => write!(f, "body read: {detail}"),
97            Self::BrowserBudget => f.write_str("browser_budget_exceeded"),
98            Self::UsernameNotAllowed => f.write_str("username_not_allowed"),
99            Self::BrowserFailed(detail) => write!(f, "browser: {detail}"),
100            Self::GeoUnavailable => f.write_str("geo_unavailable"),
101            Self::SessionRequired => f.write_str("session_required"),
102            Self::Other(detail) => f.write_str(detail),
103        }
104    }
105}
106
107/// Result of probing a single site for a username.
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct CheckOutcome {
110    /// Site name (matches `Site::name`).
111    pub site: String,
112    /// Concrete URL that was requested.
113    pub url: String,
114    /// Verdict produced by the site's detection strategy.
115    pub kind: MatchKind,
116    /// Why the outcome is `Uncertain`, if it is. `None` for `Found` /
117    /// `NotFound`.
118    #[serde(default, skip_serializing_if = "Option::is_none")]
119    pub reason: Option<UncertainReason>,
120    /// Wall-clock duration of the probe.
121    pub elapsed_ms: u64,
122    /// Fields extracted from a `Found` profile when `--enrich` is active
123    /// (e.g. `name`, `bio`, `avatar`). Empty unless enrichment ran and the
124    /// site has extractor rules. Ordered by field name.
125    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
126    pub enrichment: BTreeMap<String, String>,
127    /// Human-readable descriptions of the signals that produced the verdict —
128    /// e.g. `"HTTP 404 (status_not_found)"`. Empty for `Uncertain` (no signal
129    /// fired). Surfaced by `--explain`; always present in JSON output.
130    #[serde(default, skip_serializing_if = "Vec::is_empty")]
131    pub evidence: Vec<String>,
132    /// Normalized profile facts collected from extraction/enrichment. This is
133    /// distinct from the legacy `evidence` field above: `evidence` explains
134    /// the detection signal, while `profile_evidence` is structured product
135    /// data for confidence scoring, identity clustering, timelines, and
136    /// reports.
137    #[serde(default, skip_serializing_if = "Vec::is_empty")]
138    pub profile_evidence: Vec<ProfileEvidence>,
139    /// Explainable confidence in this per-site verdict.
140    #[serde(default)]
141    pub confidence: ConfidenceScore,
142    /// Which transport produced this outcome (HTTP / impersonate / browser).
143    /// `None` only on outcomes from older persisted scans saved before this
144    /// field existed; live scans always populate it.
145    #[serde(default, skip_serializing_if = "Option::is_none")]
146    pub transport: Option<crate::escalation::TransportTier>,
147    /// Number of *automatic* escalations to a heavier transport beyond the
148    /// site's primary route — usually 0, at most 1 today (HTTP / impersonate
149    /// → browser on `Uncertain(CloudflareChallenge | RateLimited)`).
150    /// Stamped so the doctor can spot sites where the primary route
151    /// systematically fails and the registry should pre-tag them.
152    #[serde(default, skip_serializing_if = "is_zero_u8")]
153    pub escalations: u8,
154}
155
156impl CheckOutcome {
157    /// Recompute confidence after callers attach signal or profile evidence.
158    pub fn refresh_confidence(&mut self) {
159        self.refresh_confidence_with_history(0);
160    }
161
162    /// Recompute confidence with a derived historical-consistency overlay.
163    ///
164    /// Live scan paths should call [`Self::refresh_confidence`] so outcomes
165    /// remain stateless. Persisted/history views can call this method after
166    /// computing a non-persisted history count.
167    pub fn refresh_confidence_with_history(&mut self, historical_consistency_count: usize) {
168        let access_paths = self
169            .profile_evidence
170            .iter()
171            .filter_map(|evidence| evidence.source.access_path.as_ref());
172        let authenticated_access = access_paths.clone().any(|path| path.authenticated);
173        let metadata_transport = access_paths.clone().map(|path| path.transport).next();
174        let metadata_escalated = access_paths.clone().any(|path| path.escalated);
175        let username_evidence_count = self
176            .profile_evidence
177            .iter()
178            .filter(|evidence| evidence.kind == ProfileEvidenceKind::Username)
179            .count();
180        let non_metadata_evidence_count = self
181            .profile_evidence
182            .iter()
183            .filter(|evidence| {
184                matches!(
185                    evidence.kind,
186                    ProfileEvidenceKind::Username | ProfileEvidenceKind::AvatarHash
187                )
188            })
189            .count();
190        let profile_evidence_count = self
191            .profile_evidence
192            .len()
193            .saturating_sub(non_metadata_evidence_count);
194        self.confidence = ConfidenceScore::from_signals(&ConfidenceSignals {
195            kind: self.kind,
196            reason: self.reason.clone(),
197            signal_evidence_count: self.evidence.len(),
198            profile_evidence_count,
199            username_evidence_count,
200            historical_consistency_count,
201            authenticated_access,
202            transport: metadata_transport.or(self.transport),
203            escalations: if metadata_escalated && self.escalations == 0 {
204                1
205            } else {
206                self.escalations
207            },
208        });
209    }
210}
211
/// Reports whether `n` is zero.
///
/// Used as the serde `skip_serializing_if` predicate for
/// `CheckOutcome::escalations`, so a zero count is left out of the output.
// serde calls `skip_serializing_if` predicates with a reference to the field,
// so the parameter has to be `&u8` even though `u8` is `Copy`; that is why the
// pass-by-reference lint is silenced here.
#[allow(clippy::trivially_copy_pass_by_ref)]
const fn is_zero_u8(n: &u8) -> bool {
    *n == 0
}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a GitHub outcome for `alice` with the given verdict, reason and
    /// duration; every other field is empty / `None` / zero / default.
    fn github_outcome(
        kind: MatchKind,
        reason: Option<UncertainReason>,
        elapsed_ms: u64,
    ) -> CheckOutcome {
        CheckOutcome {
            site: "GitHub".into(),
            url: "https://github.com/alice".into(),
            kind,
            reason,
            elapsed_ms,
            enrichment: BTreeMap::new(),
            evidence: Vec::new(),
            profile_evidence: Vec::new(),
            confidence: ConfidenceScore::default(),
            transport: None,
            escalations: 0,
        }
    }

    #[test]
    fn match_kind_serialises_snake_case() {
        let cases = [
            (MatchKind::Found, "\"found\""),
            (MatchKind::NotFound, "\"not_found\""),
            (MatchKind::Uncertain, "\"uncertain\""),
        ];
        for (kind, expected) in cases {
            assert_eq!(serde_json::to_string(&kind).unwrap(), expected);
        }
    }

    #[test]
    fn match_kind_is_found() {
        assert!(MatchKind::Found.is_found());
        for negative in [MatchKind::NotFound, MatchKind::Uncertain] {
            assert!(!negative.is_found());
        }
    }

    #[test]
    fn outcome_skips_absent_reason() {
        let json = serde_json::to_string(&github_outcome(MatchKind::Found, None, 42)).unwrap();

        // Keys that must not appear, each with its failure message.
        let omitted = [
            ("reason", "reason field must be omitted when None"),
            ("enrichment", "enrichment must be omitted when empty"),
            ("transport", "transport must be omitted when None"),
            ("escalations", "escalations must be omitted when zero"),
        ];
        for (key, message) in omitted {
            assert!(!json.contains(key), "{message}");
        }

        // Fragments that must appear.
        for present in ["\"kind\":\"found\"", "\"elapsed_ms\":42"] {
            assert!(json.contains(present));
        }
    }

    #[test]
    fn unit_reason_serialises_as_snake_case_string() {
        let outcome = github_outcome(
            MatchKind::Uncertain,
            Some(UncertainReason::RateLimited),
            5_000,
        );
        let json = serde_json::to_string(&outcome).unwrap();
        assert!(json.contains("\"reason\":\"rate_limited\""), "{json}");
    }

    #[test]
    fn detail_reason_serialises_as_tagged_object() {
        let reason = UncertainReason::Network("refused".into());
        let json = serde_json::to_string(&reason).unwrap();
        assert_eq!(json, "{\"network\":\"refused\"}");
    }

    #[test]
    fn reason_display_matches_legacy_note_text() {
        let cases = [
            (UncertainReason::RateLimited, "rate_limited"),
            (UncertainReason::Deadline, "deadline reached"),
            (UncertainReason::Network("boom".into()), "request: boom"),
        ];
        for (reason, expected) in cases {
            assert_eq!(reason.to_string(), expected);
        }
    }

    #[test]
    fn old_outcome_json_defaults_confidence_and_profile_evidence() {
        // A payload with only the always-present fields.
        let json = r#"{
            "site": "GitHub",
            "url": "https://github.com/alice",
            "kind": "found",
            "elapsed_ms": 42
        }"#;
        let mut outcome: CheckOutcome = serde_json::from_str(json).unwrap();
        assert!(outcome.profile_evidence.is_empty());
        assert_eq!(outcome.confidence, ConfidenceScore::default());

        outcome.refresh_confidence();
        assert_eq!(outcome.confidence.score, 65);
    }

    #[test]
    fn avatar_hash_evidence_does_not_count_as_profile_metadata() {
        let avatar = ProfileEvidence::from_avatar_hash(
            "Example",
            "https://example.com/alice",
            "dhash64_v1:0123456789abcdef",
            Some(100),
            None,
        );
        let mut outcome = CheckOutcome {
            site: "Example".to_owned(),
            url: "https://example.com/alice".to_owned(),
            evidence: vec!["HTTP 200 (status_found)".to_owned()],
            profile_evidence: vec![avatar],
            transport: Some(crate::TransportTier::Http),
            ..github_outcome(MatchKind::Found, None, 10)
        };

        outcome.refresh_confidence();

        assert_eq!(outcome.confidence.score, 70);

        let reasons = &outcome.confidence.reasons;
        assert!(reasons.contains(&crate::ConfidenceReason::WeakStatusOnly));
        assert!(!reasons.iter().any(|reason| {
            matches!(
                reason,
                crate::ConfidenceReason::ProfileMetadataExtracted { .. }
            )
        }));
    }
}