Skip to main content

adler_core/
telemetry.rs

1//! Cross-scan telemetry analysis — close the doctor loop on sites
2//! that consistently need the browser backend.
3//!
4//! The router stamps every [`CheckOutcome`](crate::CheckOutcome) with the
5//! transport that produced its verdict (HTTP / impersonate / browser) and
6//! an escalation count. A site that *consistently* requires escalation
7//! across many scans is one the registry hasn't yet tagged with the right
8//! [`protection`](crate::ProtectionKind) hint — every fresh scan pays the
9//! cost of a failing HTTP probe before the router gives up and re-tries
10//! through the browser. Pre-tagging the site lets the router skip the
11//! cheap path next time.
12//!
13//! This module is the pure analytics: given a stream of per-scan outcome
14//! slices (read from `$XDG_CACHE_HOME/adler/scans/*.json` in
15//! `adler-cli`, but the input shape is unopinionated), it groups by
16//! site, decides which sites meet the "consistently escalates"
17//! threshold, and emits [`EscalationFinding`]s that
18//! `adler --doctor --suggest-protection` prints as paste-ready
19//! suggestions.
20
21use crate::check::{CheckOutcome, MatchKind, UncertainReason};
22use crate::escalation::TransportTier;
23use crate::site::ProtectionKind;
24use std::collections::HashMap;
25
/// Escalation ratio at or above which `--suggest-protection` reports a
/// site by default.
///
/// A site whose scans needed escalation 60% of the time is treated as a
/// load-bearing pattern rather than an intermittent edge case.
pub const DEFAULT_THRESHOLD_RATIO: f32 = 0.6;
30
/// Smallest number of scans a site must appear in before it is eligible
/// for a suggestion by default.
///
/// Three scans is the smallest sample in which a repeated result reads
/// as a pattern rather than a coincidence.
pub const DEFAULT_MIN_SCANS: u32 = 3;
35
36/// Per-site evidence drawn from a cross-scan outcome history.
37///
38/// Ready to be turned into a `protection: <kind>` suggestion.
39#[derive(Debug, Clone, PartialEq, Eq)]
40pub struct EscalationFinding {
41    /// Site name as it appears in [`Site::name`](crate::Site::name).
42    pub site: String,
43    /// Distinct scans in which the site appeared in the input.
44    pub scans_seen: u32,
45    /// Of those, how many produced *evidence* of needing the
46    /// browser: either a successful escalation (`transport=Browser`,
47    /// `escalations>=1`) or an `Uncertain` outcome whose reason
48    /// would have triggered escalation if a browser had been
49    /// configured (`CloudflareChallenge`, `RateLimited`).
50    pub escalation_evidence: u32,
51    /// Most common evidence type observed for this site — used to
52    /// pick the suggested [`ProtectionKind`].
53    pub dominant_reason: EvidenceKind,
54    /// Suggested addition to the site's [`protection`](crate::Site::protection)
55    /// vector. Always populated when the finding is emitted.
56    pub suggested_protection: ProtectionKind,
57}
58
59impl EscalationFinding {
60    /// Ratio of scans where the site needed (or would have needed) escalation.
61    /// Always in `[0.0, 1.0]`.
62    #[must_use]
63    pub fn ratio(&self) -> f32 {
64        if self.scans_seen == 0 {
65            0.0
66        } else {
67            f32::from(u16::try_from(self.escalation_evidence).unwrap_or(u16::MAX))
68                / f32::from(u16::try_from(self.scans_seen).unwrap_or(u16::MAX))
69        }
70    }
71}
72
/// The category of cross-scan evidence behind a finding.
///
/// Each variant corresponds to one of the [`UncertainReason`] values
/// that drive escalation. There is no separate variant for outcomes
/// where escalation already happened and the browser produced the
/// verdict: those are counted as [`EvidenceKind::CloudflareChallenge`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum EvidenceKind {
    /// An HTTP-path `Uncertain(cloudflare_challenge)` outcome, or an
    /// outcome the browser resolved after escalation.
    CloudflareChallenge,
    /// An HTTP-path `Uncertain(rate_limited)` outcome.
    RateLimited,
}
88
89impl EvidenceKind {
90    /// Map evidence to the [`ProtectionKind`] suggestion. Both
91    /// existing kinds point at `Cloudflare` today — even rate-limit
92    /// edges most commonly sit behind Cloudflare's WAF — but the
93    /// mapping is enum-keyed so a future split (e.g. `DdosGuard` /
94    /// `CfFirewall`) can be added without touching callers.
95    #[must_use]
96    pub const fn suggested_protection(self) -> ProtectionKind {
97        match self {
98            Self::CloudflareChallenge | Self::RateLimited => ProtectionKind::Cloudflare,
99        }
100    }
101}
102
/// Running per-site counters accumulated while walking the scan history.
#[derive(Default, Debug)]
struct SiteTally {
    /// Observations of the site recorded so far.
    scans_seen: u32,
    /// Observations classified as [`EvidenceKind::CloudflareChallenge`].
    cloudflare_evidence: u32,
    /// Observations classified as [`EvidenceKind::RateLimited`].
    ratelimit_evidence: u32,
}
110
111impl SiteTally {
112    fn total_evidence(&self) -> u32 {
113        self.cloudflare_evidence + self.ratelimit_evidence
114    }
115
116    fn dominant(&self) -> Option<EvidenceKind> {
117        if self.total_evidence() == 0 {
118            return None;
119        }
120        if self.cloudflare_evidence >= self.ratelimit_evidence {
121            Some(EvidenceKind::CloudflareChallenge)
122        } else {
123            Some(EvidenceKind::RateLimited)
124        }
125    }
126}
127
128/// Classify one outcome's evidence contribution.
129///
130/// Returns:
131/// - `Some(EvidenceKind)` when the outcome shows the site needed (or
132///   would have needed) escalation — either a successful escalation
133///   to the browser (`transport=Browser && escalations>=1`) or a
134///   cheap-path `Uncertain` with a should-escalate reason.
135/// - `None` otherwise — the outcome doesn't argue for pre-tagging.
136fn classify(outcome: &CheckOutcome) -> Option<EvidenceKind> {
137    if matches!(outcome.transport, Some(TransportTier::Browser)) && outcome.escalations >= 1 {
138        // Escalation already happened, browser produced the final
139        // verdict. The original reason was either CloudflareChallenge
140        // or RateLimited (only ones `should_escalate` accepts);
141        // CloudflareChallenge is the conservative default since most
142        // 429s on profile-search endpoints are CF-side.
143        return Some(EvidenceKind::CloudflareChallenge);
144    }
145    if outcome.kind == MatchKind::Uncertain {
146        match outcome.reason.as_ref()? {
147            UncertainReason::CloudflareChallenge => return Some(EvidenceKind::CloudflareChallenge),
148            UncertainReason::RateLimited => return Some(EvidenceKind::RateLimited),
149            _ => {}
150        }
151    }
152    None
153}
154
155/// Aggregate per-site evidence over a series of scans, returning
156/// findings for every site that meets `threshold_ratio` and
157/// `min_scans`. Sorted by ratio descending, then by site name.
158///
159/// Pass each scan's outcomes as one slice — the analyzer counts
160/// per-site *scans* (not per-site outcomes), so a single
161/// `&[CheckOutcome]` is one observation per site, even when the
162/// caller has many slices.
163pub fn analyze_escalation_history<'a>(
164    scans: impl IntoIterator<Item = &'a [CheckOutcome]>,
165    threshold_ratio: f32,
166    min_scans: u32,
167) -> Vec<EscalationFinding> {
168    let mut tallies: HashMap<String, SiteTally> = HashMap::new();
169    for outcomes in scans {
170        for outcome in outcomes {
171            let entry = tallies.entry(outcome.site.clone()).or_default();
172            entry.scans_seen += 1;
173            match classify(outcome) {
174                Some(EvidenceKind::CloudflareChallenge) => entry.cloudflare_evidence += 1,
175                Some(EvidenceKind::RateLimited) => entry.ratelimit_evidence += 1,
176                None => {}
177            }
178        }
179    }
180
181    let mut findings: Vec<EscalationFinding> = tallies
182        .into_iter()
183        .filter_map(|(site, tally)| {
184            if tally.scans_seen < min_scans {
185                return None;
186            }
187            let dominant = tally.dominant()?;
188            let evidence = tally.total_evidence();
189            let ratio = f32::from(u16::try_from(evidence).unwrap_or(u16::MAX))
190                / f32::from(u16::try_from(tally.scans_seen).unwrap_or(u16::MAX));
191            if ratio < threshold_ratio {
192                return None;
193            }
194            Some(EscalationFinding {
195                site,
196                scans_seen: tally.scans_seen,
197                escalation_evidence: evidence,
198                dominant_reason: dominant,
199                suggested_protection: dominant.suggested_protection(),
200            })
201        })
202        .collect();
203    findings.sort_by(|a, b| {
204        b.ratio()
205            .partial_cmp(&a.ratio())
206            .unwrap_or(std::cmp::Ordering::Equal)
207            .then_with(|| a.site.cmp(&b.site))
208    });
209    findings
210}
211
#[cfg(test)]
mod tests {
    use super::*;
    use crate::check::CheckOutcome;

    /// Plain outcome with no transport stamp and no escalations.
    fn outcome(site: &str, kind: MatchKind, reason: Option<UncertainReason>) -> CheckOutcome {
        CheckOutcome {
            site: site.to_owned(),
            url: format!("https://{site}.example/foo"),
            kind,
            reason,
            elapsed_ms: 100,
            evidence: Vec::new(),
            enrichment: std::collections::BTreeMap::new(),
            transport: None,
            escalations: 0,
        }
    }

    /// `Found` verdict produced by the browser after one escalation.
    fn outcome_browser_escalated(site: &str) -> CheckOutcome {
        CheckOutcome {
            elapsed_ms: 200,
            transport: Some(TransportTier::Browser),
            escalations: 1,
            ..outcome(site, MatchKind::Found, None)
        }
    }

    /// HTTP-path `Uncertain(CloudflareChallenge)`.
    fn outcome_http_uncertain_cf(site: &str) -> CheckOutcome {
        outcome(
            site,
            MatchKind::Uncertain,
            Some(UncertainReason::CloudflareChallenge),
        )
    }

    /// HTTP-path `Uncertain(RateLimited)`.
    fn outcome_http_uncertain_rl(site: &str) -> CheckOutcome {
        outcome(
            site,
            MatchKind::Uncertain,
            Some(UncertainReason::RateLimited),
        )
    }

    /// Clean HTTP-path `Found`.
    fn outcome_http_found(site: &str) -> CheckOutcome {
        outcome(site, MatchKind::Found, None)
    }

    /// `count` scans, each holding the single outcome built by `make`.
    fn single_outcome_scans(
        count: usize,
        make: impl Fn() -> CheckOutcome,
    ) -> Vec<Vec<CheckOutcome>> {
        (0..count).map(|_| vec![make()]).collect()
    }

    /// Run the analyzer over owned scans without per-test slice plumbing.
    fn analyze(
        scans: &[Vec<CheckOutcome>],
        threshold_ratio: f32,
        min_scans: u32,
    ) -> Vec<EscalationFinding> {
        analyze_escalation_history(scans.iter().map(Vec::as_slice), threshold_ratio, min_scans)
    }

    #[test]
    fn consistent_escalation_produces_finding() {
        let scans = single_outcome_scans(5, || outcome_browser_escalated("CDNed"));
        let findings = analyze(&scans, 0.6, 3);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].site, "CDNed");
        assert_eq!(findings[0].scans_seen, 5);
        assert_eq!(findings[0].escalation_evidence, 5);
        assert!((findings[0].ratio() - 1.0).abs() < f32::EPSILON);
        assert_eq!(findings[0].suggested_protection, ProtectionKind::Cloudflare);
    }

    #[test]
    fn http_only_site_does_not_get_flagged() {
        // GitHub: every scan is a clean HTTP Found.
        let scans = single_outcome_scans(10, || outcome_http_found("GitHub"));
        let findings = analyze(&scans, 0.6, 3);
        assert!(findings.is_empty(), "{findings:?}");
    }

    #[test]
    fn intermittent_escalation_below_threshold_skipped() {
        // 2 of 10 scans escalated → 20% < 60% threshold. Not flagged.
        let mut scans = single_outcome_scans(2, || outcome_browser_escalated("FlakyEdge"));
        scans.extend(single_outcome_scans(8, || outcome_http_found("FlakyEdge")));
        let findings = analyze(&scans, 0.6, 3);
        assert!(findings.is_empty(), "{findings:?}");
    }

    #[test]
    fn too_few_scans_skipped_even_at_full_ratio() {
        // 2 of 2 escalated but min_scans=3 → not enough sample.
        let scans = single_outcome_scans(2, || outcome_browser_escalated("RareSite"));
        let findings = analyze(&scans, 0.6, 3);
        assert!(findings.is_empty(), "{findings:?}");
    }

    #[test]
    fn http_uncertain_with_should_escalate_reason_counts_too() {
        // No browser configured this run, but the HTTP probe still
        // returned a should-escalate reason — that's evidence
        // pre-tagging would have helped.
        let scans = single_outcome_scans(4, || outcome_http_uncertain_cf("WalledOff"));
        let findings = analyze(&scans, 0.6, 3);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].site, "WalledOff");
        assert_eq!(
            findings[0].dominant_reason,
            EvidenceKind::CloudflareChallenge
        );
    }

    #[test]
    fn dominant_reason_picks_higher_count() {
        // 4× CloudflareChallenge, 1× RateLimited → CloudflareChallenge wins.
        let mut scans = single_outcome_scans(4, || outcome_http_uncertain_cf("Mixed"));
        scans.push(vec![outcome_http_uncertain_rl("Mixed")]);
        let findings = analyze(&scans, 0.6, 3);
        assert_eq!(findings.len(), 1);
        assert_eq!(
            findings[0].dominant_reason,
            EvidenceKind::CloudflareChallenge
        );
    }

    #[test]
    fn rate_limited_majority_sets_dominant_reason() {
        // 1× CloudflareChallenge, 3× RateLimited → RateLimited wins, and
        // the suggestion still maps to Cloudflare.
        let mut scans = single_outcome_scans(3, || outcome_http_uncertain_rl("Throttled"));
        scans.push(vec![outcome_http_uncertain_cf("Throttled")]);
        let findings = analyze(&scans, 0.6, 3);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].dominant_reason, EvidenceKind::RateLimited);
        assert_eq!(findings[0].suggested_protection, ProtectionKind::Cloudflare);
    }

    #[test]
    fn evidence_tie_prefers_cloudflare_challenge() {
        // 2× CloudflareChallenge, 2× RateLimited → tie goes to CloudflareChallenge.
        let mut scans = single_outcome_scans(2, || outcome_http_uncertain_rl("Tie"));
        scans.extend(single_outcome_scans(2, || outcome_http_uncertain_cf("Tie")));
        let findings = analyze(&scans, 0.6, 3);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].escalation_evidence, 4);
        assert_eq!(
            findings[0].dominant_reason,
            EvidenceKind::CloudflareChallenge
        );
    }

    #[test]
    fn browser_transport_without_escalation_is_not_evidence() {
        // Browser produced the verdict directly (escalations == 0), so
        // the outcome does not argue for tagging the site.
        let scans = single_outcome_scans(4, || CheckOutcome {
            transport: Some(TransportTier::Browser),
            ..outcome_http_found("PreTagged")
        });
        let findings = analyze(&scans, 0.6, 3);
        assert!(findings.is_empty(), "{findings:?}");
    }

    #[test]
    fn findings_sorted_by_ratio_then_name() {
        // Two sites: Aardvark escalates 5/5, Beaver escalates 3/5. Aardvark first.
        let scans: Vec<Vec<CheckOutcome>> = (0..5)
            .map(|scan_index| {
                let beaver = if scan_index < 3 {
                    outcome_browser_escalated("Beaver")
                } else {
                    outcome_http_found("Beaver")
                };
                vec![outcome_browser_escalated("Aardvark"), beaver]
            })
            .collect();

        let findings = analyze(&scans, 0.5, 3);
        assert_eq!(findings.len(), 2);
        assert_eq!(findings[0].site, "Aardvark");
        assert_eq!(findings[1].site, "Beaver");
        assert!(findings[0].ratio() > findings[1].ratio());
    }

    #[test]
    fn equal_ratios_fall_back_to_site_name_order() {
        // Both sites escalate 3/3, so ordering is decided by name alone.
        let scans: Vec<Vec<CheckOutcome>> = (0..3)
            .map(|_| {
                vec![
                    outcome_browser_escalated("Zebra"),
                    outcome_browser_escalated("Antelope"),
                ]
            })
            .collect();
        let findings = analyze(&scans, 0.6, 3);
        assert_eq!(findings.len(), 2);
        assert_eq!(findings[0].site, "Antelope");
        assert_eq!(findings[1].site, "Zebra");
    }

    #[test]
    fn ratio_is_zero_when_no_scans_recorded() {
        let finding = EscalationFinding {
            site: "Nowhere".to_owned(),
            scans_seen: 0,
            escalation_evidence: 0,
            dominant_reason: EvidenceKind::CloudflareChallenge,
            suggested_protection: ProtectionKind::Cloudflare,
        };
        assert!(finding.ratio().abs() < f32::EPSILON);
    }

    #[test]
    fn empty_input_returns_empty() {
        let findings: Vec<EscalationFinding> =
            analyze_escalation_history(std::iter::empty::<&[CheckOutcome]>(), 0.5, 1);
        assert!(findings.is_empty());
    }
}