Skip to main content

px_cloudflare/
lib.rs

1//! Cloudflare challenge handler.
2//!
3//! Detects Cloudflare interstitials in fetched HTML and, when constructed
4//! with a CF-bypass harvester (e.g. [`px-camoufox::CamoufoxPool`]), re-harvests
5//! the URL to recover `cf_clearance` / `__cf_bm` plus any downstream PX cookies
6//! that the same fetch happens to set.
7//!
8//! See ADR-0015 (handler stub) and ADR-0020 (Camoufox path).
9
10use async_trait::async_trait;
11use px_core::{CookieJarDelta, NamedCookie};
12use px_errors::AppError;
13use px_harvester::{HarvestRequest, Harvester};
14use px_pipeline::{ChallengeHandler, HandlerMetrics, HandlerOutcome, PageHtml};
15use std::sync::Arc;
16use std::time::Instant;
17
18mod cookie_extractor;
19pub use cookie_extractor::{extract_session_cookies, is_session_cookie};
20
21pub struct CloudflareHandler {
22    harvester: Option<Arc<dyn Harvester>>,
23}
24
25impl CloudflareHandler {
26    pub fn new() -> Self {
27        Self { harvester: None }
28    }
29
30    pub fn with_harvester(harvester: Arc<dyn Harvester>) -> Self {
31        Self {
32            harvester: Some(harvester),
33        }
34    }
35}
36
37impl Default for CloudflareHandler {
38    fn default() -> Self {
39        Self::new()
40    }
41}
42
43#[async_trait]
44impl ChallengeHandler for CloudflareHandler {
45    fn name(&self) -> &'static str {
46        "cloudflare"
47    }
48
49    async fn detects(&self, page: &PageHtml) -> Result<bool, AppError> {
50        let h = &page.html;
51        Ok(h.contains("cdn-cgi/challenge-platform")
52            || h.contains("cf-mitigated")
53            || h.contains("cf_clearance"))
54    }
55
56    async fn solve(&self, page: &PageHtml) -> Result<HandlerOutcome, AppError> {
57        let Some(harvester) = self.harvester.as_ref() else {
58            return Ok(HandlerOutcome::not_implemented(self.name()));
59        };
60        let start = Instant::now();
61        let result = harvester.harvest(HarvestRequest::new(&page.url)).await?;
62        let session_cookies: Vec<NamedCookie> = extract_session_cookies(&result.cookies)
63            .into_iter()
64            .map(|c| NamedCookie {
65                name: c.name,
66                value: c.value,
67                domain: c.domain,
68                path: c.path,
69            })
70            .collect();
71        let delta = CookieJarDelta {
72            set: session_cookies,
73            removed: Vec::new(),
74        };
75        let metrics = HandlerMetrics {
76            detect_us: 0,
77            solve_ms: start.elapsed().as_millis() as u64,
78            bytes_read: result.html.len() as u64,
79        };
80        Ok(HandlerOutcome::solved_with_ua(
81            self.name(),
82            delta,
83            Vec::new(),
84            metrics,
85            result.user_agent,
86        ))
87    }
88}
89
// Tests may panic freely: a failed `expect`/`unwrap` is the test failure.
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use px_harvester::{HarvestResult, HarvestedCookie};
    use px_pipeline::HandlerStatus;

    /// Harvester double that returns the same canned result for every request.
    struct FakeHarvester {
        ua: String,
        cookies: Vec<HarvestedCookie>,
        html: String,
    }

    #[async_trait]
    impl Harvester for FakeHarvester {
        async fn harvest(&self, _req: HarvestRequest) -> Result<HarvestResult, AppError> {
            let canned = HarvestResult {
                html: self.html.clone(),
                user_agent: self.ua.clone(),
                cookies: self.cookies.clone(),
            };
            Ok(canned)
        }
    }

    /// Builds a cookie with the given name and fixed placeholder attributes.
    fn cookie(name: &str) -> HarvestedCookie {
        HarvestedCookie {
            name: name.into(),
            value: "v".into(),
            domain: "x.com".into(),
            path: "/".into(),
        }
    }

    #[tokio::test]
    async fn solve_without_harvester_is_not_implemented() {
        let handler = CloudflareHandler::new();
        let blank_page = PageHtml::new("https://x.com", "");

        let outcome = handler.solve(&blank_page).await.expect("solve");

        assert_eq!(outcome.status, HandlerStatus::NotImplemented);
    }

    #[tokio::test]
    async fn solve_with_harvester_returns_session_cookies_and_ua() {
        let harvested = vec![
            cookie("cf_clearance"),
            cookie("__cf_bm"),
            cookie("_px3"),
            cookie("_pxhd"),
            cookie("unrelated_session"),
        ];
        let fake = Arc::new(FakeHarvester {
            ua: "Mozilla/5.0 Camoufox".into(),
            cookies: harvested,
            html: "real page".into(),
        });
        let handler = CloudflareHandler::with_harvester(fake);
        let challenged = PageHtml::new("https://x.com", "<challenge>");

        let outcome = handler.solve(&challenged).await.expect("solve");

        assert_eq!(outcome.status, HandlerStatus::Solved);
        assert_eq!(outcome.user_agent.as_deref(), Some("Mozilla/5.0 Camoufox"));

        let names: Vec<&str> = outcome
            .cookies
            .set
            .iter()
            .map(|c| c.name.as_str())
            .collect();
        // Cloudflare and PX cookies must survive the filter...
        for expected in ["cf_clearance", "__cf_bm", "_px3", "_pxhd"] {
            assert!(names.contains(&expected));
        }
        // ...while cookies outside that set are dropped.
        assert!(!names.contains(&"unrelated_session"));
    }
}