Skip to main content

stygian_browser/
acquisition.rs

1//! Opinionated acquisition runner with deterministic escalation.
2//!
3//! The runner executes a mode-specific strategy ladder and returns a terminal
4//! [`AcquisitionResult`] for every request, including setup-failure and timeout
5//! paths.
6
7use std::sync::Arc;
8use std::time::{Duration, Instant};
9
10#[cfg(feature = "browserbase")]
11use chromiumoxide::Browser;
12#[cfg(feature = "browserbase")]
13use futures::StreamExt;
14use serde::{Deserialize, Serialize};
15use serde_json::Value;
16#[cfg(feature = "browserbase")]
17use tokio::time::timeout;
18
19use crate::BrowserPool;
20use crate::error::BrowserError;
21use crate::page::WaitUntil;
22
23/// Opinionated acquisition mode for the escalation ladder.
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
25#[serde(rename_all = "snake_case")]
26pub enum AcquisitionMode {
27    /// Prioritize lowest-latency paths.
28    Fast,
29    /// Favor reliability with broader escalation.
30    Resilient,
31    /// Start from stronger anti-bot paths.
32    Hostile,
33    /// Enter from a policy-guided start point.
34    Investigate,
35}
36
37/// Strategy stage attempted by the acquisition runner.
38#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
39#[serde(rename_all = "snake_case")]
40pub enum StrategyUsed {
41    /// Plain HTTP fetch.
42    DirectHttp,
43    /// HTTP fetch using a TLS-profiled client.
44    TlsProfiledHttp,
45    /// Browser session with opinionated light-stealth defaults.
46    BrowserLightStealth,
47    /// Browser session scoped to a sticky context id.
48    StickyProxyBrowserSession,
49    /// Managed remote browser session routed through Browserbase.
50    #[cfg(feature = "browserbase")]
51    BrowserbaseManagedSession,
52    /// Policy-guided entry marker for investigation mode.
53    InvestigateEntry,
54}
55
56/// One acquisition request.
57#[derive(Debug, Clone)]
58pub struct AcquisitionRequest {
59    /// Target URL.
60    pub url: String,
61    /// Acquisition mode.
62    pub mode: AcquisitionMode,
63    /// Optional selector that must be present for browser-stage success.
64    pub wait_for_selector: Option<String>,
65    /// Optional JavaScript extraction expression evaluated in browser stages.
66    pub extraction_js: Option<String>,
67    /// Hard wall-clock timeout for the whole acquisition attempt.
68    pub total_timeout: Duration,
69    /// Per-navigation timeout for browser stages.
70    pub navigation_timeout: Duration,
71    /// Per-request timeout for HTTP stages.
72    pub request_timeout: Duration,
73    /// Maximum HTML bytes captured into `html_excerpt`.
74    pub html_excerpt_bytes: usize,
75    /// Optional policy-guided stage that `Investigate` mode starts from.
76    pub investigate_start: Option<StrategyUsed>,
77    /// Opt into the optional Browserbase-managed stage when available.
78    pub browserbase_enabled: bool,
79}
80
81impl Default for AcquisitionRequest {
82    fn default() -> Self {
83        Self {
84            url: String::new(),
85            mode: AcquisitionMode::Resilient,
86            wait_for_selector: None,
87            extraction_js: None,
88            total_timeout: Duration::from_secs(45),
89            navigation_timeout: Duration::from_secs(30),
90            request_timeout: Duration::from_secs(15),
91            html_excerpt_bytes: 4_096,
92            investigate_start: None,
93            browserbase_enabled: false,
94        }
95    }
96}
97
98/// Failure class recorded per strategy stage.
99#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
100#[serde(rename_all = "snake_case")]
101pub enum StageFailureKind {
102    /// Stage initialization/setup failed.
103    Setup,
104    /// Stage hit a timeout.
105    Timeout,
106    /// Stage reached a known anti-bot block class.
107    Blocked,
108    /// Transport/runtime failure.
109    Transport,
110    /// Extraction/validation failure.
111    Extraction,
112}
113
114/// Captured failure record for one stage.
115#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
116pub struct StageFailure {
117    /// Stage where the failure happened.
118    pub strategy: StrategyUsed,
119    /// Coarse failure kind.
120    pub kind: StageFailureKind,
121    /// Compact diagnostic message.
122    pub message: String,
123}
124
125/// Terminal acquisition result.
126#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct AcquisitionResult {
128    /// `true` when any stage satisfied success criteria.
129    pub success: bool,
130    /// Stage that produced the terminal success, if any.
131    pub strategy_used: Option<StrategyUsed>,
132    /// Ordered stage attempts.
133    pub attempted: Vec<StrategyUsed>,
134    /// Final URL observed from the successful stage.
135    pub final_url: Option<String>,
136    /// HTTP status code observed from the successful stage.
137    pub status_code: Option<u16>,
138    /// Best-effort HTML excerpt from the successful stage.
139    pub html_excerpt: Option<String>,
140    /// Optional extraction payload.
141    pub extracted: Option<Value>,
142    /// Failure bundle collected across stages.
143    pub failures: Vec<StageFailure>,
144    /// `true` when the wall-clock timeout fired before completion.
145    pub timed_out: bool,
146}
147
148impl AcquisitionResult {
149    const fn empty() -> Self {
150        Self {
151            success: false,
152            strategy_used: None,
153            attempted: Vec::new(),
154            final_url: None,
155            status_code: None,
156            html_excerpt: None,
157            extracted: None,
158            failures: Vec::new(),
159            timed_out: false,
160        }
161    }
162}
163
164#[derive(Debug, Clone)]
165struct StageSuccess {
166    final_url: Option<String>,
167    status_code: Option<u16>,
168    html_excerpt: Option<String>,
169    extracted: Option<Value>,
170}
171
172#[derive(Debug, Clone)]
173enum StageOutcome {
174    Marker,
175    Success(StageSuccess),
176    Failure(StageFailure),
177}
178
179/// Runner facade for opinionated acquisition.
180#[derive(Clone)]
181pub struct AcquisitionRunner {
182    pool: Arc<BrowserPool>,
183}
184
185impl AcquisitionRunner {
186    /// Create a new acquisition runner.
187    ///
188    /// # Example
189    ///
190    /// ```no_run
191    /// use stygian_browser::{AcquisitionRunner, BrowserConfig, BrowserPool};
192    ///
193    /// # async fn run() -> stygian_browser::Result<()> {
194    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
195    /// let _runner = AcquisitionRunner::new(pool);
196    /// # Ok(())
197    /// # }
198    /// ```
199    #[must_use]
200    pub const fn new(pool: Arc<BrowserPool>) -> Self {
201        Self { pool }
202    }
203
204    /// Return the deterministic stage ladder for a mode.
205    ///
206    /// Investigation mode starts at `investigate_start` when provided.
207    #[must_use]
208    pub fn strategy_ladder(
209        mode: AcquisitionMode,
210        investigate_start: Option<StrategyUsed>,
211    ) -> Vec<StrategyUsed> {
212        let mut stages = match mode {
213            AcquisitionMode::Fast => vec![
214                StrategyUsed::DirectHttp,
215                StrategyUsed::TlsProfiledHttp,
216                StrategyUsed::BrowserLightStealth,
217            ],
218            AcquisitionMode::Resilient => vec![
219                StrategyUsed::DirectHttp,
220                StrategyUsed::TlsProfiledHttp,
221                StrategyUsed::BrowserLightStealth,
222                StrategyUsed::StickyProxyBrowserSession,
223            ],
224            AcquisitionMode::Hostile => vec![
225                StrategyUsed::BrowserLightStealth,
226                StrategyUsed::StickyProxyBrowserSession,
227                StrategyUsed::TlsProfiledHttp,
228                StrategyUsed::DirectHttp,
229            ],
230            AcquisitionMode::Investigate => {
231                let start = investigate_start.unwrap_or(StrategyUsed::BrowserLightStealth);
232                vec![
233                    StrategyUsed::InvestigateEntry,
234                    start,
235                    StrategyUsed::StickyProxyBrowserSession,
236                    StrategyUsed::TlsProfiledHttp,
237                ]
238            }
239        };
240
241        dedupe_preserve_order(&mut stages);
242        stages
243    }
244
245    /// Execute the acquisition ladder and return a terminal result.
246    ///
247    /// This method never panics and always returns an [`AcquisitionResult`],
248    /// including timeout and setup-failure paths.
249    ///
250    /// # Example
251    ///
252    /// ```no_run
253    /// use stygian_browser::{AcquisitionMode, AcquisitionRequest, AcquisitionRunner, BrowserConfig, BrowserPool};
254    ///
255    /// # async fn run() -> stygian_browser::Result<()> {
256    /// let pool = BrowserPool::new(BrowserConfig::default()).await?;
257    /// let runner = AcquisitionRunner::new(pool);
258    /// let request = AcquisitionRequest {
259    ///     url: "https://example.com".to_string(),
260    ///     mode: AcquisitionMode::Resilient,
261    ///     ..AcquisitionRequest::default()
262    /// };
263    /// let _result = runner.run(request).await;
264    /// # Ok(())
265    /// # }
266    /// ```
267    pub async fn run(&self, request: AcquisitionRequest) -> AcquisitionResult {
268        let timeout = request.total_timeout;
269        let timeout_strategy = Self::strategy_ladder(request.mode, request.investigate_start)
270            .into_iter()
271            .find(|strategy| *strategy != StrategyUsed::InvestigateEntry)
272            .unwrap_or(StrategyUsed::DirectHttp);
273        let mut result = tokio::time::timeout(timeout, self.run_inner(&request))
274            .await
275            .unwrap_or_else(|_| {
276                let mut timed_out = AcquisitionResult::empty();
277                timed_out.timed_out = true;
278                timed_out.failures.push(StageFailure {
279                    strategy: timeout_strategy,
280                    kind: StageFailureKind::Timeout,
281                    message: format!("acquisition timed out after {}ms", timeout.as_millis()),
282                });
283                timed_out
284            });
285
286        if !result.success {
287            // Guarantee deterministic terminal output for all unsuccessful runs.
288            if result.failures.is_empty() {
289                result.failures.push(StageFailure {
290                    strategy: timeout_strategy,
291                    kind: StageFailureKind::Transport,
292                    message: "acquisition ended without stage output".to_string(),
293                });
294            }
295        }
296
297        result
298    }
299
300    async fn run_inner(&self, request: &AcquisitionRequest) -> AcquisitionResult {
301        let mut result = AcquisitionResult::empty();
302
303        #[cfg(feature = "browserbase")]
304        let mut ladder = Self::strategy_ladder(request.mode, request.investigate_start);
305
306        #[cfg(not(feature = "browserbase"))]
307        let ladder = Self::strategy_ladder(request.mode, request.investigate_start);
308
309        #[cfg(feature = "browserbase")]
310        {
311            maybe_insert_browserbase_stage(&mut ladder, request.browserbase_enabled);
312        }
313        let started = Instant::now();
314
315        for strategy in ladder {
316            if started.elapsed() >= request.total_timeout {
317                result.timed_out = true;
318                result.failures.push(StageFailure {
319                    strategy,
320                    kind: StageFailureKind::Timeout,
321                    message: "wall-clock timeout reached before stage execution".to_string(),
322                });
323                break;
324            }
325
326            result.attempted.push(strategy);
327            match self.execute_stage(strategy, request).await {
328                StageOutcome::Marker => {}
329                StageOutcome::Success(success) => {
330                    result.success = true;
331                    result.strategy_used = Some(strategy);
332                    result.final_url = success.final_url;
333                    result.status_code = success.status_code;
334                    result.html_excerpt = success.html_excerpt;
335                    result.extracted = success.extracted;
336                    break;
337                }
338                StageOutcome::Failure(failure) => result.failures.push(failure),
339            }
340        }
341
342        result
343    }
344
345    async fn execute_stage(
346        &self,
347        strategy: StrategyUsed,
348        request: &AcquisitionRequest,
349    ) -> StageOutcome {
350        match strategy {
351            StrategyUsed::DirectHttp => {
352                #[cfg(feature = "tls-config")]
353                {
354                    self.run_http_stage(request, false).await
355                }
356
357                #[cfg(not(feature = "tls-config"))]
358                {
359                    self.run_http_stage(request, false)
360                }
361            }
362            StrategyUsed::TlsProfiledHttp => {
363                #[cfg(feature = "tls-config")]
364                {
365                    self.run_http_stage(request, true).await
366                }
367
368                #[cfg(not(feature = "tls-config"))]
369                {
370                    self.run_http_stage(request, true)
371                }
372            }
373            StrategyUsed::BrowserLightStealth => self.run_browser_stage(request, false).await,
374            StrategyUsed::StickyProxyBrowserSession => self.run_browser_stage(request, true).await,
375            #[cfg(feature = "browserbase")]
376            StrategyUsed::BrowserbaseManagedSession => Self::run_browserbase_stage(request).await,
377            StrategyUsed::InvestigateEntry => StageOutcome::Marker,
378        }
379    }
380
381    #[cfg(feature = "browserbase")]
382    #[allow(clippy::too_many_lines)]
383    async fn run_browserbase_stage(request: &AcquisitionRequest) -> StageOutcome {
384        if !request.browserbase_enabled {
385            return StageOutcome::Failure(StageFailure {
386                strategy: StrategyUsed::BrowserbaseManagedSession,
387                kind: StageFailureKind::Setup,
388                message: "browserbase stage disabled for this request".to_string(),
389            });
390        }
391
392        let api_key = match std::env::var("BROWSERBASE_API_KEY") {
393            Ok(value) if !value.trim().is_empty() => value,
394            _ => {
395                return StageOutcome::Failure(StageFailure {
396                    strategy: StrategyUsed::BrowserbaseManagedSession,
397                    kind: StageFailureKind::Setup,
398                    message: "browserbase requires BROWSERBASE_API_KEY".to_string(),
399                });
400            }
401        };
402
403        let project_id = match std::env::var("BROWSERBASE_PROJECT_ID") {
404            Ok(value) if !value.trim().is_empty() => value,
405            _ => {
406                return StageOutcome::Failure(StageFailure {
407                    strategy: StrategyUsed::BrowserbaseManagedSession,
408                    kind: StageFailureKind::Setup,
409                    message: "browserbase requires BROWSERBASE_PROJECT_ID".to_string(),
410                });
411            }
412        };
413
414        let session = match create_browserbase_session(request, &api_key, &project_id).await {
415            Ok(session) => session,
416            Err(err) => {
417                return StageOutcome::Failure(StageFailure {
418                    strategy: StrategyUsed::BrowserbaseManagedSession,
419                    kind: classify_browser_error(&err),
420                    message: err.to_string(),
421                });
422            }
423        };
424
425        let connect_timeout = request.request_timeout.min(request.total_timeout);
426        let (mut browser, mut handler) = match timeout(
427            connect_timeout,
428            Browser::connect(session.connect_url.clone()),
429        )
430        .await
431        {
432            Ok(Ok(pair)) => pair,
433            Ok(Err(err)) => {
434                let _ = delete_browserbase_session(request, &api_key, &session.id).await;
435                return StageOutcome::Failure(StageFailure {
436                    strategy: StrategyUsed::BrowserbaseManagedSession,
437                    kind: StageFailureKind::Transport,
438                    message: format!("browserbase connect failed: {err}"),
439                });
440            }
441            Err(_) => {
442                let _ = delete_browserbase_session(request, &api_key, &session.id).await;
443                return StageOutcome::Failure(StageFailure {
444                    strategy: StrategyUsed::BrowserbaseManagedSession,
445                    kind: StageFailureKind::Timeout,
446                    message: format!(
447                        "browserbase connect timed out after {}ms",
448                        connect_timeout.as_millis()
449                    ),
450                });
451            }
452        };
453
454        let handler_task = tokio::spawn(async move {
455            while let Some(event) = handler.next().await {
456                if let Err(error) = event {
457                    tracing::warn!(%error, "browserbase handler error");
458                    break;
459                }
460            }
461        });
462
463        let run_result =
464            async {
465                let raw_page = browser.new_page("about:blank").await.map_err(|err| {
466                    BrowserError::CdpError {
467                        operation: "Browser.newPage".to_string(),
468                        message: err.to_string(),
469                    }
470                })?;
471
472                let mut page = crate::page::PageHandle::new(raw_page, request.navigation_timeout);
473
474                page.navigate(
475                    &request.url,
476                    WaitUntil::DomContentLoaded,
477                    request.navigation_timeout,
478                )
479                .await?;
480
481                if let Some(selector) = &request.wait_for_selector {
482                    page.wait_for_selector(selector, request.navigation_timeout)
483                        .await?;
484                }
485
486                let extracted = match request.extraction_js.as_deref() {
487                    Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
488                        BrowserError::ScriptExecutionFailed {
489                            script: script.to_string(),
490                            reason: err.to_string(),
491                        }
492                    })?),
493                    None => None,
494                };
495
496                let html = page.content().await?;
497                let final_url = page.url().await.ok();
498                let status_code = page.status_code().ok().flatten();
499
500                Ok::<StageSuccess, BrowserError>(StageSuccess {
501                    final_url,
502                    status_code,
503                    html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
504                    extracted,
505                })
506            }
507            .await;
508
509        let _ = timeout(Duration::from_secs(5), browser.close()).await;
510        handler_task.abort();
511        let _ = delete_browserbase_session(request, &api_key, &session.id).await;
512
513        match run_result {
514            Ok(success) => {
515                if is_block_status(success.status_code) {
516                    StageOutcome::Failure(StageFailure {
517                        strategy: StrategyUsed::BrowserbaseManagedSession,
518                        kind: StageFailureKind::Blocked,
519                        message: format!(
520                            "blocked status during browserbase stage: {:?}",
521                            success.status_code
522                        ),
523                    })
524                } else {
525                    StageOutcome::Success(success)
526                }
527            }
528            Err(err) => StageOutcome::Failure(StageFailure {
529                strategy: StrategyUsed::BrowserbaseManagedSession,
530                kind: classify_browser_error(&err),
531                message: err.to_string(),
532            }),
533        }
534    }
535
536    async fn run_browser_stage(&self, request: &AcquisitionRequest, sticky: bool) -> StageOutcome {
537        let strategy = if sticky {
538            StrategyUsed::StickyProxyBrowserSession
539        } else {
540            StrategyUsed::BrowserLightStealth
541        };
542
543        let handle_result = if sticky {
544            let context = host_hint(&request.url).unwrap_or_else(|| "default".to_string());
545            self.pool.acquire_for(&context).await
546        } else {
547            self.pool.acquire().await
548        };
549
550        let handle = match handle_result {
551            Ok(handle) => handle,
552            Err(err) => {
553                return StageOutcome::Failure(StageFailure {
554                    strategy,
555                    kind: StageFailureKind::Setup,
556                    message: format!("browser acquire failed: {err}"),
557                });
558            }
559        };
560
561        let page_result = async {
562            let browser = handle.browser().ok_or_else(|| {
563                BrowserError::ConfigError("browser handle already released".to_string())
564            })?;
565            let mut page = browser.new_page().await?;
566            page.navigate(
567                &request.url,
568                WaitUntil::DomContentLoaded,
569                request.navigation_timeout,
570            )
571            .await?;
572
573            if let Some(selector) = &request.wait_for_selector {
574                page.wait_for_selector(selector, request.navigation_timeout)
575                    .await?;
576            }
577
578            let extracted = match request.extraction_js.as_deref() {
579                Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
580                    BrowserError::ScriptExecutionFailed {
581                        script: script.to_string(),
582                        reason: err.to_string(),
583                    }
584                })?),
585                None => None,
586            };
587
588            let html = page.content().await?;
589            let final_url = page.url().await.ok();
590            let status_code = page.status_code().ok().flatten();
591            let html_excerpt = truncate_html(&html, request.html_excerpt_bytes);
592
593            drop(page);
594
595            Ok::<StageSuccess, BrowserError>(StageSuccess {
596                final_url,
597                status_code,
598                html_excerpt: Some(html_excerpt),
599                extracted,
600            })
601        }
602        .await;
603
604        handle.release().await;
605
606        match page_result {
607            Ok(success) => {
608                if is_block_status(success.status_code) {
609                    StageOutcome::Failure(StageFailure {
610                        strategy,
611                        kind: StageFailureKind::Blocked,
612                        message: format!(
613                            "blocked status during browser stage: {:?}",
614                            success.status_code
615                        ),
616                    })
617                } else {
618                    StageOutcome::Success(success)
619                }
620            }
621            Err(err) => StageOutcome::Failure(StageFailure {
622                strategy,
623                kind: classify_browser_error(&err),
624                message: err.to_string(),
625            }),
626        }
627    }
628
629    #[cfg(feature = "tls-config")]
630    async fn run_http_stage(
631        &self,
632        request: &AcquisitionRequest,
633        tls_profiled: bool,
634    ) -> StageOutcome {
635        if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
636            return StageOutcome::Failure(StageFailure {
637                strategy: if tls_profiled {
638                    StrategyUsed::TlsProfiledHttp
639                } else {
640                    StrategyUsed::DirectHttp
641                },
642                kind: StageFailureKind::Extraction,
643                message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
644            });
645        }
646
647        self.run_http_stage_impl(request, tls_profiled).await
648    }
649
650    #[cfg(not(feature = "tls-config"))]
651    fn run_http_stage(&self, request: &AcquisitionRequest, tls_profiled: bool) -> StageOutcome {
652        if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
653            return StageOutcome::Failure(StageFailure {
654                strategy: if tls_profiled {
655                    StrategyUsed::TlsProfiledHttp
656                } else {
657                    StrategyUsed::DirectHttp
658                },
659                kind: StageFailureKind::Extraction,
660                message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
661            });
662        }
663
664        self.run_http_stage_impl(request, tls_profiled)
665    }
666
667    #[cfg(feature = "tls-config")]
668    async fn run_http_stage_impl(
669        &self,
670        request: &AcquisitionRequest,
671        tls_profiled: bool,
672    ) -> StageOutcome {
673        use crate::tls::{CHROME_131, build_profiled_client_preset};
674
675        let strategy = if tls_profiled {
676            StrategyUsed::TlsProfiledHttp
677        } else {
678            StrategyUsed::DirectHttp
679        };
680
681        let client = if tls_profiled {
682            match build_profiled_client_preset(&CHROME_131, None) {
683                Ok(client) => client,
684                Err(err) => {
685                    return StageOutcome::Failure(StageFailure {
686                        strategy,
687                        kind: StageFailureKind::Setup,
688                        message: format!("tls-profiled client setup failed: {err}"),
689                    });
690                }
691            }
692        } else {
693            match reqwest::Client::builder()
694                .timeout(request.request_timeout)
695                .cookie_store(true)
696                .build()
697            {
698                Ok(client) => client,
699                Err(err) => {
700                    return StageOutcome::Failure(StageFailure {
701                        strategy,
702                        kind: StageFailureKind::Setup,
703                        message: format!("http client setup failed: {err}"),
704                    });
705                }
706            }
707        };
708
709        let response = match client
710            .get(&request.url)
711            .timeout(request.request_timeout)
712            .send()
713            .await
714        {
715            Ok(response) => response,
716            Err(err) => {
717                return StageOutcome::Failure(StageFailure {
718                    strategy,
719                    kind: if err.is_timeout() {
720                        StageFailureKind::Timeout
721                    } else {
722                        StageFailureKind::Transport
723                    },
724                    message: err.to_string(),
725                });
726            }
727        };
728
729        let status_code = Some(response.status().as_u16());
730        let final_url = Some(response.url().to_string());
731        let html = match response.text().await {
732            Ok(text) => text,
733            Err(err) => {
734                return StageOutcome::Failure(StageFailure {
735                    strategy,
736                    kind: StageFailureKind::Transport,
737                    message: format!("response body read failed: {err}"),
738                });
739            }
740        };
741
742        if is_block_status(status_code) {
743            return StageOutcome::Failure(StageFailure {
744                strategy,
745                kind: StageFailureKind::Blocked,
746                message: format!("blocked status from HTTP stage: {status_code:?}"),
747            });
748        }
749
750        StageOutcome::Success(StageSuccess {
751            final_url,
752            status_code,
753            html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
754            extracted: None,
755        })
756    }
757
758    #[cfg(not(feature = "tls-config"))]
759    #[expect(
760        clippy::unused_self,
761        reason = "signature must match the tls-config variant for uniform call sites"
762    )]
763    fn run_http_stage_impl(
764        &self,
765        _request: &AcquisitionRequest,
766        tls_profiled: bool,
767    ) -> StageOutcome {
768        let strategy = if tls_profiled {
769            StrategyUsed::TlsProfiledHttp
770        } else {
771            StrategyUsed::DirectHttp
772        };
773        StageOutcome::Failure(StageFailure {
774            strategy,
775            kind: StageFailureKind::Setup,
776            message: "HTTP acquisition requires the `tls-config` feature".to_string(),
777        })
778    }
779}
780
/// Handle to a remote Browserbase session returned by the create call.
#[cfg(feature = "browserbase")]
#[derive(Debug, Clone)]
struct BrowserbaseSession {
    /// Session identifier, used to delete the session afterwards.
    id: String,
    /// URL the browser client connects to.
    connect_url: String,
}
787
/// Create a Browserbase session for `project_id` and return its id and
/// connect URL.
///
/// The HTTP client uses `request.request_timeout`. The API key is sent both
/// as a bearer token and in the `x-bb-api-key` header.
///
/// # Errors
///
/// - [`BrowserError::ConfigError`] when the HTTP client cannot be built or the
///   response lacks a session id or connect URL.
/// - [`BrowserError::ConnectionError`] (carrying the sessions endpoint URL)
///   when the request fails, the API answers with a non-success status, or
///   the response body is not valid JSON.
#[cfg(feature = "browserbase")]
async fn create_browserbase_session(
    request: &AcquisitionRequest,
    api_key: &str,
    project_id: &str,
) -> Result<BrowserbaseSession, BrowserError> {
    let client = reqwest::Client::builder()
        .timeout(request.request_timeout)
        .build()
        .map_err(|err| {
            BrowserError::ConfigError(format!("browserbase client setup failed: {err}"))
        })?;

    let create_url = format!("{}/sessions", browserbase_api_base());
    let response = client
        .post(create_url.as_str())
        .bearer_auth(api_key)
        .header("x-bb-api-key", api_key)
        .json(&serde_json::json!({ "projectId": project_id }))
        .send()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: err.to_string(),
        })?;

    let status = response.status();
    if !status.is_success() {
        let body = response.text().await.unwrap_or_default();
        return Err(BrowserError::ConnectionError {
            url: create_url,
            reason: format!("session create failed ({status}): {body}"),
        });
    }

    // Report the endpoint that was actually called, like the other errors do.
    let payload: Value = response
        .json()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: format!("session create response parse failed: {err}"),
        })?;

    match (
        browserbase_connect_url(&payload),
        browserbase_session_id(&payload),
    ) {
        (Some(connect_url), Some(id)) => Ok(BrowserbaseSession { id, connect_url }),
        (None, session_id) => {
            // The session exists remotely but is unusable without a connect
            // URL; delete it (best-effort) rather than leaking it.
            if let Some(id) = session_id {
                let _ = delete_browserbase_session(request, api_key, &id).await;
            }
            Err(BrowserError::ConfigError(
                "browserbase response missing connect URL".to_string(),
            ))
        }
        (Some(_), None) => Err(BrowserError::ConfigError(
            "browserbase response missing session id".to_string(),
        )),
    }
}
843
844#[cfg(feature = "browserbase")]
845async fn delete_browserbase_session(
846    request: &AcquisitionRequest,
847    api_key: &str,
848    session_id: &str,
849) -> Result<(), BrowserError> {
850    let client = reqwest::Client::builder()
851        .timeout(request.request_timeout)
852        .build()
853        .map_err(|err| {
854            BrowserError::ConfigError(format!("browserbase client setup failed: {err}"))
855        })?;
856
857    let delete_url = format!("{}/sessions/{session_id}", browserbase_api_base());
858    let response = client
859        .delete(delete_url.clone())
860        .bearer_auth(api_key)
861        .header("x-bb-api-key", api_key)
862        .send()
863        .await
864        .map_err(|err| BrowserError::ConnectionError {
865            url: delete_url.clone(),
866            reason: err.to_string(),
867        })?;
868
869    if response.status().is_success() {
870        Ok(())
871    } else {
872        Err(BrowserError::ConnectionError {
873            url: delete_url,
874            reason: format!("session delete failed with status {}", response.status()),
875        })
876    }
877}
878
879#[cfg(feature = "browserbase")]
/// Returns the Browserbase REST API base URL without trailing slashes.
///
/// Reads the `BROWSERBASE_API_BASE` environment variable. Surrounding
/// whitespace and trailing `/` characters are stripped. When the variable is
/// unset, not valid Unicode, or empty after that normalization, the default
/// `https://api.browserbase.com/v1` is returned, so callers never build a
/// relative URL such as `/sessions`.
fn browserbase_api_base() -> String {
    const DEFAULT_API_BASE: &str = "https://api.browserbase.com/v1";

    std::env::var("BROWSERBASE_API_BASE")
        .ok()
        .map(|raw| raw.trim().trim_end_matches('/').to_string())
        // A blank override would yield endpoint URLs with no scheme or host.
        .filter(|base| !base.is_empty())
        .unwrap_or_else(|| DEFAULT_API_BASE.to_string())
}
886
887#[cfg(feature = "browserbase")]
888fn browserbase_session_id(payload: &Value) -> Option<String> {
889    payload
890        .get("id")
891        .or_else(|| payload.get("sessionId"))
892        .or_else(|| payload.get("session_id"))
893        .or_else(|| payload.get("data").and_then(|v| v.get("id")))
894        .or_else(|| payload.get("data").and_then(|v| v.get("sessionId")))
895        .or_else(|| payload.get("data").and_then(|v| v.get("session_id")))
896        .and_then(Value::as_str)
897        .map(ToString::to_string)
898}
899
900#[cfg(feature = "browserbase")]
901fn browserbase_connect_url(payload: &Value) -> Option<String> {
902    [
903        "connectUrl",
904        "connect_url",
905        "wsUrl",
906        "ws_url",
907        "websocketUrl",
908        "websocket_url",
909        "browserWSEndpoint",
910        "wsEndpoint",
911        "ws_endpoint",
912    ]
913    .iter()
914    .find_map(|key| payload.get(*key).and_then(Value::as_str))
915    .or_else(|| {
916        payload.get("data").and_then(|data| {
917            [
918                "connectUrl",
919                "connect_url",
920                "wsUrl",
921                "ws_url",
922                "websocketUrl",
923                "websocket_url",
924                "browserWSEndpoint",
925                "wsEndpoint",
926                "ws_endpoint",
927            ]
928            .iter()
929            .find_map(|key| data.get(*key).and_then(Value::as_str))
930        })
931    })
932    .map(ToString::to_string)
933}
934
935fn dedupe_preserve_order(stages: &mut Vec<StrategyUsed>) {
936    let mut seen = Vec::new();
937    stages.retain(|stage| {
938        if seen.contains(stage) {
939            false
940        } else {
941            seen.push(*stage);
942            true
943        }
944    });
945}
946
947#[cfg(feature = "browserbase")]
948fn maybe_insert_browserbase_stage(stages: &mut Vec<StrategyUsed>, enabled: bool) {
949    if !enabled || stages.contains(&StrategyUsed::BrowserbaseManagedSession) {
950        return;
951    }
952
953    if let Some(pos) = stages
954        .iter()
955        .position(|stage| *stage == StrategyUsed::StickyProxyBrowserSession)
956    {
957        stages.insert(pos, StrategyUsed::BrowserbaseManagedSession);
958    } else {
959        stages.push(StrategyUsed::BrowserbaseManagedSession);
960    }
961}
962
963fn classify_browser_error(error: &BrowserError) -> StageFailureKind {
964    match error {
965        BrowserError::Timeout { .. } => StageFailureKind::Timeout,
966        BrowserError::NavigationFailed { reason, .. } if reason.contains("selector") => {
967            StageFailureKind::Blocked
968        }
969        BrowserError::ScriptExecutionFailed { .. } => StageFailureKind::Extraction,
970        BrowserError::ConfigError(_) | BrowserError::PoolExhausted { .. } => {
971            StageFailureKind::Setup
972        }
973        BrowserError::ProxyUnavailable { .. }
974        | BrowserError::ConnectionError { .. }
975        | BrowserError::CdpError { .. }
976        | BrowserError::LaunchFailed { .. }
977        | BrowserError::NavigationFailed { .. }
978        | BrowserError::Io(_)
979        | BrowserError::StaleNode { .. } => StageFailureKind::Transport,
980        #[cfg(feature = "extract")]
981        BrowserError::ExtractionFailed(_) => StageFailureKind::Extraction,
982    }
983}
984
/// Returns `true` when `status` is one of the HTTP codes 401, 403, 407, 429 or 503.
///
/// A missing status (`None`) and every other code yield `false`.
const fn is_block_status(status: Option<u16>) -> bool {
    match status {
        Some(code) => matches!(code, 401 | 403 | 407 | 429 | 503),
        None => false,
    }
}
988
/// Returns `html` cut down to at most `max_bytes` bytes of UTF-8.
///
/// Input that already fits is returned unchanged. Otherwise the result is the
/// longest prefix that ends on a character boundary and is no longer than
/// `max_bytes`, so a multi-byte character is never split.
fn truncate_html(html: &str, max_bytes: usize) -> String {
    if html.len() <= max_bytes {
        return html.to_owned();
    }

    // Index 0 is always a character boundary, so the search cannot fail; the
    // fallback only exists to avoid an unwrap.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| html.is_char_boundary(idx))
        .unwrap_or(0);
    html[..cut].to_owned()
}
1003
/// Extracts a lowercase host name from an absolute URL string.
///
/// This is a lightweight textual split, not a full URL parser:
///
/// 1. Everything up to and including the first `://` is dropped.
/// 2. The authority ends at the first `/`, `?` or `#`, so an `@` or `:` in the
///    path, query or fragment cannot be mistaken for part of the authority.
/// 3. Userinfo (`user:pass@`) is removed.
/// 4. A bracketed IPv6 literal is returned with its brackets (`[::1]`);
///    otherwise the host is the text before the first `:` (the port separator).
///
/// Returns `None` when the input has no `://`, the host is empty, or a
/// bracketed host is empty or unterminated.
fn host_hint(url: &str) -> Option<String> {
    let (_, after_scheme) = url.split_once("://")?;
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    let host_port = authority.rsplit('@').next()?;

    let host = if host_port.starts_with('[') {
        // IPv6 literals contain ':' themselves, so split on the closing bracket.
        let close = host_port.find(']')?;
        if close == 1 {
            return None;
        }
        &host_port[..=close]
    } else {
        host_port.split(':').next()?
    };

    (!host.is_empty()).then(|| host.to_ascii_lowercase())
}
1014
#[cfg(test)]
mod tests {
    use super::*;

    /// The ladder for a given mode and entry point is a fixed, ordered list.
    #[test]
    fn ladder_is_deterministic_for_modes() {
        let fast_ladder = AcquisitionRunner::strategy_ladder(AcquisitionMode::Fast, None);
        let expected_fast = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::TlsProfiledHttp,
            StrategyUsed::BrowserLightStealth,
        ];
        assert_eq!(fast_ladder, expected_fast);

        let investigate_ladder = AcquisitionRunner::strategy_ladder(
            AcquisitionMode::Investigate,
            Some(StrategyUsed::StickyProxyBrowserSession),
        );
        let expected_investigate = vec![
            StrategyUsed::InvestigateEntry,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];
        assert_eq!(investigate_ladder, expected_investigate);
    }

    /// 403 and 429 count as block statuses; 200 and a missing status do not.
    #[test]
    fn block_statuses_are_classified() {
        for blocked in [403_u16, 429] {
            assert!(is_block_status(Some(blocked)));
        }
        assert!(!is_block_status(Some(200)));
        assert!(!is_block_status(None));
    }

    /// Userinfo, port and path are stripped from the extracted host.
    #[test]
    fn host_hint_extracts_authority() {
        let hint = host_hint("https://user:pass@example.com:8443/path");
        assert_eq!(hint, Some("example.com".to_string()));
    }

    /// A four-byte character that does not fit in the budget is dropped whole.
    #[test]
    fn truncate_html_respects_utf8_boundaries() {
        let truncated = truncate_html("abc😀def", 5);
        assert_eq!(truncated, "abc");
    }

    /// The connect URL is found when it only appears under the `data` object.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_connect_url_is_extracted_from_nested_data() {
        let nested = serde_json::json!({
            "data": {
                "connectUrl": "wss://connect.browserbase.example/devtools/browser/abc"
            }
        });

        let extracted = browserbase_connect_url(&nested);

        assert_eq!(
            extracted,
            Some("wss://connect.browserbase.example/devtools/browser/abc".to_string())
        );
    }

    /// The Browserbase stage lands directly ahead of the sticky-session stage.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_stage_is_inserted_before_sticky_stage() {
        let mut stages = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];

        maybe_insert_browserbase_stage(&mut stages, true);

        let expected = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::BrowserbaseManagedSession,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];
        assert_eq!(stages, expected);
    }
}