1use std::sync::Arc;
8use std::time::{Duration, Instant};
9
10#[cfg(feature = "browserbase")]
11use chromiumoxide::Browser;
12#[cfg(feature = "browserbase")]
13use futures::StreamExt;
14use serde::{Deserialize, Serialize};
15use serde_json::Value;
16#[cfg(feature = "browserbase")]
17use tokio::time::timeout;
18
19use crate::BrowserPool;
20use crate::error::BrowserError;
21use crate::page::WaitUntil;
22
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
25#[serde(rename_all = "snake_case")]
26pub enum AcquisitionMode {
27 Fast,
29 Resilient,
31 Hostile,
33 Investigate,
35}
36
37#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
39#[serde(rename_all = "snake_case")]
40pub enum StrategyUsed {
41 DirectHttp,
43 TlsProfiledHttp,
45 BrowserLightStealth,
47 StickyProxyBrowserSession,
49 #[cfg(feature = "browserbase")]
51 BrowserbaseManagedSession,
52 InvestigateEntry,
54}
55
56#[derive(Debug, Clone)]
58pub struct AcquisitionRequest {
59 pub url: String,
61 pub mode: AcquisitionMode,
63 pub wait_for_selector: Option<String>,
65 pub extraction_js: Option<String>,
67 pub total_timeout: Duration,
69 pub navigation_timeout: Duration,
71 pub request_timeout: Duration,
73 pub html_excerpt_bytes: usize,
75 pub investigate_start: Option<StrategyUsed>,
77 pub browserbase_enabled: bool,
79}
80
81impl Default for AcquisitionRequest {
82 fn default() -> Self {
83 Self {
84 url: String::new(),
85 mode: AcquisitionMode::Resilient,
86 wait_for_selector: None,
87 extraction_js: None,
88 total_timeout: Duration::from_secs(45),
89 navigation_timeout: Duration::from_secs(30),
90 request_timeout: Duration::from_secs(15),
91 html_excerpt_bytes: 4_096,
92 investigate_start: None,
93 browserbase_enabled: false,
94 }
95 }
96}
97
98#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
100#[serde(rename_all = "snake_case")]
101pub enum StageFailureKind {
102 Setup,
104 Timeout,
106 Blocked,
108 Transport,
110 Extraction,
112}
113
114#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
116pub struct StageFailure {
117 pub strategy: StrategyUsed,
119 pub kind: StageFailureKind,
121 pub message: String,
123}
124
125#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct AcquisitionResult {
128 pub success: bool,
130 pub strategy_used: Option<StrategyUsed>,
132 pub attempted: Vec<StrategyUsed>,
134 pub final_url: Option<String>,
136 pub status_code: Option<u16>,
138 pub html_excerpt: Option<String>,
140 pub extracted: Option<Value>,
142 pub failures: Vec<StageFailure>,
144 pub timed_out: bool,
146}
147
148impl AcquisitionResult {
149 const fn empty() -> Self {
150 Self {
151 success: false,
152 strategy_used: None,
153 attempted: Vec::new(),
154 final_url: None,
155 status_code: None,
156 html_excerpt: None,
157 extracted: None,
158 failures: Vec::new(),
159 timed_out: false,
160 }
161 }
162}
163
164#[derive(Debug, Clone)]
165struct StageSuccess {
166 final_url: Option<String>,
167 status_code: Option<u16>,
168 html_excerpt: Option<String>,
169 extracted: Option<Value>,
170}
171
172#[derive(Debug, Clone)]
173enum StageOutcome {
174 Marker,
175 Success(StageSuccess),
176 Failure(StageFailure),
177}
178
179#[derive(Clone)]
181pub struct AcquisitionRunner {
182 pool: Arc<BrowserPool>,
183}
184
185impl AcquisitionRunner {
186 #[must_use]
200 pub const fn new(pool: Arc<BrowserPool>) -> Self {
201 Self { pool }
202 }
203
204 #[must_use]
208 pub fn strategy_ladder(
209 mode: AcquisitionMode,
210 investigate_start: Option<StrategyUsed>,
211 ) -> Vec<StrategyUsed> {
212 let mut stages = match mode {
213 AcquisitionMode::Fast => vec![
214 StrategyUsed::DirectHttp,
215 StrategyUsed::TlsProfiledHttp,
216 StrategyUsed::BrowserLightStealth,
217 ],
218 AcquisitionMode::Resilient => vec![
219 StrategyUsed::DirectHttp,
220 StrategyUsed::TlsProfiledHttp,
221 StrategyUsed::BrowserLightStealth,
222 StrategyUsed::StickyProxyBrowserSession,
223 ],
224 AcquisitionMode::Hostile => vec![
225 StrategyUsed::BrowserLightStealth,
226 StrategyUsed::StickyProxyBrowserSession,
227 StrategyUsed::TlsProfiledHttp,
228 StrategyUsed::DirectHttp,
229 ],
230 AcquisitionMode::Investigate => {
231 let start = investigate_start.unwrap_or(StrategyUsed::BrowserLightStealth);
232 vec![
233 StrategyUsed::InvestigateEntry,
234 start,
235 StrategyUsed::StickyProxyBrowserSession,
236 StrategyUsed::TlsProfiledHttp,
237 ]
238 }
239 };
240
241 dedupe_preserve_order(&mut stages);
242 stages
243 }
244
245 pub async fn run(&self, request: AcquisitionRequest) -> AcquisitionResult {
268 let timeout = request.total_timeout;
269 let timeout_strategy = Self::strategy_ladder(request.mode, request.investigate_start)
270 .into_iter()
271 .find(|strategy| *strategy != StrategyUsed::InvestigateEntry)
272 .unwrap_or(StrategyUsed::DirectHttp);
273 let mut result = tokio::time::timeout(timeout, self.run_inner(&request))
274 .await
275 .unwrap_or_else(|_| {
276 let mut timed_out = AcquisitionResult::empty();
277 timed_out.timed_out = true;
278 timed_out.failures.push(StageFailure {
279 strategy: timeout_strategy,
280 kind: StageFailureKind::Timeout,
281 message: format!("acquisition timed out after {}ms", timeout.as_millis()),
282 });
283 timed_out
284 });
285
286 if !result.success {
287 if result.failures.is_empty() {
289 result.failures.push(StageFailure {
290 strategy: timeout_strategy,
291 kind: StageFailureKind::Transport,
292 message: "acquisition ended without stage output".to_string(),
293 });
294 }
295 }
296
297 result
298 }
299
300 async fn run_inner(&self, request: &AcquisitionRequest) -> AcquisitionResult {
301 let mut result = AcquisitionResult::empty();
302
303 #[cfg(feature = "browserbase")]
304 let mut ladder = Self::strategy_ladder(request.mode, request.investigate_start);
305
306 #[cfg(not(feature = "browserbase"))]
307 let ladder = Self::strategy_ladder(request.mode, request.investigate_start);
308
309 #[cfg(feature = "browserbase")]
310 {
311 maybe_insert_browserbase_stage(&mut ladder, request.browserbase_enabled);
312 }
313 let started = Instant::now();
314
315 for strategy in ladder {
316 if started.elapsed() >= request.total_timeout {
317 result.timed_out = true;
318 result.failures.push(StageFailure {
319 strategy,
320 kind: StageFailureKind::Timeout,
321 message: "wall-clock timeout reached before stage execution".to_string(),
322 });
323 break;
324 }
325
326 result.attempted.push(strategy);
327 match self.execute_stage(strategy, request).await {
328 StageOutcome::Marker => {}
329 StageOutcome::Success(success) => {
330 result.success = true;
331 result.strategy_used = Some(strategy);
332 result.final_url = success.final_url;
333 result.status_code = success.status_code;
334 result.html_excerpt = success.html_excerpt;
335 result.extracted = success.extracted;
336 break;
337 }
338 StageOutcome::Failure(failure) => result.failures.push(failure),
339 }
340 }
341
342 result
343 }
344
345 async fn execute_stage(
346 &self,
347 strategy: StrategyUsed,
348 request: &AcquisitionRequest,
349 ) -> StageOutcome {
350 match strategy {
351 StrategyUsed::DirectHttp => {
352 #[cfg(feature = "tls-config")]
353 {
354 self.run_http_stage(request, false).await
355 }
356
357 #[cfg(not(feature = "tls-config"))]
358 {
359 self.run_http_stage(request, false)
360 }
361 }
362 StrategyUsed::TlsProfiledHttp => {
363 #[cfg(feature = "tls-config")]
364 {
365 self.run_http_stage(request, true).await
366 }
367
368 #[cfg(not(feature = "tls-config"))]
369 {
370 self.run_http_stage(request, true)
371 }
372 }
373 StrategyUsed::BrowserLightStealth => self.run_browser_stage(request, false).await,
374 StrategyUsed::StickyProxyBrowserSession => self.run_browser_stage(request, true).await,
375 #[cfg(feature = "browserbase")]
376 StrategyUsed::BrowserbaseManagedSession => Self::run_browserbase_stage(request).await,
377 StrategyUsed::InvestigateEntry => StageOutcome::Marker,
378 }
379 }
380
381 #[cfg(feature = "browserbase")]
382 #[allow(clippy::too_many_lines)]
383 async fn run_browserbase_stage(request: &AcquisitionRequest) -> StageOutcome {
384 if !request.browserbase_enabled {
385 return StageOutcome::Failure(StageFailure {
386 strategy: StrategyUsed::BrowserbaseManagedSession,
387 kind: StageFailureKind::Setup,
388 message: "browserbase stage disabled for this request".to_string(),
389 });
390 }
391
392 let api_key = match std::env::var("BROWSERBASE_API_KEY") {
393 Ok(value) if !value.trim().is_empty() => value,
394 _ => {
395 return StageOutcome::Failure(StageFailure {
396 strategy: StrategyUsed::BrowserbaseManagedSession,
397 kind: StageFailureKind::Setup,
398 message: "browserbase requires BROWSERBASE_API_KEY".to_string(),
399 });
400 }
401 };
402
403 let project_id = match std::env::var("BROWSERBASE_PROJECT_ID") {
404 Ok(value) if !value.trim().is_empty() => value,
405 _ => {
406 return StageOutcome::Failure(StageFailure {
407 strategy: StrategyUsed::BrowserbaseManagedSession,
408 kind: StageFailureKind::Setup,
409 message: "browserbase requires BROWSERBASE_PROJECT_ID".to_string(),
410 });
411 }
412 };
413
414 let session = match create_browserbase_session(request, &api_key, &project_id).await {
415 Ok(session) => session,
416 Err(err) => {
417 return StageOutcome::Failure(StageFailure {
418 strategy: StrategyUsed::BrowserbaseManagedSession,
419 kind: classify_browser_error(&err),
420 message: err.to_string(),
421 });
422 }
423 };
424
425 let connect_timeout = request.request_timeout.min(request.total_timeout);
426 let (mut browser, mut handler) = match timeout(
427 connect_timeout,
428 Browser::connect(session.connect_url.clone()),
429 )
430 .await
431 {
432 Ok(Ok(pair)) => pair,
433 Ok(Err(err)) => {
434 let _ = delete_browserbase_session(request, &api_key, &session.id).await;
435 return StageOutcome::Failure(StageFailure {
436 strategy: StrategyUsed::BrowserbaseManagedSession,
437 kind: StageFailureKind::Transport,
438 message: format!("browserbase connect failed: {err}"),
439 });
440 }
441 Err(_) => {
442 let _ = delete_browserbase_session(request, &api_key, &session.id).await;
443 return StageOutcome::Failure(StageFailure {
444 strategy: StrategyUsed::BrowserbaseManagedSession,
445 kind: StageFailureKind::Timeout,
446 message: format!(
447 "browserbase connect timed out after {}ms",
448 connect_timeout.as_millis()
449 ),
450 });
451 }
452 };
453
454 let handler_task = tokio::spawn(async move {
455 while let Some(event) = handler.next().await {
456 if let Err(error) = event {
457 tracing::warn!(%error, "browserbase handler error");
458 break;
459 }
460 }
461 });
462
463 let run_result =
464 async {
465 let raw_page = browser.new_page("about:blank").await.map_err(|err| {
466 BrowserError::CdpError {
467 operation: "Browser.newPage".to_string(),
468 message: err.to_string(),
469 }
470 })?;
471
472 let mut page = crate::page::PageHandle::new(raw_page, request.navigation_timeout);
473
474 page.navigate(
475 &request.url,
476 WaitUntil::DomContentLoaded,
477 request.navigation_timeout,
478 )
479 .await?;
480
481 if let Some(selector) = &request.wait_for_selector {
482 page.wait_for_selector(selector, request.navigation_timeout)
483 .await?;
484 }
485
486 let extracted = match request.extraction_js.as_deref() {
487 Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
488 BrowserError::ScriptExecutionFailed {
489 script: script.to_string(),
490 reason: err.to_string(),
491 }
492 })?),
493 None => None,
494 };
495
496 let html = page.content().await?;
497 let final_url = page.url().await.ok();
498 let status_code = page.status_code().ok().flatten();
499
500 Ok::<StageSuccess, BrowserError>(StageSuccess {
501 final_url,
502 status_code,
503 html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
504 extracted,
505 })
506 }
507 .await;
508
509 let _ = timeout(Duration::from_secs(5), browser.close()).await;
510 handler_task.abort();
511 let _ = delete_browserbase_session(request, &api_key, &session.id).await;
512
513 match run_result {
514 Ok(success) => {
515 if is_block_status(success.status_code) {
516 StageOutcome::Failure(StageFailure {
517 strategy: StrategyUsed::BrowserbaseManagedSession,
518 kind: StageFailureKind::Blocked,
519 message: format!(
520 "blocked status during browserbase stage: {:?}",
521 success.status_code
522 ),
523 })
524 } else {
525 StageOutcome::Success(success)
526 }
527 }
528 Err(err) => StageOutcome::Failure(StageFailure {
529 strategy: StrategyUsed::BrowserbaseManagedSession,
530 kind: classify_browser_error(&err),
531 message: err.to_string(),
532 }),
533 }
534 }
535
536 async fn run_browser_stage(&self, request: &AcquisitionRequest, sticky: bool) -> StageOutcome {
537 let strategy = if sticky {
538 StrategyUsed::StickyProxyBrowserSession
539 } else {
540 StrategyUsed::BrowserLightStealth
541 };
542
543 let handle_result = if sticky {
544 let context = host_hint(&request.url).unwrap_or_else(|| "default".to_string());
545 self.pool.acquire_for(&context).await
546 } else {
547 self.pool.acquire().await
548 };
549
550 let handle = match handle_result {
551 Ok(handle) => handle,
552 Err(err) => {
553 return StageOutcome::Failure(StageFailure {
554 strategy,
555 kind: StageFailureKind::Setup,
556 message: format!("browser acquire failed: {err}"),
557 });
558 }
559 };
560
561 let page_result = async {
562 let browser = handle.browser().ok_or_else(|| {
563 BrowserError::ConfigError("browser handle already released".to_string())
564 })?;
565 let mut page = browser.new_page().await?;
566 page.navigate(
567 &request.url,
568 WaitUntil::DomContentLoaded,
569 request.navigation_timeout,
570 )
571 .await?;
572
573 if let Some(selector) = &request.wait_for_selector {
574 page.wait_for_selector(selector, request.navigation_timeout)
575 .await?;
576 }
577
578 let extracted = match request.extraction_js.as_deref() {
579 Some(script) => Some(page.eval::<Value>(script).await.map_err(|err| {
580 BrowserError::ScriptExecutionFailed {
581 script: script.to_string(),
582 reason: err.to_string(),
583 }
584 })?),
585 None => None,
586 };
587
588 let html = page.content().await?;
589 let final_url = page.url().await.ok();
590 let status_code = page.status_code().ok().flatten();
591 let html_excerpt = truncate_html(&html, request.html_excerpt_bytes);
592
593 drop(page);
594
595 Ok::<StageSuccess, BrowserError>(StageSuccess {
596 final_url,
597 status_code,
598 html_excerpt: Some(html_excerpt),
599 extracted,
600 })
601 }
602 .await;
603
604 handle.release().await;
605
606 match page_result {
607 Ok(success) => {
608 if is_block_status(success.status_code) {
609 StageOutcome::Failure(StageFailure {
610 strategy,
611 kind: StageFailureKind::Blocked,
612 message: format!(
613 "blocked status during browser stage: {:?}",
614 success.status_code
615 ),
616 })
617 } else {
618 StageOutcome::Success(success)
619 }
620 }
621 Err(err) => StageOutcome::Failure(StageFailure {
622 strategy,
623 kind: classify_browser_error(&err),
624 message: err.to_string(),
625 }),
626 }
627 }
628
629 #[cfg(feature = "tls-config")]
630 async fn run_http_stage(
631 &self,
632 request: &AcquisitionRequest,
633 tls_profiled: bool,
634 ) -> StageOutcome {
635 if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
636 return StageOutcome::Failure(StageFailure {
637 strategy: if tls_profiled {
638 StrategyUsed::TlsProfiledHttp
639 } else {
640 StrategyUsed::DirectHttp
641 },
642 kind: StageFailureKind::Extraction,
643 message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
644 });
645 }
646
647 self.run_http_stage_impl(request, tls_profiled).await
648 }
649
650 #[cfg(not(feature = "tls-config"))]
651 fn run_http_stage(&self, request: &AcquisitionRequest, tls_profiled: bool) -> StageOutcome {
652 if request.wait_for_selector.is_some() || request.extraction_js.is_some() {
653 return StageOutcome::Failure(StageFailure {
654 strategy: if tls_profiled {
655 StrategyUsed::TlsProfiledHttp
656 } else {
657 StrategyUsed::DirectHttp
658 },
659 kind: StageFailureKind::Extraction,
660 message: "HTTP stages cannot satisfy selector/extraction requirements".to_string(),
661 });
662 }
663
664 self.run_http_stage_impl(request, tls_profiled)
665 }
666
667 #[cfg(feature = "tls-config")]
668 async fn run_http_stage_impl(
669 &self,
670 request: &AcquisitionRequest,
671 tls_profiled: bool,
672 ) -> StageOutcome {
673 use crate::tls::{CHROME_131, build_profiled_client_preset};
674
675 let strategy = if tls_profiled {
676 StrategyUsed::TlsProfiledHttp
677 } else {
678 StrategyUsed::DirectHttp
679 };
680
681 let client = if tls_profiled {
682 match build_profiled_client_preset(&CHROME_131, None) {
683 Ok(client) => client,
684 Err(err) => {
685 return StageOutcome::Failure(StageFailure {
686 strategy,
687 kind: StageFailureKind::Setup,
688 message: format!("tls-profiled client setup failed: {err}"),
689 });
690 }
691 }
692 } else {
693 match reqwest::Client::builder()
694 .timeout(request.request_timeout)
695 .cookie_store(true)
696 .build()
697 {
698 Ok(client) => client,
699 Err(err) => {
700 return StageOutcome::Failure(StageFailure {
701 strategy,
702 kind: StageFailureKind::Setup,
703 message: format!("http client setup failed: {err}"),
704 });
705 }
706 }
707 };
708
709 let response = match client
710 .get(&request.url)
711 .timeout(request.request_timeout)
712 .send()
713 .await
714 {
715 Ok(response) => response,
716 Err(err) => {
717 return StageOutcome::Failure(StageFailure {
718 strategy,
719 kind: if err.is_timeout() {
720 StageFailureKind::Timeout
721 } else {
722 StageFailureKind::Transport
723 },
724 message: err.to_string(),
725 });
726 }
727 };
728
729 let status_code = Some(response.status().as_u16());
730 let final_url = Some(response.url().to_string());
731 let html = match response.text().await {
732 Ok(text) => text,
733 Err(err) => {
734 return StageOutcome::Failure(StageFailure {
735 strategy,
736 kind: StageFailureKind::Transport,
737 message: format!("response body read failed: {err}"),
738 });
739 }
740 };
741
742 if is_block_status(status_code) {
743 return StageOutcome::Failure(StageFailure {
744 strategy,
745 kind: StageFailureKind::Blocked,
746 message: format!("blocked status from HTTP stage: {status_code:?}"),
747 });
748 }
749
750 StageOutcome::Success(StageSuccess {
751 final_url,
752 status_code,
753 html_excerpt: Some(truncate_html(&html, request.html_excerpt_bytes)),
754 extracted: None,
755 })
756 }
757
758 #[cfg(not(feature = "tls-config"))]
759 #[expect(
760 clippy::unused_self,
761 reason = "signature must match the tls-config variant for uniform call sites"
762 )]
763 fn run_http_stage_impl(
764 &self,
765 _request: &AcquisitionRequest,
766 tls_profiled: bool,
767 ) -> StageOutcome {
768 let strategy = if tls_profiled {
769 StrategyUsed::TlsProfiledHttp
770 } else {
771 StrategyUsed::DirectHttp
772 };
773 StageOutcome::Failure(StageFailure {
774 strategy,
775 kind: StageFailureKind::Setup,
776 message: "HTTP acquisition requires the `tls-config` feature".to_string(),
777 })
778 }
779}
780
/// Identity and endpoint of a session returned by the Browserbase create call.
#[cfg(feature = "browserbase")]
#[derive(Clone, Debug)]
struct BrowserbaseSession {
    /// Session identifier, used later to delete the session.
    id: String,
    /// URL handed to `Browser::connect` to attach to the remote browser.
    connect_url: String,
}
787
/// Creates a Browserbase session for `project_id` and returns its id and connect
/// URL.
///
/// The request is a `POST {base}/sessions` carrying the API key both as a bearer
/// token and in the `x-bb-api-key` header, limited by `request.request_timeout`.
///
/// # Errors
///
/// - [`BrowserError::ConfigError`] when the HTTP client cannot be built or the
///   response lacks a connect URL or session id.
/// - [`BrowserError::ConnectionError`] when the request fails, the API answers
///   with a non-success status, or the body is not valid JSON. The reported URL
///   is always the endpoint that was called.
///
/// If the response carries a session id but no connect URL, the session is
/// deleted (best effort) before the error is returned so it is not left running
/// remotely.
#[cfg(feature = "browserbase")]
async fn create_browserbase_session(
    request: &AcquisitionRequest,
    api_key: &str,
    project_id: &str,
) -> Result<BrowserbaseSession, BrowserError> {
    let client = reqwest::Client::builder()
        .timeout(request.request_timeout)
        .build()
        .map_err(|err| {
            BrowserError::ConfigError(format!("browserbase client setup failed: {err}"))
        })?;

    let create_url = format!("{}/sessions", browserbase_api_base());
    let response = client
        .post(create_url.as_str())
        .bearer_auth(api_key)
        .header("x-bb-api-key", api_key)
        .json(&serde_json::json!({ "projectId": project_id }))
        .send()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: err.to_string(),
        })?;

    let status = response.status();
    if !status.is_success() {
        // The body is only used to enrich the error message.
        let body = response.text().await.unwrap_or_default();
        return Err(BrowserError::ConnectionError {
            url: create_url,
            reason: format!("session create failed ({status}): {body}"),
        });
    }

    let payload: Value = response
        .json()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: create_url.clone(),
            reason: format!("session create response parse failed: {err}"),
        })?;

    let session_id = browserbase_session_id(&payload);

    let Some(connect_url) = browserbase_connect_url(&payload) else {
        // The session exists but is unusable; release it instead of leaking it.
        if let Some(orphan_id) = session_id.as_deref() {
            let _ = delete_browserbase_session(request, api_key, orphan_id).await;
        }
        return Err(BrowserError::ConfigError(
            "browserbase response missing connect URL".to_string(),
        ));
    };

    let id = session_id.ok_or_else(|| {
        BrowserError::ConfigError("browserbase response missing session id".to_string())
    })?;

    Ok(BrowserbaseSession { id, connect_url })
}
843
/// Asks the Browserbase API to delete the session `session_id`.
///
/// Sends `DELETE {base}/sessions/{session_id}` with the API key as a bearer token
/// and in the `x-bb-api-key` header, limited by `request.request_timeout`.
///
/// NOTE(review): confirm that the Browserbase API accepts `DELETE` on this path
/// for ending a session.
///
/// # Errors
///
/// - [`BrowserError::ConfigError`] when the HTTP client cannot be built.
/// - [`BrowserError::ConnectionError`] when the request fails or the API answers
///   with a non-success status.
#[cfg(feature = "browserbase")]
async fn delete_browserbase_session(
    request: &AcquisitionRequest,
    api_key: &str,
    session_id: &str,
) -> Result<(), BrowserError> {
    let client = reqwest::Client::builder()
        .timeout(request.request_timeout)
        .build()
        .map_err(|err| {
            BrowserError::ConfigError(format!("browserbase client setup failed: {err}"))
        })?;

    let delete_url = format!("{}/sessions/{session_id}", browserbase_api_base());
    let status = client
        .delete(delete_url.as_str())
        .bearer_auth(api_key)
        .header("x-bb-api-key", api_key)
        .send()
        .await
        .map_err(|err| BrowserError::ConnectionError {
            url: delete_url.clone(),
            reason: err.to_string(),
        })?
        .status();

    if !status.is_success() {
        return Err(BrowserError::ConnectionError {
            url: delete_url,
            reason: format!("session delete failed with status {status}"),
        });
    }

    Ok(())
}
878
/// Returns the Browserbase API base URL without a trailing slash.
///
/// `BROWSERBASE_API_BASE` overrides the built-in default. A value that is unset,
/// blank, or consists only of slashes falls back to the default instead of
/// producing an empty base (which would turn every endpoint into a relative URL).
#[cfg(feature = "browserbase")]
fn browserbase_api_base() -> String {
    const DEFAULT_BASE: &str = "https://api.browserbase.com/v1";

    let configured = std::env::var("BROWSERBASE_API_BASE").ok();
    configured
        .as_deref()
        .map(|value| value.trim().trim_end_matches('/'))
        .filter(|value| !value.is_empty())
        .unwrap_or(DEFAULT_BASE)
        .to_string()
}
886
/// Extracts the session id from a Browserbase create-session response.
///
/// Looks for `id`, `sessionId`, then `session_id` at the top level and, failing
/// that, inside a nested `data` object. Keys whose value is not a JSON string are
/// skipped, so a non-string `id` no longer hides a usable `sessionId`; this
/// matches how `browserbase_connect_url` treats its keys.
#[cfg(feature = "browserbase")]
fn browserbase_session_id(payload: &Value) -> Option<String> {
    const ID_KEYS: [&str; 3] = ["id", "sessionId", "session_id"];

    /// First string-valued id key directly on `node`.
    fn first_string(node: &Value) -> Option<&str> {
        ID_KEYS
            .iter()
            .find_map(|key| node.get(*key).and_then(Value::as_str))
    }

    first_string(payload)
        .or_else(|| payload.get("data").and_then(first_string))
        .map(ToString::to_string)
}
899
/// Extracts the connect URL from a Browserbase create-session response.
///
/// Several spellings of the key are accepted. The top level of the payload is
/// searched first, then a nested `data` object; within each level the first key
/// (in the order listed below) that holds a JSON string wins.
#[cfg(feature = "browserbase")]
fn browserbase_connect_url(payload: &Value) -> Option<String> {
    const URL_KEYS: [&str; 9] = [
        "connectUrl",
        "connect_url",
        "wsUrl",
        "ws_url",
        "websocketUrl",
        "websocket_url",
        "browserWSEndpoint",
        "wsEndpoint",
        "ws_endpoint",
    ];

    /// First string-valued URL key directly on `node`.
    fn first_string(node: &Value) -> Option<&str> {
        URL_KEYS
            .iter()
            .find_map(|key| node.get(*key).and_then(Value::as_str))
    }

    let found = match first_string(payload) {
        Some(top_level) => Some(top_level),
        None => payload.get("data").and_then(first_string),
    };
    found.map(str::to_owned)
}
934
935fn dedupe_preserve_order(stages: &mut Vec<StrategyUsed>) {
936 let mut seen = Vec::new();
937 stages.retain(|stage| {
938 if seen.contains(stage) {
939 false
940 } else {
941 seen.push(*stage);
942 true
943 }
944 });
945}
946
/// Adds the Browserbase stage to `stages` when `enabled` and not already present.
///
/// The stage goes immediately before the sticky-proxy browser stage when the
/// ladder has one, otherwise at the end.
#[cfg(feature = "browserbase")]
fn maybe_insert_browserbase_stage(stages: &mut Vec<StrategyUsed>, enabled: bool) {
    let already_present = stages.contains(&StrategyUsed::BrowserbaseManagedSession);
    if enabled && !already_present {
        let sticky_index = stages
            .iter()
            .position(|stage| matches!(stage, StrategyUsed::StickyProxyBrowserSession));

        match sticky_index {
            Some(index) => stages.insert(index, StrategyUsed::BrowserbaseManagedSession),
            None => stages.push(StrategyUsed::BrowserbaseManagedSession),
        }
    }
}
962
963fn classify_browser_error(error: &BrowserError) -> StageFailureKind {
964 match error {
965 BrowserError::Timeout { .. } => StageFailureKind::Timeout,
966 BrowserError::NavigationFailed { reason, .. } if reason.contains("selector") => {
967 StageFailureKind::Blocked
968 }
969 BrowserError::ScriptExecutionFailed { .. } => StageFailureKind::Extraction,
970 BrowserError::ConfigError(_) | BrowserError::PoolExhausted { .. } => {
971 StageFailureKind::Setup
972 }
973 BrowserError::ProxyUnavailable { .. }
974 | BrowserError::ConnectionError { .. }
975 | BrowserError::CdpError { .. }
976 | BrowserError::LaunchFailed { .. }
977 | BrowserError::NavigationFailed { .. }
978 | BrowserError::Io(_)
979 | BrowserError::StaleNode { .. } => StageFailureKind::Transport,
980 #[cfg(feature = "extract")]
981 BrowserError::ExtractionFailed(_) => StageFailureKind::Extraction,
982 }
983}
984
/// Reports whether `status` is one of the HTTP codes treated as "blocked":
/// 401, 403, 407, 429 or 503. A missing status is never considered blocked.
const fn is_block_status(status: Option<u16>) -> bool {
    match status {
        Some(code) => matches!(code, 401 | 403 | 407 | 429 | 503),
        None => false,
    }
}
988
/// Returns the longest prefix of `html` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// Input that already fits is returned unchanged.
fn truncate_html(html: &str, max_bytes: usize) -> String {
    if html.len() <= max_bytes {
        return html.to_string();
    }

    // Back up from the byte limit to the nearest character boundary. Index 0 is
    // always a boundary, so the loop terminates.
    let mut cut = max_bytes;
    while !html.is_char_boundary(cut) {
        cut -= 1;
    }
    html[..cut].to_string()
}
1003
/// Extracts the lower-cased host from an absolute URL, for use as a grouping key.
///
/// The authority component ends at the first `/`, `?` or `#`. Any userinfo
/// (`user:pass@`) and port are stripped. A bracketed IPv6 literal is returned
/// with its brackets, e.g. `[::1]`.
///
/// Returns `None` when the input has no `://` separator, when the host is empty,
/// or when an IPv6 literal is empty or missing its closing bracket.
fn host_hint(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;

    // Cutting only at '/' would let a query or fragment leak into the host, and
    // an '@' inside the query would then be mistaken for the end of userinfo.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;

    // Whatever follows the last '@' is host[:port].
    let host_port = authority.rsplit('@').next()?;

    let host = if host_port.starts_with('[') {
        // IPv6 literal: the colons inside the brackets are not port separators.
        let closing = host_port.find(']')?;
        if closing == 1 {
            return None;
        }
        &host_port[..=closing]
    } else {
        host_port.split(':').next()?
    };

    if host.is_empty() {
        None
    } else {
        Some(host.to_ascii_lowercase())
    }
}
1014
#[cfg(test)]
mod tests {
    use super::*;

    /// Ladders are fixed per mode, and an Investigate start stage that repeats a
    /// fixed entry appears only once.
    #[test]
    fn ladder_is_deterministic_for_modes() {
        let fast = AcquisitionRunner::strategy_ladder(AcquisitionMode::Fast, None);
        assert_eq!(
            fast,
            [
                StrategyUsed::DirectHttp,
                StrategyUsed::TlsProfiledHttp,
                StrategyUsed::BrowserLightStealth,
            ]
        );

        let investigate = AcquisitionRunner::strategy_ladder(
            AcquisitionMode::Investigate,
            Some(StrategyUsed::StickyProxyBrowserSession),
        );
        assert_eq!(
            investigate,
            [
                StrategyUsed::InvestigateEntry,
                StrategyUsed::StickyProxyBrowserSession,
                StrategyUsed::TlsProfiledHttp,
            ]
        );
    }

    /// Only the designated status codes count as blocked.
    #[test]
    fn block_statuses_are_classified() {
        for blocked in [403, 429] {
            assert!(is_block_status(Some(blocked)));
        }
        assert!(!is_block_status(Some(200)));
        assert!(!is_block_status(None));
    }

    /// Userinfo, port and path are stripped from the host hint.
    #[test]
    fn host_hint_extracts_authority() {
        let hint = host_hint("https://user:pass@example.com:8443/path");
        assert_eq!(hint.as_deref(), Some("example.com"));
    }

    /// A limit that falls inside a multi-byte character backs up to its start.
    #[test]
    fn truncate_html_respects_utf8_boundaries() {
        assert_eq!(truncate_html("abc😀def", 5), "abc");
    }

    /// The connect URL is found inside a nested `data` object.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_connect_url_is_extracted_from_nested_data() {
        let expected = "wss://connect.browserbase.example/devtools/browser/abc";
        let payload = serde_json::json!({
            "data": {
                "connectUrl": "wss://connect.browserbase.example/devtools/browser/abc"
            }
        });

        assert_eq!(browserbase_connect_url(&payload).as_deref(), Some(expected));
    }

    /// The Browserbase stage lands directly in front of the sticky-proxy stage.
    #[cfg(feature = "browserbase")]
    #[test]
    fn browserbase_stage_is_inserted_before_sticky_stage() {
        let mut ladder = vec![
            StrategyUsed::DirectHttp,
            StrategyUsed::StickyProxyBrowserSession,
            StrategyUsed::TlsProfiledHttp,
        ];

        maybe_insert_browserbase_stage(&mut ladder, true);

        assert_eq!(
            ladder,
            [
                StrategyUsed::DirectHttp,
                StrategyUsed::BrowserbaseManagedSession,
                StrategyUsed::StickyProxyBrowserSession,
                StrategyUsed::TlsProfiledHttp,
            ]
        );
    }
}