1pub mod abs;
3pub mod connect;
5pub mod css_selectors;
7pub mod templates;
9
10#[cfg(feature = "chrome")]
11pub(crate) mod detect_chrome;
12#[cfg(any(feature = "balance", feature = "disk"))]
13pub mod detect_system;
15pub mod header_utils;
17pub mod interner;
19pub mod trie;
21pub mod validation;
23
24#[cfg(all(not(feature = "fs"), feature = "chrome"))]
25use crate::features::automation::RemoteMultimodalConfigs;
26use crate::{
27 page::{AntiBotTech, Metadata, STREAMING_CHUNK_SIZE},
28 RelativeSelectors,
29};
30use abs::parse_absolute_url;
31use aho_corasick::AhoCorasick;
32use auto_encoder::is_binary_file;
33use case_insensitive_string::CaseInsensitiveString;
34
35#[cfg(feature = "chrome")]
36use hashbrown::HashMap;
37use hashbrown::HashSet;
38
39use lol_html::{send::HtmlRewriter, OutputSink};
40use phf::phf_set;
41use reqwest::header::CONTENT_LENGTH;
42#[cfg(feature = "chrome")]
43use reqwest::header::{HeaderMap, HeaderValue};
44use std::{
45 error::Error,
46 future::Future,
47 str::FromStr,
48 sync::Arc,
49 time::{Duration, Instant},
50};
51use tokio::sync::Semaphore;
52use url::Url;
53
54#[cfg(feature = "chrome")]
55use crate::features::chrome_common::{AutomationScripts, ExecutionScripts};
56use crate::page::{MAX_PRE_ALLOCATED_HTML_PAGE_SIZE, MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE};
57use crate::tokio_stream::StreamExt;
58use crate::Client;
59
60#[cfg(feature = "cache_chrome_hybrid")]
61use http_cache_semantics::{RequestLike, ResponseLike};
62
63use log::{info, log_enabled, Level};
64
65#[cfg(not(feature = "wreq"))]
66use reqwest::{Response, StatusCode};
67#[cfg(feature = "wreq")]
68use wreq::{Response, StatusCode};
69
/// Request error type used when neither `cache_request` nor `wreq` is enabled.
#[cfg(all(not(feature = "cache_request"), not(feature = "wreq")))]
pub(crate) type RequestError = reqwest::Error;

/// Request error type used when `wreq` is enabled without `cache_request`.
#[cfg(all(not(feature = "cache_request"), feature = "wreq"))]
pub(crate) type RequestError = wreq::Error;

/// Request error type used when `cache_request` is enabled (middleware error).
#[cfg(feature = "cache_request")]
pub(crate) type RequestError = reqwest_middleware::Error;

/// Response type of the active HTTP client (`reqwest` or `wreq`, per feature flags).
pub(crate) type RequestResponse = Response;
83
/// Millisecond sleep durations cycled through (by index, wrapping) in the
/// chrome wait loops of this module.
#[cfg(feature = "chrome")]
const WAIT_TIMEOUTS: [u64; 6] = [0, 20, 50, 100, 100, 500];
/// Compile-time set of content-type strings covering archives, images, video,
/// audio, office documents and other binary formats.
///
/// NOTE(review): presumably responses with these content types are skipped by
/// the crawler; the consumers are not visible in this part of the file.
pub static IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf_set! {
    "application/pdf",
    "application/zip",
    "application/x-rar-compressed",
    "application/x-tar",
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/bmp",
    "image/svg+xml",
    "video/mp4",
    "video/x-msvideo",
    "video/x-matroska",
    "video/webm",
    "audio/mpeg",
    "audio/ogg",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/x-7z-compressed",
    "application/x-rpm",
    "application/x-shockwave-flash",
};
117
lazy_static! {
    /// Exact bytes of a stock Apache "403 Forbidden" page; compared for equality
    /// in `detect_hard_forbidden_content`. The array length is part of the type,
    /// so the literal must not be reformatted.
    pub static ref APACHE_FORBIDDEN: &'static [u8; 317] = br#"<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
<p>Additionally, a 403 Forbidden
error was encountered while trying to use an ErrorDocument to handle the request.</p>
</body></html>"#;

    /// Leading bytes of a stock OpenResty "403 Forbidden" page; used as a prefix
    /// match in `detect_open_resty_forbidden`.
    pub static ref OPEN_RESTY_FORBIDDEN: &'static [u8; 125] = br#"<html><head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr><center>openresty</center>"#;

    /// Bytes of a minimal empty HTML document.
    pub static ref EMPTY_HTML_BASIC: &'static [u8; 13] = b"<html></html>";

    /// Multi-pattern scanner for anti-bot markers in response bodies.
    /// The pattern order matters: `detect_anti_bot_from_body` maps pattern
    /// indices 0..=5 to vendors.
    static ref AC_BODY_SCAN: AhoCorasick = AhoCorasick::new([
        "cf-error-code",                              // 0
        "Access to this page has been denied",        // 1
        "DataDome",                                   // 2
        "perimeterx",                                 // 3
        "funcaptcha",                                 // 4
        "Request unsuccessful. Incapsula incident ID", // 5
    ]).unwrap();

    /// Multi-pattern scanner for anti-bot markers in URLs (leftmost-first match).
    /// The pattern order matters: `detect_antibot_from_url` maps pattern
    /// indices 0..=14 to vendors.
    static ref AC_URL_SCAN: AhoCorasick = AhoCorasick::builder()
        .match_kind(aho_corasick::MatchKind::LeftmostFirst)
        .build([
            "/cdn-cgi/challenge-platform", // 0
            "datadome.co",                 // 1
            "dd-api.io",                   // 2
            "perimeterx.net",              // 3
            "px-captcha",                  // 4
            "arkoselabs.com",              // 5
            "funcaptcha",                  // 6
            "kasada.io",                   // 7
            "fingerprint.com",             // 8
            "fpjs.io",                     // 9
            "incapsula",                   // 10
            "imperva",                     // 11
            "radwarebotmanager",           // 12
            "reblaze.com",                 // 13
            "cheq.ai",                     // 14
        ])
        .unwrap();
}
170
#[cfg(feature = "fs")]
lazy_static! {
    /// Temp path prefix: `<system temp dir>/spider/` followed by the current
    /// Unix time in seconds. Falls back to `/tmp/` when the `spider/` directory
    /// cannot be created, and to the bare directory when the clock is before
    /// the Unix epoch.
    static ref TMP_DIR: String = {
        let mut root = std::env::temp_dir();
        root.push("spider/");

        if std::fs::create_dir_all(&root).is_err() {
            "/tmp/".to_string()
        } else {
            let prefix = root.display().to_string();
            let since_epoch =
                std::time::SystemTime::now().duration_since(std::time::SystemTime::UNIX_EPOCH);

            match since_epoch {
                Ok(elapsed) => string_concat!(prefix, elapsed.as_secs().to_string()),
                Err(_) => prefix,
            }
        }
    };
}
195
#[cfg(feature = "chrome")]
lazy_static! {
    /// `true` only when the `MASK_BYTES_INTERCEPTION` environment variable is
    /// set to exactly `"true"`.
    pub(crate) static ref MASK_BYTES_INTERCEPTION: bool =
        matches!(std::env::var("MASK_BYTES_INTERCEPTION").as_deref(), Ok("true"));

    /// Wait configuration with a 1000 ms delay and an 8 s idle-network wait;
    /// every other field keeps its default.
    pub(crate) static ref CF_WAIT_FOR: crate::features::chrome_common::WaitFor = {
        use crate::features::chrome_common::{WaitFor, WaitForDelay, WaitForIdleNetwork};

        let delay = core::time::Duration::from_millis(1000);
        let idle = core::time::Duration::from_secs(8);

        let mut cfg = WaitFor::default();
        cfg.delay = WaitForDelay::new(Some(delay)).into();
        cfg.idle_network = WaitForIdleNetwork::new(idle.into()).into();
        cfg
    };
}
211
212#[inline(always)]
214pub fn detect_open_resty_forbidden(b: &[u8]) -> bool {
215 b.starts_with(*OPEN_RESTY_FORBIDDEN)
216}
217
218#[inline(always)]
220pub fn detect_hard_forbidden_content(b: &[u8]) -> bool {
221 b == *APACHE_FORBIDDEN || detect_open_resty_forbidden(b)
222}
223
224lazy_static! {
225 pub(crate) static ref MAX_SIZE_BYTES: usize = {
227 match std::env::var("SPIDER_MAX_SIZE_BYTES") {
228 Ok(b) => {
229 const DEFAULT_MAX_SIZE_BYTES: usize = 1_073_741_824; let b = b.parse::<usize>().unwrap_or(DEFAULT_MAX_SIZE_BYTES);
232
233 if b == 0 {
234 0
235 } else {
236 b.max(1_048_576) }
238 },
239 _ => 0
240 }
241 };
242}
243
/// Aggregated result of fetching a page: body, headers, status and the
/// optional feature-gated extras collected along the way.
#[derive(Debug, Default)]
pub struct PageResponse {
    /// Page body bytes, if any.
    pub content: Option<Box<Vec<u8>>>,
    /// Response headers, if captured.
    pub headers: Option<reqwest::header::HeaderMap>,
    /// Remote socket address of the peer (feature `remote_addr`).
    #[cfg(feature = "remote_addr")]
    pub remote_addr: Option<core::net::SocketAddr>,
    /// Cookies stored as a header map (feature `cookies`).
    #[cfg(feature = "cookies")]
    pub cookies: Option<reqwest::header::HeaderMap>,
    /// HTTP status code of the response.
    pub status_code: StatusCode,
    /// Final URL of the response, if recorded.
    pub final_url: Option<String>,
    /// Stored response-or-error result, if recorded.
    pub error_for_status: Option<Result<Response, RequestError>>,
    /// Screenshot bytes (feature `chrome`).
    #[cfg(feature = "chrome")]
    pub screenshot_bytes: Option<Vec<u8>>,
    /// OpenAI usage entries; appended to by `handle_openai_credits`.
    #[cfg(feature = "openai")]
    pub openai_credits_used: Option<Vec<crate::features::openai_common::OpenAIUsage>>,
    /// Extra OpenAI results; appended to by `handle_extra_ai_data`.
    #[cfg(feature = "openai")]
    pub extra_ai_data: Option<Vec<crate::page::AIResults>>,
    /// Gemini usage entries; appended to by `handle_gemini_credits`.
    #[cfg(feature = "gemini")]
    pub gemini_credits_used: Option<Vec<crate::features::gemini_common::GeminiUsage>>,
    /// Extra Gemini results.
    #[cfg(feature = "gemini")]
    pub extra_gemini_data: Option<Vec<crate::page::AIResults>>,
    /// Usage entries from remote multimodal automation.
    pub remote_multimodal_usage: Option<Vec<crate::features::automation::AutomationUsage>>,
    /// Extra results from remote multimodal automation.
    pub extra_remote_multimodal_data: Option<Vec<crate::page::AutomationResults>>,
    /// Whether a WAF / firewall check was flagged for this response.
    pub waf_check: bool,
    /// Bytes transferred, if measured.
    pub bytes_transferred: Option<f64>,
    /// Signature value for the content, if computed.
    /// NOTE(review): the hashing scheme is not visible here.
    pub signature: Option<u64>,
    /// Per-key response figures (feature `chrome`).
    /// NOTE(review): key/value semantics are not visible here — confirm.
    #[cfg(feature = "chrome")]
    pub response_map: Option<HashMap<String, f64>>,
    /// Per-key request figures (feature `chrome`).
    /// NOTE(review): key/value semantics are not visible here — confirm.
    #[cfg(feature = "chrome")]
    pub request_map: Option<HashMap<String, f64>>,
    /// Anti-bot technology detected for the response.
    pub anti_bot_tech: crate::page::AntiBotTech,
    /// Page metadata, if extracted.
    pub metadata: Option<Box<Metadata>>,
    /// Instant associated with the request (feature `time`).
    #[cfg(feature = "time")]
    pub duration: Option<tokio::time::Instant>,
}
304
/// Wait until an event of type `T` is observed on `page`, or until `timeout`
/// elapses when one is given.
///
/// Returns immediately when the event listener cannot be created. The wait
/// also ends when the event stream itself terminates: a finished stream can
/// never deliver the event, and re-polling it would otherwise spin until the
/// timeout (or forever when `timeout` is `None`).
#[cfg(feature = "chrome")]
pub async fn wait_for_event<T>(page: &chromiumoxide::Page, timeout: Option<core::time::Duration>)
where
    T: chromiumoxide::cdp::IntoEventKind + Unpin + std::fmt::Debug,
{
    if let Ok(mut events) = page.event_listener::<T>().await {
        let wait_until = async {
            let mut index = 0;

            loop {
                let current_timeout = WAIT_TIMEOUTS[index];
                let sleep = tokio::time::sleep(tokio::time::Duration::from_millis(current_timeout));

                tokio::select! {
                    _ = sleep => (),
                    // `Some(_)`: the awaited event arrived.
                    // `None`: the stream closed, nothing more will arrive.
                    _ = events.next() => break,
                }

                index = (index + 1) % WAIT_TIMEOUTS.len();
            }
        };

        match timeout {
            Some(timeout) => {
                // An elapsed timeout is an expected outcome, not an error.
                let _ = tokio::time::timeout(timeout, wait_until).await;
            }
            None => wait_until.await,
        }
    }
}
337
/// Repeatedly look up `selector` on `page` until it is found.
///
/// Each lookup is raced against a sleep taken from `WAIT_TIMEOUTS`; when the
/// sleep wins, the lookup is retried with the next duration. Returns `true`
/// once `find_element` succeeds and `false` when `timeout` elapses first.
/// Without a timeout the search continues until the element is found.
#[cfg(feature = "chrome")]
pub async fn wait_for_selector(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) -> bool {
    let search = async {
        let mut step = 0;

        loop {
            let pause = tokio::time::sleep(tokio::time::Duration::from_millis(WAIT_TIMEOUTS[step]));

            tokio::select! {
                _ = pause => (),
                lookup = page.find_element(selector) => {
                    if lookup.is_ok() {
                        break true;
                    }
                }
            }

            step = (step + 1) % WAIT_TIMEOUTS.len();
        }
    };

    match timeout {
        Some(limit) => tokio::time::timeout(limit, search).await.unwrap_or(false),
        None => search.await,
    }
}
378
/// Evaluate the generated "wait for DOM" script for `selector` on `page`.
///
/// `timeout` defaults to 1200 ms. The script receives the budget in
/// milliseconds, clamped to `u32::MAX` instead of silently wrapping. The
/// whole evaluation is additionally bounded by a hard timeout of the budget
/// plus 200 ms, computed with saturation so very large timeouts cannot
/// overflow and panic. The script's result and any evaluation error are
/// ignored.
#[cfg(feature = "chrome")]
pub async fn wait_for_dom(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) {
    let max = timeout.unwrap_or_else(|| core::time::Duration::from_millis(1200));
    let max_ms = u32::try_from(max.as_millis()).unwrap_or(u32::MAX);

    let script = crate::features::chrome_common::generate_wait_for_dom_js_v2(
        max_ms,
        selector,
        500,
        2,
        true,
        false,
    );

    let hard = max.saturating_add(core::time::Duration::from_millis(200));

    let _ = tokio::time::timeout(hard, page.evaluate(script)).await;
}
408
/// Build an output file path under `base` for `target_url`.
///
/// The file name is the URL with every non-alphanumeric byte percent-encoded,
/// followed by `format` appended verbatim. The parent directory is created on
/// a best-effort basis (errors are ignored). Returns the path as a string.
#[cfg(feature = "chrome")]
pub async fn create_output_path(
    base: &std::path::PathBuf,
    target_url: &str,
    format: &str,
) -> String {
    let encoded_url = percent_encoding::percent_encode(
        target_url.as_bytes(),
        percent_encoding::NON_ALPHANUMERIC,
    )
    .to_string();

    let file_name = string_concat!(&encoded_url, format);
    let full_path = base.join(&file_name);

    if let Some(parent) = full_path.parent() {
        let _ = tokio::fs::create_dir_all(&parent).await;
    }

    full_path.display().to_string()
}
433
/// Apply every configured wait of `wait_for` to `page`, in this order:
/// idle-network event, almost-idle network, idle network, selector, DOM,
/// then a fixed delay. Does nothing when `wait_for` is `None`. Results of the
/// individual waits are ignored.
#[cfg(feature = "chrome")]
pub async fn page_wait(
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
) {
    let Some(cfg) = wait_for else {
        return;
    };

    if let Some(idle) = &cfg.idle_network {
        wait_for_event::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
            page,
            idle.timeout,
        )
        .await;
    }

    if let Some(almost_idle) = &cfg.almost_idle_network0 {
        match almost_idle.timeout {
            Some(limit) => {
                let _ = page
                    .wait_for_network_almost_idle_with_timeout(limit)
                    .await;
            }
            None => {
                let _ = page.wait_for_network_almost_idle().await;
            }
        }
    }

    if let Some(idle0) = &cfg.idle_network0 {
        match idle0.timeout {
            Some(limit) => {
                let _ = page.wait_for_network_idle_with_timeout(limit).await;
            }
            None => {
                let _ = page.wait_for_network_idle().await;
            }
        }
    }

    if let Some(sel) = &cfg.selector {
        wait_for_selector(page, sel.timeout, &sel.selector).await;
    }

    if let Some(dom) = &cfg.dom {
        wait_for_dom(page, dom.timeout, &dom.selector).await;
    }

    // A delay entry without a timeout is a no-op.
    if let Some(pause) = cfg.delay.as_ref().and_then(|d| d.timeout) {
        tokio::time::sleep(pause).await;
    }
}
486
/// Parsed JSON payload of an OpenAI response (see `handle_ai_data`).
#[derive(Debug, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg(feature = "openai")]
pub struct JsonResponse {
    /// Content strings returned by the model.
    content: Vec<String>,
    /// JavaScript source returned by the model; evaluated on the page by
    /// `run_openai_request` when non-empty.
    js: String,
    /// Error message, if any; defaults to `None` when absent during
    /// deserialization.
    #[cfg_attr(feature = "serde", serde(default))]
    error: Option<String>,
}
500
/// Append `tokens_used` to the response's OpenAI usage list, creating the
/// list when it does not exist yet.
#[cfg(feature = "openai")]
pub fn handle_openai_credits(
    page_response: &mut PageResponse,
    tokens_used: crate::features::openai_common::OpenAIUsage,
) {
    page_response
        .openai_credits_used
        .get_or_insert_with(Vec::new)
        .push(tokens_used);
}
512
/// No-op variant compiled when the `openai` feature is disabled; both
/// arguments are ignored.
#[cfg(not(feature = "openai"))]
pub fn handle_openai_credits(
    _page_response: &mut PageResponse,
    _tokens_used: crate::features::openai_common::OpenAIUsage,
) {
}
520
/// Append `tokens_used` to the response's Gemini usage list, creating the
/// list when it does not exist yet.
#[cfg(feature = "gemini")]
pub fn handle_gemini_credits(
    page_response: &mut PageResponse,
    tokens_used: crate::features::gemini_common::GeminiUsage,
) {
    page_response
        .gemini_credits_used
        .get_or_insert_with(Vec::new)
        .push(tokens_used);
}
532
/// No-op variant compiled when the `gemini` feature is disabled; both
/// arguments are ignored.
#[cfg(not(feature = "gemini"))]
pub fn handle_gemini_credits(
    _page_response: &mut PageResponse,
    _tokens_used: crate::features::gemini_common::GeminiUsage,
) {
}
540
/// Record one AI result on the response.
///
/// Builds an `AIResults` entry from the prompt, the model's JavaScript, its
/// content strings (leading whitespace trimmed), the optional screenshot and
/// the optional error, then appends it to `extra_ai_data`, creating the list
/// when needed. The `error` field of `x` itself is not used.
#[cfg(feature = "openai")]
pub fn handle_extra_ai_data(
    page_response: &mut PageResponse,
    prompt: &str,
    x: JsonResponse,
    screenshot_output: Option<Vec<u8>>,
    error: Option<String>,
) {
    let JsonResponse { content, js, .. } = x;

    let content_output = content
        .iter()
        .map(|entry| entry.trim_start().into())
        .collect::<Vec<_>>();

    let record = crate::page::AIResults {
        input: prompt.into(),
        js_output: js,
        content_output,
        screenshot_output,
        error,
    };

    page_response
        .extra_ai_data
        .get_or_insert_with(Vec::new)
        .push(record);
}
567
/// Borrowed view over response headers coming from one of two containers.
pub enum HeaderSource<'a> {
    /// Headers held in the HTTP client's `HeaderMap`.
    HeaderMap(&'a crate::client::header::HeaderMap),
    /// Headers held in a plain string-to-string map (used for headers built
    /// from chrome responses in this file).
    Map(&'a std::collections::HashMap<String, String>),
}
575
576#[inline(always)]
577fn header_value<'a>(headers: &'a HeaderSource, key: &str) -> Option<&'a str> {
579 match headers {
580 HeaderSource::HeaderMap(hm) => hm.get(key).and_then(|v| v.to_str().ok()),
581 HeaderSource::Map(map) => map.get(key).map(|s| s.as_str()),
582 }
583}
584
585#[inline(always)]
586fn has_key(headers: &HeaderSource, key: &str) -> bool {
588 match headers {
589 HeaderSource::HeaderMap(hm) => hm.contains_key(key),
590 HeaderSource::Map(map) => map.contains_key(key),
591 }
592}
593
/// Compare `a`, with surrounding whitespace removed, against `b`, ignoring
/// ASCII case. `b` is used as given (not trimmed).
#[inline(always)]
fn eq_icase_trim(a: &str, b: &str) -> bool {
    let trimmed = a.trim();
    b.eq_ignore_ascii_case(trimmed)
}
599
600#[inline]
602pub fn detect_anti_bot_from_headers(headers: &HeaderSource) -> Option<AntiBotTech> {
603 if has_key(headers, "cf-chl-bypass") || has_key(headers, "cf-ray") {
605 return Some(AntiBotTech::Cloudflare);
606 }
607
608 if has_key(headers, "x-captcha-endpoint") {
610 return Some(AntiBotTech::DataDome);
611 }
612
613 if has_key(headers, "x-perimeterx") || has_key(headers, "pxhd") {
615 return Some(AntiBotTech::PerimeterX);
616 }
617
618 if has_key(headers, "x-akamaibot") {
620 return Some(AntiBotTech::AkamaiBotManager);
621 }
622
623 if has_key(headers, "x-imperva-id") || has_key(headers, "x-iinfo") {
625 return Some(AntiBotTech::Imperva);
626 }
627
628 if has_key(headers, "x-reblaze-uuid") {
630 return Some(AntiBotTech::Reblaze);
631 }
632
633 if header_value(headers, "x-cdn").is_some_and(|v| eq_icase_trim(v, "imperva")) {
634 return Some(AntiBotTech::Imperva);
635 }
636
637 None
638}
639
640pub fn detect_anti_bot_from_body(body: &Vec<u8>) -> Option<AntiBotTech> {
642 if body.len() < 30_000 {
644 if let Ok(finder) = AC_BODY_SCAN.try_find_iter(body) {
645 for mat in finder {
646 match mat.pattern().as_usize() {
647 0 => return Some(AntiBotTech::Cloudflare),
648 1 | 2 => return Some(AntiBotTech::DataDome),
649 3 => return Some(AntiBotTech::PerimeterX),
650 4 => return Some(AntiBotTech::ArkoseLabs),
651 5 => return Some(AntiBotTech::Imperva),
652 _ => (),
653 }
654 }
655 }
656 }
657
658 None
659}
660
661pub fn detect_antibot_from_url(url: &str) -> Option<AntiBotTech> {
663 if let Some(mat) = AC_URL_SCAN.find(url) {
664 let tech = match mat.pattern().as_usize() {
665 0 => AntiBotTech::Cloudflare,
666 1 | 2 => AntiBotTech::DataDome,
667 3 | 4 => AntiBotTech::PerimeterX,
668 5 | 6 => AntiBotTech::ArkoseLabs,
669 7 => AntiBotTech::Kasada,
670 8 | 9 => AntiBotTech::FingerprintJS,
671 10 | 11 => AntiBotTech::Imperva,
672 12 => AntiBotTech::RadwareBotManager,
673 13 => AntiBotTech::Reblaze,
674 14 => AntiBotTech::CHEQ,
675 _ => return None,
676 };
677 Some(tech)
678 } else {
679 None
680 }
681}
682
/// Swap the scheme of `url` between `http://` and `https://`.
///
/// The prefix match is case-sensitive. Returns `None` when `url` starts with
/// neither scheme.
pub fn flip_http_https(url: &str) -> Option<String> {
    url.strip_prefix("http://")
        .map(|rest| format!("https://{rest}"))
        .or_else(|| {
            url.strip_prefix("https://")
                .map(|rest| format!("http://{rest}"))
        })
}
693
694pub fn detect_anti_bot_tech_response(
696 url: &str,
697 headers: &HeaderSource,
698 body: &Vec<u8>,
699 subject_name: Option<&str>,
700) -> AntiBotTech {
701 if let Some(subject) = subject_name {
703 if subject == "challenges.cloudflare.com" {
704 return AntiBotTech::Cloudflare;
705 }
706 }
707
708 if let Some(tech) = detect_anti_bot_from_headers(headers) {
709 return tech;
710 }
711
712 if let Some(tech) = detect_antibot_from_url(url) {
713 return tech;
714 }
715
716 if let Some(anti_bot) = detect_anti_bot_from_body(body) {
717 return anti_bot;
718 }
719
720 AntiBotTech::None
721}
722
/// Parse `js` as a JSON-encoded `JsonResponse`; returns `None` when
/// deserialization fails.
#[cfg(feature = "openai")]
pub fn handle_ai_data(js: &str) -> Option<JsonResponse> {
    serde_json::from_str::<JsonResponse>(js).ok()
}
731
/// Summary of a chrome navigation request and its response, as produced by
/// `perform_chrome_http_request`.
#[cfg(feature = "chrome")]
#[derive(Default, Clone, Debug)]
pub struct ChromeHTTPReqRes {
    /// Whether a WAF / firewall challenge was flagged.
    pub waf_check: bool,
    /// HTTP status code of the response.
    pub status_code: StatusCode,
    /// HTTP method of the request.
    pub method: String,
    /// Response headers as string pairs.
    pub response_headers: std::collections::HashMap<String, String>,
    /// Request headers as string pairs.
    pub request_headers: std::collections::HashMap<String, String>,
    /// Protocol reported for the response.
    pub protocol: String,
    /// Anti-bot technology detected for the response.
    pub anti_bot_tech: crate::page::AntiBotTech,
}
751
#[cfg(feature = "chrome")]
impl ChromeHTTPReqRes {
    /// Returns `true` when the method and protocol are empty, both header
    /// maps are empty, and no anti-bot technology was detected. The
    /// `waf_check` and `status_code` fields are not considered.
    pub fn is_empty(&self) -> bool {
        let no_request_line = self.method.is_empty() && self.protocol.is_empty();
        let no_headers = self.request_headers.is_empty() && self.response_headers.is_empty();
        let no_anti_bot = self.anti_bot_tech == crate::page::AntiBotTech::None;

        no_request_line && no_anti_bot && no_headers
    }
}
763
/// Returns `true` when `err` carries chrome's
/// `net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH` failure, either directly in a
/// `ChromeMessage` or in the error's display text.
#[cfg(feature = "chrome")]
fn is_cipher_mismatch(err: &chromiumoxide::error::CdpError) -> bool {
    const NEEDLE: &str = "net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH";

    if let chromiumoxide::error::CdpError::ChromeMessage(msg) = err {
        msg.contains(NEEDLE)
    } else {
        err.to_string().contains(NEEDLE)
    }
}
776
/// Navigate `page` to `source` and summarize the resulting request/response.
///
/// Collects the method, request and response headers, protocol, status code,
/// detected anti-bot technology and a WAF flag into a `ChromeHTTPReqRes`.
/// When the navigation fails with an SSL version/cipher mismatch, it is
/// retried once with the scheme flipped between `http://` and `https://`.
///
/// # Errors
/// Returns the `CdpError` of the failed navigation (the first attempt's error
/// when no retry applies, otherwise the retry's result).
#[cfg(feature = "chrome")]
pub async fn perform_chrome_http_request(
    page: &chromiumoxide::Page,
    source: &str,
    referrer: Option<String>,
) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
    /// One navigation attempt against `source`.
    async fn attempt_once(
        page: &chromiumoxide::Page,
        source: &str,
        referrer: Option<String>,
    ) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
        // Defaults reported when the navigation yields no request/response data.
        let mut waf_check = false;
        let mut status_code = StatusCode::OK;
        let mut method = String::from("GET");
        let mut response_headers: std::collections::HashMap<String, String> =
            std::collections::HashMap::default();
        let mut request_headers = std::collections::HashMap::default();
        let mut protocol = String::from("http/1.1");
        let mut anti_bot_tech = AntiBotTech::default();

        let frame_id = page.mainframe().await?;

        let page_base =
            page.http_future(chromiumoxide::cdp::browser_protocol::page::NavigateParams {
                url: source.to_string(),
                transition_type: Some(
                    chromiumoxide::cdp::browser_protocol::page::TransitionType::Other,
                ),
                frame_id,
                referrer,
                referrer_policy: None,
            })?;

        match page_base.await {
            Ok(page_base) => {
                if let Some(http_request) = page_base {
                    if let Some(http_method) = http_request.method.as_deref() {
                        method = http_method.into();
                    }

                    request_headers.clone_from(&http_request.headers);

                    if let Some(response) = &http_request.response {
                        if let Some(p) = &response.protocol {
                            protocol.clone_from(p);
                        }

                        // NOTE(review): if the header values are JSON values,
                        // `to_string()` yields the JSON text (string values
                        // would keep their surrounding quotes) — confirm.
                        if let Some(res_headers) = response.headers.inner().as_object() {
                            for (k, v) in res_headers {
                                response_headers.insert(k.to_string(), v.to_string());
                            }
                        }

                        let mut firewall = false;

                        waf_check = detect_antibot_from_url(&response.url).is_some();

                        // Further inspection only happens when the response URL
                        // does not start with the requested `source`.
                        if !response.url.starts_with(source) {
                            match &response.security_details {
                                Some(security_details) => {
                                    // An empty body is passed, so detection here
                                    // relies on subject name, headers and URL.
                                    anti_bot_tech = detect_anti_bot_tech_response(
                                        &response.url,
                                        &HeaderSource::Map(&response_headers),
                                        &Default::default(),
                                        Some(&security_details.subject_name),
                                    );
                                    firewall = true;
                                }
                                _ => {
                                    anti_bot_tech = detect_anti_bot_tech_response(
                                        &response.url,
                                        &HeaderSource::Map(&response_headers),
                                        &Default::default(),
                                        None,
                                    );
                                    if anti_bot_tech == AntiBotTech::Cloudflare {
                                        // NOTE(review): this raw string literally
                                        // contains backslashes (`\"DENY\"`), since
                                        // raw strings have no escapes — confirm it
                                        // matches the stored header value format.
                                        if let Some(xframe_options) =
                                            response_headers.get("x-frame-options")
                                        {
                                            if xframe_options == r#"\"DENY\""# {
                                                firewall = true;
                                            }
                                        // NOTE(review): compares a response header
                                        // named `Accept-Encoding` to `cf-ray` —
                                        // confirm this is the intended check.
                                        } else if let Some(encoding) =
                                            response_headers.get("Accept-Encoding")
                                        {
                                            if encoding == r#"cf-ray"# {
                                                firewall = true;
                                            }
                                        }
                                    } else {
                                        firewall = true;
                                    }
                                }
                            };

                            // `&&` binds tighter than `||`: a URL-based hit alone is
                            // enough; otherwise both a firewall signal and a detected
                            // technology are required.
                            waf_check = waf_check
                                || firewall && !matches!(anti_bot_tech, AntiBotTech::None);

                            if !waf_check {
                                waf_check = match &response.protocol {
                                    Some(protocol) => protocol == "blob",
                                    _ => false,
                                }
                            }
                        }

                        // Status values that are not valid HTTP codes are reported
                        // as 417 Expectation Failed.
                        status_code = StatusCode::from_u16(response.status as u16)
                            .unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
                    } else if let Some(failure_text) = &http_request.failure_text {
                        // No response at all: a generic failure is flagged as WAF.
                        if failure_text == "net::ERR_FAILED" {
                            waf_check = true;
                        }
                    }
                }
            }
            Err(e) => return Err(e),
        }

        Ok(ChromeHTTPReqRes {
            waf_check,
            status_code,
            method,
            response_headers,
            request_headers,
            protocol,
            anti_bot_tech,
        })
    }

    match attempt_once(page, source, referrer.clone()).await {
        Ok(ok) => Ok(ok),
        Err(e) => {
            // Retry once on the other scheme when TLS negotiation failed.
            if is_cipher_mismatch(&e) {
                if let Some(flipped) = flip_http_https(source) {
                    return attempt_once(page, &flipped, referrer).await;
                }
            }
            Err(e)
        }
    }
}
921
/// Cache-intercepting variant of `perform_chrome_http_request`.
///
/// Navigates `page` to `source` with cache interception enabled, using the
/// auth token derived from `cache_options` and the policy converted from
/// `cache_policy`, and summarizes the resulting request/response. When the
/// navigation fails with an SSL version/cipher mismatch, it is retried once
/// with the scheme flipped between `http://` and `https://`.
///
/// # Errors
/// Returns the `CdpError` of the failed navigation (the first attempt's error
/// when no retry applies, otherwise the retry's result).
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
pub async fn perform_chrome_http_request_cache(
    page: &chromiumoxide::Page,
    source: &str,
    referrer: Option<String>,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
    /// One navigation attempt against `source`.
    async fn attempt_once(
        page: &chromiumoxide::Page,
        source: &str,
        referrer: Option<String>,
        cache_options: &Option<CacheOptions>,
        cache_policy: &Option<BasicCachePolicy>,
    ) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
        // Defaults reported when the navigation yields no response data.
        let mut waf_check = false;
        let mut status_code = StatusCode::OK;
        let mut method = String::from("GET");
        let mut response_headers: std::collections::HashMap<String, String> =
            std::collections::HashMap::default();
        let mut request_headers = std::collections::HashMap::default();
        let mut protocol = String::from("http/1.1");
        let mut anti_bot_tech = AntiBotTech::default();

        let frame_id = page.mainframe().await?;

        let cmd = chromiumoxide::cdp::browser_protocol::page::NavigateParams {
            url: source.to_string(),
            transition_type: Some(
                chromiumoxide::cdp::browser_protocol::page::TransitionType::Other,
            ),
            frame_id,
            referrer,
            referrer_policy: None,
        };

        let auth_opt = cache_auth_token(cache_options);
        let cache_policy = cache_policy.as_ref().map(|f| f.from_basic());
        // No cache strategy or remote is supplied for this call.
        let cache_strategy = None;
        let remote = None;

        let page_base = page.http_future_with_cache_intercept_enabled(
            cmd,
            auth_opt,
            cache_policy,
            cache_strategy,
            remote,
        );

        match page_base.await {
            Ok(http_request) => {
                if let Some(http_method) = http_request.method.as_deref() {
                    method = http_method.into();
                }

                request_headers.clone_from(&http_request.headers);

                if let Some(response) = &http_request.response {
                    if let Some(p) = &response.protocol {
                        protocol.clone_from(p);
                    }

                    // NOTE(review): if the header values are JSON values,
                    // `to_string()` yields the JSON text (string values would
                    // keep their surrounding quotes) — confirm.
                    if let Some(res_headers) = response.headers.inner().as_object() {
                        for (k, v) in res_headers {
                            response_headers.insert(k.to_string(), v.to_string());
                        }
                    }

                    let mut firewall = false;

                    waf_check = detect_antibot_from_url(&response.url).is_some();

                    // Further inspection only happens when the response URL does
                    // not start with the requested `source`.
                    if !response.url.starts_with(source) {
                        match &response.security_details {
                            Some(security_details) => {
                                // An empty body is passed, so detection here relies
                                // on subject name, headers and URL.
                                anti_bot_tech = detect_anti_bot_tech_response(
                                    &response.url,
                                    &HeaderSource::Map(&response_headers),
                                    &Default::default(),
                                    Some(&security_details.subject_name),
                                );
                                firewall = true;
                            }
                            _ => {
                                anti_bot_tech = detect_anti_bot_tech_response(
                                    &response.url,
                                    &HeaderSource::Map(&response_headers),
                                    &Default::default(),
                                    None,
                                );
                                if anti_bot_tech == AntiBotTech::Cloudflare {
                                    // NOTE(review): this raw string literally contains
                                    // backslashes (`\"DENY\"`), since raw strings have
                                    // no escapes — confirm it matches the stored
                                    // header value format.
                                    if let Some(xframe_options) =
                                        response_headers.get("x-frame-options")
                                    {
                                        if xframe_options == r#"\"DENY\""# {
                                            firewall = true;
                                        }
                                    // NOTE(review): compares a response header named
                                    // `Accept-Encoding` to `cf-ray` — confirm this is
                                    // the intended check.
                                    } else if let Some(encoding) =
                                        response_headers.get("Accept-Encoding")
                                    {
                                        if encoding == r#"cf-ray"# {
                                            firewall = true;
                                        }
                                    }
                                } else {
                                    firewall = true;
                                }
                            }
                        };

                        // `&&` binds tighter than `||`: a URL-based hit alone is
                        // enough; otherwise both a firewall signal and a detected
                        // technology are required.
                        waf_check =
                            waf_check || firewall && !matches!(anti_bot_tech, AntiBotTech::None);

                        if !waf_check {
                            waf_check = match &response.protocol {
                                Some(protocol) => protocol == "blob",
                                _ => false,
                            }
                        }
                    }

                    // Status values that are not valid HTTP codes are reported as
                    // 417 Expectation Failed.
                    status_code = StatusCode::from_u16(response.status as u16)
                        .unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
                } else if let Some(failure_text) = &http_request.failure_text {
                    // No response at all: a generic failure is flagged as WAF.
                    if failure_text == "net::ERR_FAILED" {
                        waf_check = true;
                    }
                }
            }
            Err(e) => return Err(e),
        }

        Ok(ChromeHTTPReqRes {
            waf_check,
            status_code,
            method,
            response_headers,
            request_headers,
            protocol,
            anti_bot_tech,
        })
    }

    match attempt_once(page, source, referrer.clone(), cache_options, cache_policy).await {
        Ok(ok) => Ok(ok),
        Err(e) => {
            // Retry once on the other scheme when TLS negotiation failed.
            if is_cipher_mismatch(&e) {
                if let Some(flipped) = flip_http_https(source) {
                    return attempt_once(page, &flipped, referrer, cache_options, cache_policy)
                        .await;
                }
            }
            Err(e)
        }
    }
}
1079
/// No-op variant compiled when `chrome` is enabled without `openai`; all
/// arguments are ignored.
#[cfg(all(feature = "chrome", not(feature = "openai")))]
pub async fn run_openai_request(
    _source: &str,
    _page: &chromiumoxide::Page,
    _wait_for: &Option<crate::configuration::WaitFor>,
    _openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    _page_response: &mut PageResponse,
    _ok: bool,
) {
}
1091
/// Run the configured OpenAI prompts for `source` and apply their results to
/// `page` and `page_response`.
///
/// For each prompt: the model is queried (only when a model is configured and
/// `ok` is true), usage is recorded, any returned JavaScript is evaluated on
/// the page and the page content refreshed, and — when `extra_ai_data` is
/// enabled — the result (optionally with a screenshot) is appended to the
/// response. Does nothing when `openai_config` is `None` or no configuration
/// applies to `source`.
#[cfg(all(feature = "chrome", feature = "openai"))]
pub async fn run_openai_request(
    source: &str,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    mut page_response: &mut PageResponse,
    ok: bool,
) {
    if let Some(gpt_configs) = openai_config {
        // Resolve the configuration for this URL: with a prompt map, look up the
        // full URL first and, when `paths_map` is set, fall back to its path.
        // Without a prompt map the top-level configuration is used.
        let gpt_configs = match &gpt_configs.prompt_url_map {
            Some(h) => {
                let c = h.get::<case_insensitive_string::CaseInsensitiveString>(&source.into());

                if !c.is_some() && gpt_configs.paths_map {
                    h.get::<case_insensitive_string::CaseInsensitiveString>(
                        &get_path_from_url(&source).into(),
                    )
                } else {
                    c
                }
            }
            _ => Some(gpt_configs),
        };

        if let Some(gpt_configs) = gpt_configs {
            let mut prompts = gpt_configs.prompt.clone();

            while let Some(prompt) = prompts.next() {
                // Skip the remote call (default result) when no model is set or
                // the page was not fetched successfully.
                let gpt_results = if !gpt_configs.model.is_empty() && ok {
                    openai_request(
                        gpt_configs,
                        match page_response.content.as_ref() {
                            Some(html) => auto_encoder::auto_encode_bytes(html),
                            _ => Default::default(),
                        },
                        &source,
                        &prompt,
                    )
                    .await
                } else {
                    Default::default()
                };

                let js_script = gpt_results.response;
                let tokens_used = gpt_results.usage;
                let gpt_error = gpt_results.error;

                handle_openai_credits(&mut page_response, tokens_used);

                // With `extra_ai_data` the response is parsed as JSON; otherwise
                // the raw response is treated as the script itself.
                let json_res = if gpt_configs.extra_ai_data {
                    match handle_ai_data(&js_script) {
                        Some(jr) => jr,
                        _ => {
                            let mut jr = JsonResponse::default();
                            jr.error = Some("An issue occured with serialization.".into());

                            jr
                        }
                    }
                } else {
                    let mut x = JsonResponse::default();
                    x.js = js_script;
                    x
                };

                if !json_res.js.is_empty() {
                    // Run the script inside an async function and capture the
                    // resulting document HTML.
                    let html: Option<Box<Vec<u8>>> = match page
                        .evaluate_function(string_concat!(
                            "async function() { ",
                            json_res.js,
                            "; return document.documentElement.outerHTML; }"
                        ))
                        .await
                    {
                        Ok(h) => match h.into_value() {
                            Ok(hh) => Some(hh),
                            _ => None,
                        },
                        _ => None,
                    };

                    if html.is_some() {
                        page_wait(&page, &wait_for).await;
                        // Short scripts that touch `window.location` re-read the
                        // page HTML after waiting instead of using the captured
                        // value.
                        if json_res.js.len() <= 400 && json_res.js.contains("window.location") {
                            if let Ok(b) = page.outer_html_bytes().await {
                                page_response.content = Some(b.into());
                            }
                        } else {
                            page_response.content = html;
                        }
                    }
                }

                if gpt_configs.extra_ai_data {
                    // A screenshot is only taken when requested and a script ran.
                    let screenshot_bytes = if gpt_configs.screenshot && !json_res.js.is_empty() {
                        let format = chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png;

                        let screenshot_configs = chromiumoxide::page::ScreenshotParams::builder()
                            .format(format)
                            .full_page(true)
                            .quality(45)
                            .omit_background(false);

                        match page.screenshot(screenshot_configs.build()).await {
                            Ok(b) => {
                                log::debug!("took screenshot: {:?}", source);
                                Some(b)
                            }
                            Err(e) => {
                                log::error!("failed to take screenshot: {:?} - {:?}", e, source);
                                None
                            }
                        }
                    } else {
                        None
                    };

                    handle_extra_ai_data(
                        page_response,
                        &prompt,
                        json_res,
                        screenshot_bytes,
                        gpt_error,
                    );
                }
            }
        }
    }
}
1226
/// HTTP protocol version of a response.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum HttpVersion {
    /// HTTP/0.9
    Http09,
    /// HTTP/1.0
    Http10,
    /// HTTP/1.1
    Http11,
    /// HTTP/2
    H2,
    /// HTTP/3
    H3,
}
1242
/// Plain-data HTTP response: body, headers, status, URL and protocol version.
#[derive(Debug, Clone)]
pub struct HttpResponse {
    /// Response body bytes.
    pub body: Vec<u8>,
    /// Response headers as string pairs.
    pub headers: std::collections::HashMap<String, String>,
    /// Numeric HTTP status code.
    pub status: u16,
    /// URL of the response.
    pub url: url::Url,
    /// HTTP version of the response.
    pub version: HttpVersion,
}
1257
/// Minimal request description implementing `RequestLike` for cache-policy
/// evaluation.
#[cfg(feature = "cache_chrome_hybrid")]
pub struct HttpRequestLike {
    /// Request URI.
    pub uri: http::uri::Uri,
    /// Request method.
    pub method: reqwest::Method,
    /// Request headers.
    pub headers: http::HeaderMap,
}
1268
/// Minimal response description implementing `ResponseLike` for cache-policy
/// evaluation.
#[cfg(feature = "cache_chrome_hybrid")]
pub struct HttpResponseLike {
    /// Response status code.
    pub status: StatusCode,
    /// Response headers.
    pub headers: http::HeaderMap,
}
1277
#[cfg(feature = "cache_chrome_hybrid")]
impl RequestLike for HttpRequestLike {
    /// Owned copy of the request URI.
    fn uri(&self) -> http::uri::Uri {
        Clone::clone(&self.uri)
    }

    /// Whether `other` equals the request URI.
    fn is_same_uri(&self, other: &http::Uri) -> bool {
        self.uri == *other
    }

    /// Borrowed request method.
    fn method(&self) -> &reqwest::Method {
        let Self { method, .. } = self;
        method
    }

    /// Borrowed request headers.
    fn headers(&self) -> &http::HeaderMap {
        let Self { headers, .. } = self;
        headers
    }
}
1293
#[cfg(feature = "cache_chrome_hybrid")]
impl ResponseLike for HttpResponseLike {
    /// Copy of the response status code.
    fn status(&self) -> StatusCode {
        let Self { status, .. } = self;
        *status
    }

    /// Borrowed response headers.
    fn headers(&self) -> &http::HeaderMap {
        let Self { headers, .. } = self;
        headers
    }
}
1303
/// Convert a string-to-string header map into a `reqwest` `HeaderMap`.
///
/// Entries whose name or value is not a valid header name/value are skipped.
/// At most the first 1002 entries (in the map's iteration order) are
/// considered.
#[cfg(any(
    feature = "cache_chrome_hybrid",
    feature = "headers",
    feature = "cookies"
))]
pub fn convert_headers(
    headers: &std::collections::HashMap<String, String>,
) -> reqwest::header::HeaderMap {
    // The entry at index 1001 is the last one handled before stopping.
    const MAX_ENTRIES: usize = 1002;

    let mut converted = reqwest::header::HeaderMap::new();

    for (raw_name, raw_value) in headers.iter().take(MAX_ENTRIES) {
        let name = raw_name.parse::<reqwest::header::HeaderName>();
        let value = reqwest::header::HeaderValue::from_str(raw_value);

        if let (Ok(name), Ok(value)) = (name, value) {
            converted.insert(name, value);
        }
    }

    converted
}
1330
/// Store a response in the hybrid cache under `cache_key`.
///
/// A cache policy is computed from a request description (`method`,
/// `http_request_headers`, and the response URL used as the request URI) and a
/// response description (`http_response.status`, `http_response.headers`). The
/// response is then handed to `CACACHE_MANAGER` together with that policy.
///
/// Nothing is stored when the response URL cannot be parsed as an `http::uri::Uri`.
/// An unparsable `method` falls back to `GET`; an invalid status falls back to
/// `EXPECTATION_FAILED`. The result of the cache write is ignored.
#[cfg(feature = "cache_chrome_hybrid")]
pub async fn put_hybrid_cache(
    cache_key: &str,
    http_response: HttpResponse,
    method: &str,
    http_request_headers: std::collections::HashMap<String, String>,
) {
    use crate::http_cache_reqwest::CacheManager;
    use http_cache_semantics::CachePolicy;

    let Ok(uri) = http_response.url.as_str().parse::<http::uri::Uri>() else {
        return;
    };

    // The request side of the policy must see the *request* headers.
    let req = HttpRequestLike {
        uri,
        method: reqwest::Method::from_bytes(method.as_bytes()).unwrap_or(reqwest::Method::GET),
        headers: convert_headers(&http_request_headers),
    };

    // The response side of the policy must see the *response* headers.
    let res = HttpResponseLike {
        status: StatusCode::from_u16(http_response.status)
            .unwrap_or(StatusCode::EXPECTATION_FAILED),
        headers: convert_headers(&http_response.headers),
    };

    let policy = CachePolicy::new(&req, &res);

    // Translate the local version enum into the cache layer's equivalent.
    let version = match http_response.version {
        HttpVersion::H2 => http_cache::HttpVersion::H2,
        HttpVersion::Http10 => http_cache::HttpVersion::Http10,
        HttpVersion::H3 => http_cache::HttpVersion::H3,
        HttpVersion::Http09 => http_cache::HttpVersion::Http09,
        HttpVersion::Http11 => http_cache::HttpVersion::Http11,
    };

    // Best effort: a failed cache write is deliberately ignored.
    let _ = crate::website::CACACHE_MANAGER
        .put(
            cache_key.into(),
            http_cache_reqwest::HttpResponse {
                url: http_response.url,
                body: http_response.body,
                headers: http_response.headers,
                version,
                status: http_response.status,
            },
            policy,
        )
        .await;
}
1382
/// No-op stand-in used when the `cache_chrome_hybrid` feature is disabled.
/// All arguments are ignored.
#[cfg(not(feature = "cache_chrome_hybrid"))]
pub async fn put_hybrid_cache(
    _cache_key: &str,
    _http_response: HttpResponse,
    _method: &str,
    _http_request_headers: std::collections::HashMap<String, String>,
) {
}
1392
/// Time left of `base_timeout` after `elapsed` has passed.
///
/// Returns a zero duration when `elapsed` is larger than `base_timeout`.
#[cfg(feature = "chrome")]
fn sub_duration(
    base_timeout: std::time::Duration,
    elapsed: std::time::Duration,
) -> std::time::Duration {
    base_timeout.checked_sub(elapsed).unwrap_or_default()
}
1404
/// Perform the Chrome HTTP request for `url` and store its outcome in
/// `chrome_http_req_res`.
///
/// On error the error is returned and `chrome_http_req_res` is left unchanged.
#[cfg(feature = "chrome")]
async fn navigate(
    page: &chromiumoxide::Page,
    url: &str,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
) -> Result<(), chromiumoxide::error::CdpError> {
    let outcome = perform_chrome_http_request(page, url, referrer).await?;
    *chrome_http_req_res = outcome;
    Ok(())
}
1416
/// Cache-aware counterpart of `navigate`: performs the Chrome HTTP request through
/// `perform_chrome_http_request_cache` and stores its outcome in `chrome_http_req_res`.
///
/// On error the error is returned and `chrome_http_req_res` is left unchanged.
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
async fn navigate_cache(
    page: &chromiumoxide::Page,
    url: &str,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Result<(), chromiumoxide::error::CdpError> {
    let outcome =
        perform_chrome_http_request_cache(page, url, referrer, cache_options, cache_policy).await?;
    *chrome_http_req_res = outcome;
    Ok(())
}
1431
/// Move the mouse along a generated series of points inside the viewport, pausing
/// briefly at random between some of the moves.
///
/// When `viewport` is `None` an 800x600 area is assumed. Errors from individual
/// mouse moves are ignored.
#[cfg(all(feature = "real_browser", feature = "chrome"))]
pub async fn perform_smart_mouse_movement(
    page: &chromiumoxide::Page,
    viewport: &Option<crate::configuration::Viewport>,
) {
    use chromiumoxide::layout::Point;
    use fastrand::Rng;
    use spider_fingerprint::spoof_mouse_movement::GaussianMouse;
    use tokio::time::{sleep, Duration};

    let (width, height) = viewport
        .as_ref()
        .map_or((800.0, 600.0), |vp| (vp.width as f64, vp.height as f64));

    let mut rng = Rng::new();
    let path = GaussianMouse::generate_random_coordinates(width, height);

    for (x, y) in path {
        let _ = page.move_mouse(Point::new(x, y)).await;

        // Roughly three moves out of four continue without any pause.
        if rng.f32() >= 0.25 {
            continue;
        }

        // Mostly a short pause, occasionally a longer one (values in microseconds).
        let pause_range = if rng.f32() < 0.9 {
            300..=1200
        } else {
            2000..=8000
        };
        let pause_micros = rng.u64(pause_range);
        sleep(Duration::from_micros(pause_micros)).await;
    }
}
1464
/// No-op stand-in used when `chrome` is enabled without `real_browser`.
/// Both arguments are ignored.
#[cfg(all(not(feature = "real_browser"), feature = "chrome"))]
async fn perform_smart_mouse_movement(
    _page: &chromiumoxide::Page,
    _viewport: &Option<crate::configuration::Viewport>,
) {
}
1472
/// Cache a Chrome response in the hybrid cache (build with `cache_chrome_hybrid_mem`).
///
/// Builds an [`HttpResponse`] from `page_response.content` and the status, protocol
/// and response headers captured in `chrome_http_req_res`, derives the cache key from
/// `target_url`, the request method and the optional auth token carried by
/// `cache_options`, and forwards everything to `put_hybrid_cache`.
///
/// Does nothing when `target_url` is not a valid URL. An unrecognised protocol string
/// is recorded as HTTP/1.1.
///
/// `cache_options` is accepted with the same shape as in the other
/// `cache_chrome_response` variants, because the body needs it to pick the auth token.
#[cfg(all(
    feature = "chrome",
    feature = "cache_chrome_hybrid",
    feature = "cache_chrome_hybrid_mem"
))]
pub async fn cache_chrome_response(
    target_url: &str,
    page_response: &PageResponse,
    chrome_http_req_res: ChromeHTTPReqRes,
    cache_options: &Option<CacheOptions>,
) {
    if let Ok(u) = url::Url::parse(target_url) {
        let http_response = HttpResponse {
            url: u,
            body: match page_response.content.as_ref() {
                Some(b) => b.into(),
                _ => Default::default(),
            },
            status: chrome_http_req_res.status_code.into(),
            version: match chrome_http_req_res.protocol.as_str() {
                "http/0.9" => HttpVersion::Http09,
                "http/1" | "http/1.0" => HttpVersion::Http10,
                "http/1.1" => HttpVersion::Http11,
                "http/2.0" | "http/2" => HttpVersion::H2,
                "http/3.0" | "http/3" => HttpVersion::H3,
                _ => HttpVersion::Http11,
            },
            headers: chrome_http_req_res.response_headers,
        };

        // Only the authorized cache modes contribute a token to the cache key.
        let auth_opt = match cache_options {
            Some(CacheOptions::Yes) | Some(CacheOptions::SkipBrowser) => None,
            Some(CacheOptions::Authorized(token))
            | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
            Some(CacheOptions::No) | None => None,
        };
        let cache_key = create_cache_key_raw(
            target_url,
            Some(&chrome_http_req_res.method),
            auth_opt.as_deref().map(|x| x.as_str()),
        );

        put_hybrid_cache(
            &cache_key,
            http_response,
            &chrome_http_req_res.method,
            chrome_http_req_res.request_headers,
        )
        .await;
    }
}
1523
/// Cache a Chrome response in the hybrid cache (build without `cache_chrome_hybrid_mem`).
///
/// Assembles an [`HttpResponse`] from the page content and the captured status,
/// protocol and response headers, computes the cache key from `target_url`, the
/// request method and the optional auth token in `cache_options`, then delegates to
/// `put_hybrid_cache`.
///
/// Does nothing when `target_url` is not a valid URL. An unrecognised protocol string
/// is recorded as HTTP/1.1.
#[cfg(all(
    feature = "chrome",
    feature = "cache_chrome_hybrid",
    not(feature = "cache_chrome_hybrid_mem")
))]
pub async fn cache_chrome_response(
    target_url: &str,
    page_response: &PageResponse,
    chrome_http_req_res: ChromeHTTPReqRes,
    cache_options: &Option<CacheOptions>,
) {
    let Ok(parsed_url) = url::Url::parse(target_url) else {
        return;
    };

    let version = match chrome_http_req_res.protocol.as_str() {
        "http/0.9" => HttpVersion::Http09,
        "http/1" | "http/1.0" => HttpVersion::Http10,
        "http/1.1" => HttpVersion::Http11,
        "http/2.0" | "http/2" => HttpVersion::H2,
        "http/3.0" | "http/3" => HttpVersion::H3,
        _ => HttpVersion::Http11,
    };

    // Missing content is stored as an empty body.
    let body = page_response
        .content
        .as_ref()
        .map(|b| b.to_vec())
        .unwrap_or_default();

    let http_response = HttpResponse {
        url: parsed_url,
        body,
        status: chrome_http_req_res.status_code.into(),
        version,
        headers: chrome_http_req_res.response_headers,
    };

    // Only the authorized cache modes contribute a token to the cache key.
    let auth_opt = match cache_options {
        Some(CacheOptions::Authorized(token))
        | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
        Some(CacheOptions::Yes)
        | Some(CacheOptions::SkipBrowser)
        | Some(CacheOptions::No)
        | None => None,
    };

    let cache_key = create_cache_key_raw(
        target_url,
        Some(&chrome_http_req_res.method),
        auth_opt.as_deref().map(|x| x.as_str()),
    );

    put_hybrid_cache(
        &cache_key,
        http_response,
        &chrome_http_req_res.method,
        chrome_http_req_res.request_headers,
    )
    .await;
}
1575
/// No-op stand-in used when `chrome` is enabled without `cache_chrome_hybrid`.
/// All arguments are ignored.
#[cfg(all(feature = "chrome", not(feature = "cache_chrome_hybrid")))]
pub async fn cache_chrome_response(
    _target_url: &str,
    _page_response: &PageResponse,
    _chrome_http_req_res: ChromeHTTPReqRes,
    _cache_options: &Option<CacheOptions>,
) {
}
1585
/// Five minutes expressed in milliseconds (it is fed to `Duration::from_millis` below).
pub(crate) const FIVE_MINUTES: u32 = 300_000;

/// [`FIVE_MINUTES`] as a `Duration`; used as the cap for the Chrome page timeout.
#[cfg(feature = "chrome")]
const MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64);
/// Half of [`FIVE_MINUTES`] as a `Duration`.
#[cfg(feature = "chrome")]
const HALF_MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64 / 2);
1597
/// Copy the headers carried by `page_response` into
/// `chrome_http_req_res.response_headers`.
///
/// Leaves `chrome_http_req_res` untouched when the page response has no headers.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn store_headers(page_response: &PageResponse, chrome_http_req_res: &mut ChromeHTTPReqRes) {
    let Some(response_headers) = page_response.headers.as_ref() else {
        return;
    };

    chrome_http_req_res.response_headers =
        crate::utils::header_utils::header_map_to_hash_map(response_headers);
}
1606
/// No-op stand-in used when `chrome` is enabled without `headers`.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn store_headers(_page_response: &PageResponse, _chrome_http_req_res: &mut ChromeHTTPReqRes) {}
1610
/// Convert an `f64` to `u64`, discarding the fractional part.
///
/// NaN, both infinities, zero and negative values map to `0`. Finite values at or
/// above `u64::MAX` map to `u64::MAX`.
#[inline]
#[cfg(feature = "chrome")]
fn f64_to_u64_floor(x: f64) -> u64 {
    match x {
        v if !v.is_finite() || v <= 0.0 => 0,
        v if v >= u64::MAX as f64 => u64::MAX,
        v => v as u64,
    }
}
1623
/// Cache a response whose body was supplied separately as raw bytes.
///
/// Assembles an [`HttpResponse`] from `body` and the status, protocol and response
/// headers in `chrome_http_req_res`, computes the cache key from `target_url`, the
/// request method and the optional auth token in `cache_options`, then delegates to
/// `put_hybrid_cache`.
///
/// Does nothing when `target_url` is not a valid URL. An unrecognised protocol string
/// is recorded as HTTP/1.1.
#[cfg(all(feature = "chrome", feature = "cache_request"))]
async fn cache_chrome_response_from_cdp_body(
    target_url: &str,
    body: &[u8],
    chrome_http_req_res: &ChromeHTTPReqRes,
    cache_options: &Option<CacheOptions>,
) {
    use crate::utils::create_cache_key_raw;

    let Ok(parsed_url) = url::Url::parse(target_url) else {
        return;
    };

    let version = match chrome_http_req_res.protocol.as_str() {
        "http/0.9" => HttpVersion::Http09,
        "http/1" | "http/1.0" => HttpVersion::Http10,
        "http/1.1" => HttpVersion::Http11,
        "http/2.0" | "http/2" => HttpVersion::H2,
        "http/3.0" | "http/3" => HttpVersion::H3,
        _ => HttpVersion::Http11,
    };

    let http_response = HttpResponse {
        url: parsed_url,
        body: body.to_vec(),
        status: chrome_http_req_res.status_code.into(),
        version,
        headers: chrome_http_req_res.response_headers.clone(),
    };

    // Only the authorized cache modes contribute a token to the cache key.
    let auth_opt = match cache_options {
        Some(CacheOptions::Authorized(token))
        | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
        Some(CacheOptions::Yes)
        | Some(CacheOptions::SkipBrowser)
        | Some(CacheOptions::No)
        | None => None,
    };

    let cache_key = create_cache_key_raw(
        target_url,
        Some(&chrome_http_req_res.method),
        auth_opt.as_deref().map(|x| x.as_str()),
    );

    put_hybrid_cache(
        &cache_key,
        http_response,
        &chrome_http_req_res.method,
        chrome_http_req_res.request_headers.clone(),
    )
    .await;
}
1671
/// Per-response record collected from network "response received" events.
#[derive(Debug, Clone, Default)]
#[cfg(feature = "chrome")]
struct ResponseMap {
    /// URL of the response.
    url: String,
    /// Whether this response is flagged as skipped by the collector.
    skipped: bool,
    /// Byte count recorded for the response (taken from the event's encoded data length).
    bytes_transferred: f64,
}
1683
/// Aggregate produced by the "response received" listener of the Chrome fetch routine.
#[derive(Debug, Clone, Default)]
#[cfg(feature = "chrome")]
struct ResponseBase {
    /// Per-request response records keyed by request id; `None` when responses are not tracked.
    response_map: Option<hashbrown::HashMap<String, ResponseMap>>,
    /// Headers of the first non-redirect document response seen, if any.
    headers: Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    /// Status code of the first non-redirect document response seen, if any.
    status_code: Option<i64>,
    /// True when that document response was reported as served from the disk cache,
    /// the prefetch cache or a service worker.
    #[cfg(feature = "cache_request")]
    main_doc_from_cache: bool,
}
1697
/// Pick the string to show in log messages: `url_target` when present, otherwise `source`.
#[cfg(feature = "chrome")]
#[inline]
fn log_target<'a>(source: &'a str, url_target: Option<&'a str>) -> &'a str {
    url_target.unwrap_or(source)
}
1704
/// Returns `true` when `e` is the `CdpError::Timeout` variant.
#[cfg(feature = "chrome")]
#[inline]
fn is_timeout(e: &chromiumoxide::error::CdpError) -> bool {
    matches!(e, chromiumoxide::error::CdpError::Timeout)
}
1711
/// Navigate `page` to `target_url` while answering the matching document request with
/// the supplied `html` instead of letting it reach the network.
///
/// The function listens for paused requests, starts the navigation, and fulfills the
/// first paused `Document` request whose URL starts with `target_url` with a `200`
/// response carrying `html` (base64 encoded) and headers derived from `resp_headers`.
///
/// * `block_bytes` is set to `true` when the navigation or the fulfill call fails with
///   `CdpError::Timeout`.
/// * `chrome_intercept`: when a configuration is present and its `enabled` flag is
///   `false`, this function enables fetch interception itself (document requests only)
///   and disables it again before returning. Otherwise it calls
///   `set_request_interception(true)` on the way out.
///
/// Returns the navigation or fulfill error if one occurs, `Ok(())` once the request was
/// fulfilled or the paused-event stream ended.
#[cfg(feature = "chrome")]
async fn goto_with_html_once(
    page: &chromiumoxide::Page,
    target_url: &str,
    html: &str,
    block_bytes: &mut bool,
    resp_headers: &Option<reqwest::header::HeaderMap<reqwest::header::HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) -> Result<(), chromiumoxide::error::CdpError> {
    use base64::Engine;
    use chromiumoxide::cdp::browser_protocol::fetch::{
        DisableParams, EnableParams, EventRequestPaused, FulfillRequestParams, RequestPattern,
        RequestStage,
    };
    use chromiumoxide::cdp::browser_protocol::network::ResourceType;
    use tokio_stream::StreamExt;

    // Subscribe before navigating so the paused request for the document is not missed.
    let mut paused = page.event_listener::<EventRequestPaused>().await?;

    let url_prefix = target_url.to_string();
    // Headers sent with the fulfilled response; defaults to an HTML content type.
    let fulfill_headers =
        chrome_fulfill_headers_from_reqwest(resp_headers.as_ref(), "text/html; charset=utf-8");

    // True only when an intercept configuration exists and reports `enabled == false`;
    // with no configuration at all this is `false`.
    let interception_required = chrome_intercept.map(|c| !c.enabled).unwrap_or(false);

    if interception_required {
        // Pause every document request at the request stage.
        page.execute(EnableParams {
            patterns: Some(vec![RequestPattern {
                url_pattern: Some("*".into()),
                resource_type: Some(ResourceType::Document),
                request_stage: Some(RequestStage::Request),
            }]),
            handle_auth_requests: Some(false),
        })
        .await?;
    }

    // Set once the navigation future has resolved; disables the `goto` branch afterwards.
    let mut did_goto = false;

    loop {
        tokio::select! {
            // Poll branches in written order: navigation first, paused events second.
            biased;
            // NOTE(review): the `page.goto(..)` expression is evaluated each time the loop
            // re-enters `select!`; confirm that a `continue` taken before the navigation
            // resolves cannot issue the navigation a second time.
            res = page.goto(target_url), if !did_goto => {
                did_goto = true;
                if let Err(e) = res {
                    if matches!(e, chromiumoxide::error::CdpError::Timeout) {
                        *block_bytes = true;
                    }
                    // Undo / restore interception before surfacing the error.
                    if interception_required {
                        let _ = page.execute(DisableParams {}).await;
                    } else {
                        // presumably hands interception back to the page's own handling — TODO confirm
                        let _ = page.set_request_interception(true).await;
                    }
                    return Err(e);
                }
            }
            maybe_ev = paused.next() => {
                // Stream closed: nothing more to fulfill.
                let Some(ev) = maybe_ev else {
                    break;
                };

                // Only the document request for the target URL is answered here.
                if ev.resource_type != ResourceType::Document {
                    continue;
                }
                if !ev.request.url.starts_with(&url_prefix) {
                    continue;
                }

                let body_b64 = base64::engine::general_purpose::STANDARD.encode(html.as_bytes());

                let res = page.execute(FulfillRequestParams {
                    request_id: ev.request_id.clone(),
                    response_code: 200,
                    response_phrase: None,
                    response_headers: Some(fulfill_headers.clone()),
                    body: Some(chromiumoxide::Binary(body_b64)),
                    binary_response_headers: None,
                }).await;

                // Interception is restored whether or not the fulfill succeeded.
                if interception_required {
                    let _ = page.execute(DisableParams {}).await;
                } else {
                    let _ = page.set_request_interception(true).await;
                }

                match res {
                    Ok(_) => return Ok(()),
                    Err(e) => {
                        if matches!(e, chromiumoxide::error::CdpError::Timeout) {
                            *block_bytes = true;
                        }
                        return Err(e);
                    }
                }
            }
        }
    }

    // Reached only when the paused-event stream ended without a matching request.
    if interception_required {
        let _ = page.execute(DisableParams {}).await;
    } else {
        let _ = page.set_request_interception(true).await;
    }

    Ok(())
}
1819
/// Load `source` as the document for `url_target` through `goto_with_html_once`.
///
/// Does nothing when `url_target` is `None`. Any error from the underlying call is
/// discarded; a timeout is still reflected through `block_bytes` by the callee.
#[cfg(feature = "chrome")]
async fn set_document_content_if_requested(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    block_bytes: &mut bool,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) {
    let Some(target_url) = url_target else {
        return;
    };

    let _ = goto_with_html_once(
        page,
        target_url,
        source,
        block_bytes,
        resp_headers,
        chrome_intercept,
    )
    .await;
}
1842
/// Cache-aware variant of `set_document_content_if_requested`.
///
/// Sets the page's cache key, then concurrently spawns the cache listener, seeds the
/// cache for the target URL and (when `url_target` is present) loads `source` as the
/// document through `goto_with_html_once`. Errors from the individual steps are
/// discarded.
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
async fn set_document_content_if_requested_cached(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    block_bytes: &mut bool,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) {
    let auth_opt = cache_auth_token(cache_options);
    let cache_policy = cache_policy.as_ref().map(|f| f.from_basic());
    // No explicit cache strategy is passed to the listener.
    let cache_strategy = None;
    let remote = Some("true");
    // Falls back to an empty string when no target URL was given.
    let target_url = url_target.unwrap_or_default();
    let cache_site = chromiumoxide::cache::manager::site_key_for_target_url(&target_url, auth_opt);

    let _ = page
        .set_cache_key((Some(cache_site.clone()), cache_policy.clone()))
        .await;

    // Same content-loading step as the non-cached variant; skipped without a target URL.
    let cache_future = async {
        if let Some(target_url) = url_target {
            let _ = goto_with_html_once(
                page,
                target_url,
                source,
                block_bytes,
                &resp_headers,
                chrome_intercept,
            )
            .await;
        }
    };

    // Listener, cache seeding and content loading run concurrently.
    let (_, __, _cache_future) = tokio::join!(
        page.spawn_cache_listener(
            &cache_site,
            auth_opt.map(|f| f.into()),
            cache_strategy.clone(),
            remote.map(|f| f.into())
        ),
        page.seed_cache(&target_url, auth_opt, remote),
        cache_future
    );

    // NOTE(review): the return value is dropped without `.await`; if
    // `clear_local_cache` is async this call has no effect — confirm.
    let _ = page.clear_local_cache(&cache_site);
}
1893
/// Navigate to `source` via `navigate`, logging failures.
///
/// On error the failure is logged at info level (using `url_target` for display when
/// present), `block_bytes` is set to `true` if the error is a timeout, and the error is
/// returned to the caller.
#[cfg(feature = "chrome")]
async fn navigate_if_requested(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
) -> Result<(), chromiumoxide::error::CdpError> {
    navigate(page, source, chrome_http_req_res, referrer)
        .await
        .map_err(|err| {
            log::info!(
                "Navigation Error({:?}) - {:?}",
                err,
                log_target(source, url_target)
            );
            if is_timeout(&err) {
                *block_bytes = true;
            }
            err
        })
}
1916
/// Navigate to `source` via `navigate_cache`, logging failures.
///
/// On error the failure is logged at info level (using `url_target` for display when
/// present), `block_bytes` is set to `true` if the error is a timeout, and the error is
/// returned to the caller.
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
async fn navigate_if_requested_cache(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Result<(), chromiumoxide::error::CdpError> {
    let outcome = navigate_cache(
        page,
        source,
        chrome_http_req_res,
        referrer,
        cache_options,
        cache_policy,
    )
    .await;

    outcome.map_err(|err| {
        log::info!(
            "Navigation Error({:?}) - {:?}",
            err,
            log_target(source, url_target)
        );
        if is_timeout(&err) {
            *block_bytes = true;
        }
        err
    })
}
1951
/// Returns `true` only for `CacheOptions::Yes` and `CacheOptions::Authorized(_)`;
/// every other value, including `None`, yields `false`.
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
fn cache_enabled(cache_options: &Option<CacheOptions>) -> bool {
    matches!(
        cache_options,
        Some(CacheOptions::Yes | CacheOptions::Authorized(_))
    )
}
1960
/// Convert the optional crate-level cache policy into the `chromiumoxide` one,
/// falling back to `BasicCachePolicy::Normal` when none is configured.
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
fn chrome_cache_policy(
    cache_policy: &Option<BasicCachePolicy>,
) -> chromiumoxide::cache::BasicCachePolicy {
    match cache_policy {
        Some(policy) => policy.from_basic(),
        None => chromiumoxide::cache::BasicCachePolicy::Normal,
    }
}
1971
/// Either load `source` as page content or navigate to it (build without remote cache).
///
/// * `page_set == true`: nothing to do, returns `Ok(())`.
/// * `content == true`: `source` is HTML. Cloudflare turnstile markup is detected and
///   recorded in `chrome_http_req_res.anti_bot_tech`, the content is loaded for
///   `url_target`, and `Ok(())` is returned regardless of how loading went.
/// * otherwise: `source` is navigated to and the navigation result is returned.
///
/// `_cache_options` and `_cache_policy` are unused in this build.
#[cfg(all(feature = "chrome", not(feature = "chrome_remote_cache")))]
pub async fn run_navigate_or_content_set_core(
    page: &chromiumoxide::Page,
    page_set: bool,
    content: bool,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
    _cache_options: &Option<CacheOptions>,
    _cache_policy: &Option<BasicCachePolicy>,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) -> Result<(), chromiumoxide::error::CdpError> {
    if page_set {
        return Ok(());
    }

    if !content {
        // Plain navigation path.
        return navigate_if_requested(
            page,
            source,
            url_target,
            chrome_http_req_res,
            referrer,
            block_bytes,
        )
        .await;
    }

    // Content path: remember a detected Cloudflare challenge before loading the HTML.
    if crate::features::solvers::detect_cf_turnstyle(source.as_bytes()) {
        chrome_http_req_res.anti_bot_tech = AntiBotTech::Cloudflare;
    }

    set_document_content_if_requested(
        page,
        source,
        url_target,
        block_bytes,
        resp_headers,
        chrome_intercept,
    )
    .await;

    Ok(())
}
2024
/// Either load `source` as page content or navigate to it (build with remote cache).
///
/// * `page_set == true`: nothing to do, returns `Ok(())`.
/// * `content == true`: `source` is HTML. Cloudflare turnstile markup is detected and
///   recorded in `chrome_http_req_res.anti_bot_tech`, the content is loaded for
///   `url_target` (through the cached variant when `cache_enabled` says so), and
///   `Ok(())` is returned regardless of how loading went.
/// * otherwise: `source` is navigated to, through the cache-aware path when caching is
///   enabled, and the navigation result is returned.
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
pub async fn run_navigate_or_content_set_core(
    page: &chromiumoxide::Page,
    page_set: bool,
    content: bool,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) -> Result<(), chromiumoxide::error::CdpError> {
    if page_set {
        return Ok(());
    }

    let use_cache = cache_enabled(cache_options);

    match (content, use_cache) {
        (true, cached) => {
            // Remember a detected Cloudflare challenge before loading the HTML.
            if crate::features::solvers::detect_cf_turnstyle(source.as_bytes()) {
                chrome_http_req_res.anti_bot_tech = AntiBotTech::Cloudflare;
            }

            if cached {
                set_document_content_if_requested_cached(
                    page,
                    source,
                    url_target,
                    block_bytes,
                    cache_options,
                    cache_policy,
                    resp_headers,
                    chrome_intercept,
                )
                .await;
            } else {
                set_document_content_if_requested(
                    page,
                    source,
                    url_target,
                    block_bytes,
                    resp_headers,
                    chrome_intercept,
                )
                .await;
            }

            Ok(())
        }
        (false, true) => {
            navigate_if_requested_cache(
                page,
                source,
                url_target,
                chrome_http_req_res,
                referrer,
                block_bytes,
                cache_options,
                cache_policy,
            )
            .await
        }
        (false, false) => {
            navigate_if_requested(
                page,
                source,
                url_target,
                chrome_http_req_res,
                referrer,
                block_bytes,
            )
            .await
        }
    }
}
2108
/// Resolve the URL the page ended up on after navigation, waiting at most `base_timeout`.
///
/// Returns `None` when the wait times out, when the navigation response cannot be
/// obtained, when no redirect URL is found, or when the resolved URL is `about:blank`
/// or Chrome's error page.
#[cfg(feature = "chrome")]
pub async fn get_final_redirect(
    page: &chromiumoxide::Page,
    source: &str,
    base_timeout: Duration,
) -> Option<String> {
    let lookup = async {
        match page.wait_for_navigation_response().await {
            Ok(navigation) => get_last_redirect(&source, &navigation, &page).await,
            Err(_) => None,
        }
    };

    tokio::time::timeout(base_timeout, lookup)
        .await
        .ok()
        .flatten()
        // Placeholder and error pages are not meaningful redirect targets.
        .filter(|url| {
            !matches!(
                url.as_str(),
                "about:blank" | "chrome-error://chromewebdata/"
            )
        })
}
2137
/// Translate `reqwest` headers into the header entries used when fulfilling a paused
/// Chrome request.
///
/// * Headers named `content-length`, `transfer-encoding`, `connection`, `keep-alive`,
///   `proxy-connection`, `te`, `trailers` and `upgrade` are dropped (compared
///   case-insensitively).
/// * Values that are not valid visible ASCII are converted lossily from their bytes.
/// * A `Content-Type` of `default_content_type` is appended when none is present.
/// * `Cache-Control: no-store` is appended when no cache-control header is present.
///
/// With `headers == None` only the two defaults are returned.
#[cfg(feature = "chrome")]
pub fn chrome_fulfill_headers_from_reqwest(
    headers: Option<&reqwest::header::HeaderMap<reqwest::header::HeaderValue>>,
    default_content_type: &'static str,
) -> Vec<chromiumoxide::cdp::browser_protocol::fetch::HeaderEntry> {
    use chromiumoxide::cdp::browser_protocol::fetch::HeaderEntry;

    // Header names that are never forwarded to the fulfilled response.
    const DROPPED: [&str; 8] = [
        "content-length",
        "transfer-encoding",
        "connection",
        "keep-alive",
        "proxy-connection",
        "te",
        "trailers",
        "upgrade",
    ];

    let mut entries: Vec<HeaderEntry> = headers
        .into_iter()
        .flat_map(|map| map.iter())
        .filter(|(name, _)| {
            !DROPPED
                .iter()
                .any(|dropped| name.as_str().eq_ignore_ascii_case(dropped))
        })
        .map(|(name, value)| HeaderEntry {
            name: name.as_str().to_string(),
            value: value.to_str().map(str::to_string).unwrap_or_else(|_| {
                String::from_utf8_lossy(value.as_bytes()).into_owned()
            }),
        })
        .collect();

    let contains = |entries: &[HeaderEntry], wanted: &str| {
        entries
            .iter()
            .any(|entry| entry.name.eq_ignore_ascii_case(wanted))
    };

    if !contains(&entries, "content-type") {
        entries.push(HeaderEntry {
            name: "Content-Type".into(),
            value: default_content_type.into(),
        });
    }

    if !contains(&entries, "cache-control") {
        entries.push(HeaderEntry {
            name: "Cache-Control".into(),
            value: "no-store".into(),
        });
    }

    entries
}
2196
/// Encoded-data-length threshold: a response at or below this size can be flagged as
/// `skipped` by the response collector (together with its other conditions).
#[cfg(feature = "chrome")]
const SKIP_BYTES_AMOUNT: f64 = 17.0;
2200
2201#[cfg(feature = "chrome")]
2202pub async fn fetch_page_html_chrome_base(
2204 source: &str,
2205 page: &chromiumoxide::Page,
2206 content: bool,
2207 wait_for_navigation: bool,
2208 wait_for: &Option<crate::configuration::WaitFor>,
2209 screenshot: &Option<crate::configuration::ScreenShotConfig>,
2210 page_set: bool,
2211 openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
2212 url_target: Option<&str>,
2213 execution_scripts: &Option<ExecutionScripts>,
2214 automation_scripts: &Option<AutomationScripts>,
2215 viewport: &Option<crate::configuration::Viewport>,
2216 request_timeout: &Option<Box<Duration>>,
2217 track_events: &Option<crate::configuration::ChromeEventTracker>,
2218 referrer: Option<String>,
2219 max_page_bytes: Option<f64>,
2220 cache_options: Option<CacheOptions>,
2221 cache_policy: &Option<BasicCachePolicy>,
2222 resp_headers: &Option<HeaderMap<HeaderValue>>,
2223 chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
2224 jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
2225 remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
2226) -> Result<PageResponse, chromiumoxide::error::CdpError> {
2227 use crate::page::{is_asset_url, DOWNLOADABLE_MEDIA_TYPES, UNKNOWN_STATUS_ERROR};
2228 use chromiumoxide::{
2229 cdp::browser_protocol::network::{
2230 EventDataReceived, EventLoadingFailed, EventRequestWillBeSent, EventResponseReceived,
2231 GetResponseBodyParams, RequestId, ResourceType,
2232 },
2233 error::CdpError,
2234 };
2235 use tokio::{
2236 sync::{oneshot, OnceCell},
2237 time::Instant,
2238 };
2239
2240 let duration = if cfg!(feature = "time") {
2241 Some(tokio::time::Instant::now())
2242 } else {
2243 None
2244 };
2245
2246 let mut chrome_http_req_res = ChromeHTTPReqRes::default();
2247 let mut metadata: Option<Vec<crate::page::AutomationResults>> = None;
2248 let mut block_bytes = false;
2249
2250 let mut base_timeout = match request_timeout {
2252 Some(timeout) => **timeout.min(&Box::new(MAX_PAGE_TIMEOUT)),
2253 _ => MAX_PAGE_TIMEOUT,
2254 };
2255
2256 let base_timeout_measurement = base_timeout;
2258 let target_url = url_target.unwrap_or(source);
2259 let asset = is_asset_url(target_url);
2260
2261 let (tx1, rx1) = if asset {
2262 let c = oneshot::channel::<Option<RequestId>>();
2263
2264 (Some(c.0), Some(c.1))
2265 } else {
2266 (None, None)
2267 };
2268
2269 let should_block = max_page_bytes.is_some();
2270
2271 let (track_requests, track_responses, track_automation) = match track_events {
2272 Some(tracker) => (tracker.requests, tracker.responses, tracker.automation),
2273 _ => (false, false, false),
2274 };
2275
2276 let (
2277 event_loading_listener,
2278 cancel_listener,
2279 received_listener,
2280 event_sent_listener,
2281 event_data_received,
2282 ) = tokio::join!(
2283 page.event_listener::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
2284 ),
2285 page.event_listener::<EventLoadingFailed>(),
2286 page.event_listener::<EventResponseReceived>(),
2287 async {
2288 if track_requests {
2289 page.event_listener::<EventRequestWillBeSent>().await
2290 } else {
2291 Err(CdpError::NotFound)
2292 }
2293 },
2294 async {
2295 if should_block {
2296 page.event_listener::<EventDataReceived>().await
2297 } else {
2298 Err(CdpError::NotFound)
2299 }
2300 }
2301 );
2302
2303 #[cfg(feature = "cache_request")]
2304 let cache_request = match cache_options {
2305 Some(CacheOptions::No) => false,
2306 _ => true,
2307 };
2308
2309 let (tx, rx) = oneshot::channel::<bool>();
2310
2311 #[cfg(feature = "cache_request")]
2312 let (main_tx, main_rx) = if cache_request {
2313 let c = oneshot::channel::<RequestId>();
2314 (Some(c.0), Some(c.1))
2315 } else {
2316 (None, None)
2317 };
2318
2319 let page_clone = if should_block {
2320 Some(page.clone())
2321 } else {
2322 None
2323 };
2324
2325 let html_source_size = source.len();
2326
2327 let bytes_collected_handle = tokio::spawn(async move {
2330 let finished_media: Option<OnceCell<RequestId>> =
2331 if asset { Some(OnceCell::new()) } else { None };
2332
2333 let f1 = async {
2334 let mut total = 0.0;
2335
2336 let mut response_map: Option<HashMap<String, f64>> = if track_responses {
2337 Some(HashMap::new())
2338 } else {
2339 None
2340 };
2341
2342 if let Ok(mut listener) = event_loading_listener {
2343 while let Some(event) = listener.next().await {
2344 total += event.encoded_data_length;
2345 if let Some(response_map) = response_map.as_mut() {
2346 response_map
2347 .entry(event.request_id.inner().clone())
2348 .and_modify(|e| *e += event.encoded_data_length)
2349 .or_insert(event.encoded_data_length);
2350 }
2351 if asset {
2352 if let Some(once) = &finished_media {
2353 if let Some(request_id) = once.get() {
2354 if request_id == &event.request_id {
2355 if let Some(tx1) = tx1 {
2356 let _ = tx1.send(Some(request_id.clone()));
2357 break;
2358 }
2359 }
2360 }
2361 }
2362 }
2363 }
2364 }
2365
2366 (total, response_map)
2367 };
2368
2369 let f2 = async {
2370 if let Ok(mut listener) = cancel_listener {
2371 let mut net_aborted = false;
2372
2373 while let Some(event) = listener.next().await {
2374 if event.r#type == ResourceType::Document
2375 && event.error_text == "net::ERR_ABORTED"
2376 && event.canceled.unwrap_or_default()
2377 {
2378 net_aborted = true;
2379 break;
2380 }
2381 }
2382
2383 if net_aborted {
2384 let _ = tx.send(true);
2385 }
2386 }
2387 };
2388
2389 let f3 = async {
2390 let mut response_map: Option<HashMap<String, ResponseMap>> = if track_responses {
2391 Some(HashMap::new())
2392 } else {
2393 None
2394 };
2395
2396 let mut status_code = None;
2397 let mut headers = None;
2398 #[cfg(feature = "cache_request")]
2399 let mut main_doc_request_id: Option<RequestId> = None;
2400 #[cfg(feature = "cache_request")]
2401 let mut main_doc_from_cache = false;
2402
2403 let persist_event = asset || track_responses;
2404
2405 if let Ok(mut listener) = received_listener {
2406 let mut initial_asset = false;
2407 let mut allow_download = false;
2408 let mut intial_request = false;
2409
2410 while let Some(event) = listener.next().await {
2411 let document = event.r#type == ResourceType::Document;
2412
2413 if !intial_request && document {
2414 let redirect = event.response.status >= 300 && event.response.status <= 399;
2416
2417 if !redirect {
2418 intial_request = true;
2419 status_code = Some(event.response.status);
2420 headers = Some(event.response.headers.clone());
2421 #[cfg(feature = "cache_request")]
2422 {
2423 main_doc_request_id = Some(event.request_id.clone());
2424 let from_disk = event.response.from_disk_cache.unwrap_or(false);
2426 let from_prefetch =
2427 event.response.from_prefetch_cache.unwrap_or(false);
2428 let from_sw = event.response.from_service_worker.unwrap_or(false);
2429 main_doc_from_cache = from_disk || from_prefetch || from_sw;
2430 }
2431
2432 if !persist_event {
2433 break;
2434 }
2435
2436 if content {
2437 if let Some(response_map) = response_map.as_mut() {
2438 response_map.insert(
2439 event.request_id.inner().clone(),
2440 ResponseMap {
2441 url: event.response.url.clone(),
2442 bytes_transferred: (html_source_size as f64)
2444 + event.response.encoded_data_length,
2445 skipped: true,
2446 },
2447 );
2448 continue;
2449 }
2450 }
2451 }
2452 }
2453 else if asset {
2455 if !initial_asset && document {
2456 allow_download =
2457 DOWNLOADABLE_MEDIA_TYPES.contains(&event.response.mime_type);
2458 }
2459 if event.r#type == ResourceType::Media && allow_download {
2460 if let Some(once) = &finished_media {
2461 let _ = once.set(event.request_id.clone());
2462 }
2463 }
2464 initial_asset = true;
2465 }
2466
2467 if let Some(response_map) = response_map.as_mut() {
2468 response_map.insert(
2469 event.request_id.inner().clone(),
2470 ResponseMap {
2471 url: event.response.url.clone(),
2472 bytes_transferred: event.response.encoded_data_length,
2473 skipped: *MASK_BYTES_INTERCEPTION
2474 && event.response.connection_id == 0.0
2475 && event.response.encoded_data_length <= SKIP_BYTES_AMOUNT,
2476 },
2477 );
2478 }
2479 }
2480 }
2481
2482 #[cfg(feature = "cache_request")]
2483 if let Some(request_id) = &main_doc_request_id {
2484 if let Some(tx) = main_tx {
2485 let _ = tx.send(request_id.clone());
2486 }
2487 }
2488
2489 ResponseBase {
2490 response_map,
2491 status_code,
2492 headers,
2493 #[cfg(feature = "cache_request")]
2494 main_doc_from_cache,
2495 }
2496 };
2497
2498 let f4 = async {
2499 let mut request_map: Option<HashMap<String, f64>> = if track_requests {
2500 Some(HashMap::new())
2501 } else {
2502 None
2503 };
2504
2505 if request_map.is_some() {
2506 if let Some(response_map) = request_map.as_mut() {
2507 if let Ok(mut listener) = event_sent_listener {
2508 while let Some(event) = listener.next().await {
2509 response_map
2510 .insert(event.request.url.clone(), *event.timestamp.inner());
2511 }
2512 }
2513 }
2514 }
2515
2516 request_map
2517 };
2518
2519 let f5 = async {
2520 if let Some(page_clone) = &page_clone {
2521 if let Ok(mut listener) = event_data_received {
2522 let mut total_bytes: u64 = 0;
2523 let total_max = f64_to_u64_floor(max_page_bytes.unwrap_or_default());
2524 while let Some(event) = listener.next().await {
2525 let encoded = event.encoded_data_length.max(0) as u64;
2526 total_bytes = total_bytes.saturating_add(encoded);
2527 if total_bytes > total_max {
2528 let _ = page_clone.force_stop_all().await;
2529 break;
2530 }
2531 }
2532 }
2533 }
2534 };
2535
2536 let (t1, _, res_map, req_map, __) = tokio::join!(f1, f2, f3, f4, f5);
2537
2538 (t1.0, t1.1, res_map, req_map)
2539 });
2540
2541 let page_navigation = async {
2542 run_navigate_or_content_set_core(
2543 page,
2544 page_set,
2545 content,
2546 source,
2547 url_target,
2548 &mut chrome_http_req_res,
2549 referrer,
2550 &mut block_bytes,
2551 &cache_options,
2552 &cache_policy,
2553 resp_headers,
2554 chrome_intercept,
2555 )
2556 .await
2557 };
2558
2559 let start_time = Instant::now();
2560
2561 let mut request_cancelled = false;
2562
2563 let page_navigate = async {
2564 if cfg!(feature = "real_browser") {
2565 let notify = tokio::sync::Notify::new();
2566
2567 let mouse_loop = async {
2568 let mut index = 0;
2569
2570 loop {
2571 tokio::select! {
2572 _ = notify.notified() => {
2573 break;
2574 }
2575 _ = perform_smart_mouse_movement(&page, &viewport) => {
2576 tokio::time::sleep(std::time::Duration::from_millis(WAIT_TIMEOUTS[index])).await;
2577 }
2578 }
2579
2580 index = (index + 1) % WAIT_TIMEOUTS.len();
2581 }
2582 };
2583
2584 let navigation_loop = async {
2585 let result = page_navigation.await;
2586 notify.notify_waiters();
2587 result
2588 };
2589
2590 let (result, _) = tokio::join!(navigation_loop, mouse_loop);
2591
2592 result
2593 } else {
2594 page_navigation.await
2595 }
2596 };
2597
2598 tokio::select! {
2599 v = tokio::time::timeout(base_timeout + Duration::from_millis(50), page_navigate) => {
2600 if v.is_err() {
2601 request_cancelled = true;
2602 }
2603 }
2604 v = rx => {
2605 if let Ok(v) = v {
2606 request_cancelled = !v;
2607 }
2608 }
2609 };
2610
2611 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2612
2613 let final_url = if wait_for_navigation && !request_cancelled && !block_bytes {
2615 let last_redirect = get_final_redirect(page, &source, base_timeout).await;
2616 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2617 last_redirect
2618 } else {
2619 None
2620 };
2621
2622 let chrome_http_req_res1 = if asset {
2623 Some(chrome_http_req_res.clone())
2624 } else {
2625 None
2626 };
2627
2628 let run_events = !base_timeout.is_zero()
2629 && !block_bytes
2630 && !request_cancelled
2631 && !(chrome_http_req_res.is_empty() && !content)
2632 && (!chrome_http_req_res.status_code.is_server_error()
2633 && !chrome_http_req_res.status_code.is_client_error()
2634 || chrome_http_req_res.status_code == *UNKNOWN_STATUS_ERROR
2635 || chrome_http_req_res.status_code == 404
2636 || chrome_http_req_res.status_code == 403
2637 || chrome_http_req_res.status_code == 524
2638 || chrome_http_req_res.status_code.is_redirection()
2639 || chrome_http_req_res.status_code.is_success());
2640
2641 block_bytes = chrome_http_req_res.status_code == StatusCode::REQUEST_TIMEOUT;
2642
2643 let waf_check = chrome_http_req_res.waf_check;
2644 let mut status_code = chrome_http_req_res.status_code;
2645 let mut anti_bot_tech = chrome_http_req_res.anti_bot_tech;
2646 let mut validate_cf = false;
2647
2648 let run_page_response = async move {
2649 let mut page_response = if run_events {
2650 if waf_check {
2651 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2652 if let Err(elasped) = tokio::time::timeout(
2653 base_timeout,
2654 perform_smart_mouse_movement(&page, &viewport),
2655 )
2656 .await
2657 {
2658 log::warn!("mouse movement timeout exceeded {elasped}");
2659 }
2660 }
2661
2662 if wait_for.is_some() {
2663 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2664 if let Err(elasped) =
2665 tokio::time::timeout(base_timeout, page_wait(&page, &wait_for)).await
2666 {
2667 log::warn!("max wait for timeout {elasped}");
2668 }
2669 }
2670
2671 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2672
2673 if execution_scripts.is_some() || automation_scripts.is_some() {
2674 let target_url = final_url
2675 .as_deref()
2676 .or(url_target)
2677 .unwrap_or(source)
2678 .to_string();
2679
2680 if let Err(elasped) = tokio::time::timeout(base_timeout, async {
2681 let mut _metadata = Vec::new();
2682
2683 if track_automation {
2684 tokio::join!(
2685 crate::features::chrome_common::eval_execution_scripts(
2686 &page,
2687 &target_url,
2688 &execution_scripts
2689 ),
2690 crate::features::chrome_common::eval_automation_scripts_tracking(
2691 &page,
2692 &target_url,
2693 &automation_scripts,
2694 &mut _metadata
2695 )
2696 );
2697 metadata = Some(_metadata);
2698 } else {
2699 tokio::join!(
2700 crate::features::chrome_common::eval_execution_scripts(
2701 &page,
2702 &target_url,
2703 &execution_scripts
2704 ),
2705 crate::features::chrome_common::eval_automation_scripts(
2706 &page,
2707 &target_url,
2708 &automation_scripts
2709 )
2710 );
2711 }
2712 })
2713 .await
2714 {
2715 log::warn!("eval scripts timeout exceeded {elasped}");
2716 }
2717 }
2718
2719 let xml_target = match &final_url {
2720 Some(f) => f.ends_with(".xml"),
2721 _ => target_url.ends_with(".xml"),
2722 };
2723
2724 let page_fn = async {
2725 if !xml_target {
2726 return page.outer_html_bytes().await;
2727 }
2728 match page.content_bytes_xml().await {
2729 Ok(b) if !b.is_empty() => Ok(b),
2730 _ => page.outer_html_bytes().await,
2731 }
2732 };
2733
2734 let results = tokio::time::timeout(base_timeout.max(HALF_MAX_PAGE_TIMEOUT), page_fn);
2735
2736 let mut res: Box<Vec<u8>> = match results.await {
2737 Ok(v) => v.map(Box::new).unwrap_or_default(),
2738 _ => Default::default(),
2739 };
2740
2741 let forbidden = waf_check && res.starts_with(b"<html><head>\n <style global=") && res.ends_with(b";</script><iframe height=\"1\" width=\"1\" style=\"position: absolute; top: 0px; left: 0px; border: none; visibility: hidden;\"></iframe>\n\n</body></html>");
2742
2743 #[cfg(feature = "real_browser")]
2744 {
2745 if res.len() <= crate::page::TURNSTILE_WALL_PAGE_SIZE {
2747 if anti_bot_tech == AntiBotTech::Cloudflare || waf_check {
2748 if crate::features::solvers::detect_cf_turnstyle(&res) {
2749 if let Err(_e) = tokio::time::timeout(base_timeout, async {
2750 if let Ok(success) = crate::features::solvers::cf_handle(
2751 &mut res,
2752 &page,
2753 &target_url,
2754 &viewport,
2755 )
2756 .await
2757 {
2758 if success {
2759 status_code = StatusCode::OK;
2760 }
2761 }
2762 })
2763 .await
2764 {
2765 validate_cf = true;
2766 }
2767 }
2768 } else if anti_bot_tech == AntiBotTech::Imperva {
2769 if crate::features::solvers::looks_like_imperva_verify(res.len(), &*res) {
2770 if let Err(_e) = tokio::time::timeout(base_timeout, async {
2771 if let Ok(success) = crate::features::solvers::imperva_handle(
2772 &mut res,
2773 &page,
2774 &target_url,
2775 &viewport,
2776 )
2777 .await
2778 {
2779 if success {
2780 status_code = StatusCode::OK;
2781 }
2782 }
2783 })
2784 .await
2785 {
2786 validate_cf = true;
2787 }
2788 }
2789 } else if crate::features::solvers::detect_recaptcha(&res) {
2790 if let Err(_e) = tokio::time::timeout(base_timeout, async {
2791 if let Ok(solved) = crate::features::solvers::recaptcha_handle(
2792 &mut res, &page, &viewport,
2793 )
2794 .await
2795 {
2796 if solved {
2797 status_code = StatusCode::OK;
2798 }
2799 }
2800 })
2801 .await
2802 {
2803 validate_cf = true;
2804 }
2805 } else if crate::features::solvers::detect_geetest(&res) {
2806 if let Err(_e) = tokio::time::timeout(base_timeout, async {
2807 if let Ok(solved) =
2808 crate::features::solvers::geetest_handle(&mut res, &page, &viewport)
2809 .await
2810 {
2811 if solved {
2812 status_code = StatusCode::OK;
2813 }
2814 }
2815 })
2816 .await
2817 {
2818 validate_cf = true;
2819 }
2820 } else if crate::features::solvers::detect_lemin(&res) {
2821 if let Err(_e) = tokio::time::timeout(base_timeout, async {
2822 if let Ok(solved) =
2823 crate::features::solvers::lemin_handle(&mut res, &page, &viewport)
2824 .await
2825 {
2826 if solved {
2827 status_code = StatusCode::OK;
2828 }
2829 }
2830 })
2831 .await
2832 {
2833 validate_cf = true;
2834 }
2835 }
2836 }
2837 }
2838
2839 let ok = !res.is_empty();
2840
2841 #[cfg(feature = "real_browser")]
2842 if validate_cf && ok {
2843 if !crate::features::solvers::detect_cf_turnstyle(&res)
2844 && status_code == StatusCode::FORBIDDEN
2845 {
2846 status_code = StatusCode::OK;
2847 }
2848 }
2849
2850 let mut page_response = set_page_response(
2851 ok,
2852 res,
2853 if forbidden {
2854 StatusCode::FORBIDDEN
2855 } else {
2856 status_code
2857 },
2858 final_url,
2859 );
2860
2861 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2862
2863 let scope_url = if jar.is_some() {
2864 let scope_url = page_response
2865 .final_url
2866 .as_deref()
2867 .filter(|u| !u.is_empty())
2868 .or(url_target)
2869 .unwrap_or(source);
2870
2871 url::Url::parse(scope_url).ok()
2872 } else {
2873 None
2874 };
2875
2876 let _ = tokio::time::timeout(
2877 base_timeout,
2878 set_page_response_cookies(&mut page_response, &page, jar, scope_url.as_ref()),
2879 )
2880 .await;
2881
2882 if openai_config.is_some() && !base_timeout.is_zero() {
2883 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2884
2885 let openai_request = run_openai_request(
2886 match &url_target {
2887 Some(ut) => ut,
2888 _ => source,
2889 },
2890 page,
2891 wait_for,
2892 openai_config,
2893 &mut page_response,
2894 ok,
2895 );
2896
2897 let _ = tokio::time::timeout(base_timeout, openai_request).await;
2898 }
2899
2900 if remote_multimodal.is_some() && !base_timeout.is_zero() {
2901 use crate::features::automation::run_remote_multimodal_if_enabled;
2902
2903 base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2904
2905 let multi_modal_request = run_remote_multimodal_if_enabled(
2906 remote_multimodal,
2907 page,
2908 match &url_target {
2909 Some(ut) => ut,
2910 _ => source,
2911 },
2912 );
2913
2914 let multimodal_success =
2915 match tokio::time::timeout(base_timeout, multi_modal_request).await {
2916 Ok(Ok(Some(result))) => {
2917 let success = result.success;
2918
2919 match page_response.remote_multimodal_usage.as_mut() {
2921 Some(v) => v.push(result.usage.clone()),
2922 None => page_response.remote_multimodal_usage = Some(vec![result.usage.clone()]),
2923 }
2924
2925 if result.extracted.is_some() || result.screenshot.is_some() {
2927 let automation_result = result.to_automation_results();
2928 match page_response.extra_remote_multimodal_data.as_mut() {
2929 Some(v) => v.push(automation_result),
2930 None => page_response.extra_remote_multimodal_data = Some(vec![automation_result]),
2931 }
2932 }
2933
2934 success
2935 }
2936 Ok(Ok(None)) => false,
2937 Ok(Err(_e)) => false,
2938 Err(_elapsed) => false,
2939 };
2940
2941 if multimodal_success {
2942 let next_content = tokio::time::timeout(base_timeout, page.outer_html_bytes())
2943 .await
2944 .ok()
2945 .and_then(Result::ok)
2946 .filter(|b| !b.is_empty())
2947 .map(|b| Box::new(b.into()));
2948
2949 if next_content.is_some() {
2950 page_response.content = next_content;
2951 }
2952 }
2953 }
2954
2955 if cfg!(feature = "chrome_screenshot") || screenshot.is_some() {
2956 let _ = tokio::time::timeout(
2957 base_timeout + tokio::time::Duration::from_secs(30),
2958 perform_screenshot(source, page, screenshot, &mut page_response),
2959 )
2960 .await;
2961 }
2962
2963 if metadata.is_some() {
2964 let mut default_metadata = Metadata::default();
2965 default_metadata.automation = metadata;
2966 page_response.metadata = Some(Box::new(default_metadata));
2967 }
2968
2969 page_response
2970 } else {
2971 let res = if !block_bytes {
2972 let results = tokio::time::timeout(
2973 base_timeout.max(HALF_MAX_PAGE_TIMEOUT),
2974 page.outer_html_bytes(),
2975 );
2976
2977 match results.await {
2978 Ok(v) => v.map(Box::new).unwrap_or_default(),
2979 _ => Default::default(),
2980 }
2981 } else {
2982 Default::default()
2983 };
2984
2985 let mut page_response = set_page_response(!res.is_empty(), res, status_code, final_url);
2986
2987 if !block_bytes {
2988 let scope_url = if jar.is_some() {
2989 let scope_url = page_response
2990 .final_url
2991 .as_deref()
2992 .filter(|u| !u.is_empty())
2993 .or(url_target)
2994 .unwrap_or(source);
2995
2996 url::Url::parse(scope_url).ok()
2997 } else {
2998 None
2999 };
3000
3001 let _ = tokio::time::timeout(
3002 base_timeout,
3003 set_page_response_cookies(&mut page_response, &page, jar, scope_url.as_ref()),
3004 )
3005 .await;
3006 }
3007
3008 if base_timeout.is_zero() && page_response.content.is_none() {
3009 page_response.status_code = StatusCode::REQUEST_TIMEOUT;
3010 }
3011
3012 page_response
3013 };
3014
3015 if content {
3016 if let Some(final_url) = &page_response.final_url {
3017 if final_url.starts_with("about:blank") {
3018 page_response.final_url = None;
3019 }
3020 }
3021 }
3022
3023 page_response
3024 };
3025
3026 let mut content: Option<Box<Vec<u8>>> = None;
3027
3028 let page_response = match rx1 {
3029 Some(rx1) => {
3030 tokio::select! {
3031 v = tokio::time::timeout(base_timeout, run_page_response) => {
3032 v.map_err(|_| CdpError::Timeout)
3033 }
3034 c = rx1 => {
3035 if let Ok(c) = c {
3036 if let Some(c) = c {
3037 let params = GetResponseBodyParams::new(c);
3038
3039 if let Ok(command_response) = page.execute(params).await {
3040 let body_response = command_response;
3041
3042 let media_file = if body_response.base64_encoded {
3043 chromiumoxide::utils::base64::decode(
3044 &body_response.body,
3045 )
3046 .unwrap_or_default()
3047 } else {
3048 body_response.body.as_bytes().to_vec()
3049 };
3050
3051 if !media_file.is_empty() {
3052 content = Some(media_file.into());
3053 }
3054 }
3055 }
3056 }
3057
3058 let mut page_response = PageResponse::default();
3059
3060 let scope_url = if jar.is_some() {
3061 let scope_url = page_response
3062 .final_url
3063 .as_deref()
3064 .filter(|u| !u.is_empty())
3065 .or(url_target)
3066 .unwrap_or(source);
3067
3068 url::Url::parse(scope_url).ok()
3069 } else {
3070 None
3071 };
3072
3073 let _ = tokio::time::timeout(
3074 base_timeout,
3075 set_page_response_cookies(&mut page_response, &page, jar, scope_url.as_ref()),
3076 )
3077 .await;
3078
3079 if let Some(mut chrome_http_req_res1) = chrome_http_req_res1 {
3080 set_page_response_headers(&mut chrome_http_req_res1, &mut page_response);
3081
3082 page_response.status_code = chrome_http_req_res1.status_code;
3083 page_response.waf_check = chrome_http_req_res1.waf_check;
3084
3085 #[cfg(feature = "cache_request")]
3086 if !page_set && cache_request {
3087 let _ = tokio::time::timeout(
3088 base_timeout,
3089 cache_chrome_response(&source, &page_response, chrome_http_req_res1, &cache_options),
3090 )
3091 .await;
3092 }
3093
3094 }
3095
3096 Ok(page_response)
3097 }
3098 }
3099 }
3100 _ => Ok(run_page_response.await),
3101 };
3102
3103 let mut page_response = page_response.unwrap_or_default();
3104
3105 set_page_response_headers(&mut chrome_http_req_res, &mut page_response);
3106 page_response.status_code = chrome_http_req_res.status_code;
3107 page_response.waf_check = chrome_http_req_res.waf_check;
3108 page_response.content = match content {
3109 Some(c) if !c.is_empty() => Some(c.into()),
3110 _ => {
3111 let needs_fill = page_response
3112 .content
3113 .as_ref()
3114 .map_or(true, |b| b.is_empty());
3115
3116 if needs_fill {
3117 tokio::time::timeout(base_timeout, page.outer_html_bytes())
3118 .await
3119 .ok()
3120 .and_then(Result::ok)
3121 .filter(|b| !b.is_empty())
3122 .map(Into::into)
3123 } else {
3124 page_response.content
3125 }
3126 }
3127 };
3128 if page_response.status_code == *UNKNOWN_STATUS_ERROR && page_response.content.is_some() {
3129 page_response.status_code = StatusCode::OK;
3130 }
3131
3132 #[cfg(feature = "cache_request")]
3140 let mut modified_cache = false;
3141
3142 #[cfg(feature = "cache_request")]
3143 if cache_request {
3144 if let Some(mut main_rx) = main_rx {
3145 if let Ok(doc_req_id) = &main_rx.try_recv() {
3146 let cache_url = match &page_response.final_url {
3147 Some(final_url) if !final_url.is_empty() => final_url.as_str(),
3148 _ => target_url,
3149 };
3150
3151 match page
3152 .execute(GetResponseBodyParams::new(doc_req_id.clone()))
3153 .await
3154 {
3155 Ok(body_result) => {
3156 let raw_body: Vec<u8> = if body_result.base64_encoded {
3157 chromiumoxide::utils::base64::decode(&body_result.body)
3158 .unwrap_or_default()
3159 } else {
3160 body_result.body.clone().into_bytes()
3161 };
3162
3163 if !raw_body.is_empty() {
3164 let _ = tokio::time::timeout(
3165 base_timeout,
3166 cache_chrome_response_from_cdp_body(
3167 cache_url,
3168 &raw_body,
3169 &chrome_http_req_res,
3170 &cache_options,
3171 ),
3172 )
3173 .await;
3174 modified_cache = true;
3175 }
3176 }
3177 Err(e) => {
3178 log::error!("{:?}", e)
3179 }
3180 }
3181 }
3182 }
3183 }
3184
3185 if cfg!(not(feature = "chrome_store_page")) {
3186 let _ = page
3187 .send_command(chromiumoxide::cdp::browser_protocol::page::CloseParams::default())
3188 .await;
3189
3190 if let Ok((mut transferred, bytes_map, mut rs, request_map)) = bytes_collected_handle.await
3191 {
3192 let response_map = rs.response_map;
3193
3194 if response_map.is_some() {
3195 let mut _response_map = HashMap::new();
3196
3197 if let Some(response_map) = response_map {
3198 if let Some(bytes_map) = bytes_map {
3199 let detect_anti_bots =
3200 response_map.len() <= 4 && anti_bot_tech == AntiBotTech::None;
3201
3202 for item in response_map {
3203 if detect_anti_bots && item.1.url.starts_with("/_Incapsula_Resource?") {
3204 anti_bot_tech = AntiBotTech::Imperva;
3205 }
3206
3207 let b = if item.1.skipped {
3208 0.0
3209 } else {
3210 match bytes_map.get(&item.0) {
3211 Some(f) => *f,
3212 _ => 0.0,
3213 }
3214 };
3215
3216 if item.1.skipped {
3217 transferred -= item.1.bytes_transferred;
3218 }
3219
3220 _response_map.insert(item.1.url, b);
3221 }
3222 }
3223 }
3224
3225 page_response.response_map = Some(_response_map);
3226
3227 if let Some(status) = rs
3228 .status_code
3229 .and_then(|s| s.try_into().ok())
3230 .and_then(|u: u16| StatusCode::from_u16(u).ok())
3231 {
3232 page_response.status_code = status;
3233 }
3234
3235 set_page_response_headers_raw(&mut rs.headers, &mut page_response);
3236 store_headers(&page_response, &mut chrome_http_req_res);
3237
3238 if anti_bot_tech == AntiBotTech::None {
3239 let final_url = match &page_response.final_url {
3240 Some(final_url)
3241 if !final_url.is_empty()
3242 && !final_url.starts_with("about:blank")
3243 && !final_url.starts_with("chrome-error://chromewebdata") =>
3244 {
3245 final_url
3246 }
3247 _ => target_url,
3248 };
3249 if let Some(h) = &page_response.headers {
3250 if let Some(content) = &page_response.content {
3251 anti_bot_tech = detect_anti_bot_tech_response(
3252 &final_url,
3253 &HeaderSource::HeaderMap(h),
3254 &content,
3255 None,
3256 );
3257 }
3258 }
3259 }
3260
3261 #[cfg(feature = "real_browser")]
3262 if let Some(content) = &page_response.content {
3263 if anti_bot_tech == AntiBotTech::Cloudflare
3265 && page_response.status_code == StatusCode::FORBIDDEN
3266 {
3267 let cf_turnstile = crate::features::solvers::detect_cf_turnstyle(&content);
3268
3269 if !cf_turnstile {
3270 page_response.status_code = StatusCode::OK;
3271 }
3272 }
3273 }
3274 #[cfg(feature = "cache_request")]
3275 if cache_request && !page_set && !rs.main_doc_from_cache && !modified_cache {
3276 let _ = tokio::time::timeout(
3277 base_timeout,
3278 cache_chrome_response(
3279 &source,
3280 &page_response,
3281 chrome_http_req_res,
3282 &cache_options,
3283 ),
3284 )
3285 .await;
3286 }
3287 }
3288 if request_map.is_some() {
3289 page_response.request_map = request_map;
3290 }
3291
3292 page_response.bytes_transferred = Some(transferred);
3293 }
3294 }
3295
3296 page_response.anti_bot_tech = anti_bot_tech;
3297
3298 set_page_response_duration(&mut page_response, duration);
3299
3300 Ok(page_response)
3301}
3302
3303#[cfg(feature = "time")]
3304pub(crate) fn set_page_response_duration(
3306 page_response: &mut PageResponse,
3307 duration: Option<tokio::time::Instant>,
3308) {
3309 page_response.duration = duration;
3310}
3311
3312#[cfg(not(feature = "time"))]
3313pub(crate) fn set_page_response_duration(
3315 _page_response: &mut PageResponse,
3316 _duration: Option<tokio::time::Instant>,
3317) {
3318}
3319
/// Build a [`PageResponse`] from the bytes obtained for a Chrome page.
///
/// * `ok` - whether `res` should be kept; when `false` the content is `None`.
/// * `res` - the page bytes.
/// * `status_code` - status stored on the response.
/// * `final_url` - final URL stored on the response, if any.
///
/// Every other field keeps its `Default` value.
#[cfg(feature = "chrome")]
fn set_page_response(
    ok: bool,
    res: Box<Vec<u8>>,
    status_code: StatusCode,
    final_url: Option<String>,
) -> PageResponse {
    let mut page_response = PageResponse::default();

    page_response.content = ok.then(|| res.into());
    page_response.status_code = status_code;
    page_response.final_url = final_url;

    page_response
}
3335
/// Copy the response headers captured for the Chrome request onto the page
/// response.
///
/// With the `headers` feature enabled, `chrome_http_req_res.response_headers`
/// is passed through `convert_headers` and stored in `page_response.headers`
/// only when the converted map is non-empty; an empty result leaves the
/// existing value untouched. Without the feature the function does nothing.
#[cfg(feature = "chrome")]
fn set_page_response_headers(
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    page_response: &mut PageResponse,
) {
    #[cfg(feature = "headers")]
    {
        let converted = convert_headers(&chrome_http_req_res.response_headers);

        if !converted.is_empty() {
            page_response.headers = Some(converted);
        }
    }

    #[cfg(not(feature = "headers"))]
    {
        // Consume the arguments so they are not reported as unused.
        let _ = (chrome_http_req_res, page_response);
    }
}
3356
/// Populate `page_response.headers` from a raw CDP `Headers` object.
///
/// With the `headers` feature enabled, each key/value pair of the underlying
/// JSON object is converted to a `HeaderName`/`HeaderValue`. Pairs that fail
/// either conversion are skipped. `page_response.headers` is only replaced
/// when at least one header was converted. Without the feature the function
/// does nothing.
///
/// String values are inserted as their raw text. Formatting a JSON string
/// value with `to_string()` yields its JSON serialization, which would wrap
/// every header value in double quotes; that form is only used as a fallback
/// for non-string values.
#[cfg(feature = "chrome")]
fn set_page_response_headers_raw(
    chrome_http_req_res: &mut Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    page_response: &mut PageResponse,
) {
    #[cfg(feature = "headers")]
    {
        use std::str::FromStr;

        // Upper bound on the number of entries inspected (indices 0..=1001).
        const MAX_RAW_HEADERS: usize = 1002;

        let entries = match chrome_http_req_res
            .as_ref()
            .and_then(|headers| headers.inner().as_object())
        {
            Some(entries) => entries,
            None => return,
        };

        let mut header_map = reqwest::header::HeaderMap::new();

        for (key, value) in entries.iter().take(MAX_RAW_HEADERS) {
            // Keep the fallback serialization alive for the borrow below.
            let serialized;
            let text: &str = match value.as_str() {
                Some(raw) => raw,
                None => {
                    serialized = value.to_string();
                    &serialized
                }
            };

            if let (Ok(header_name), Ok(header_value)) = (
                reqwest::header::HeaderName::from_str(key),
                reqwest::header::HeaderValue::from_str(text),
            ) {
                header_map.insert(header_name, header_value);
            }
        }

        if !header_map.is_empty() {
            page_response.headers = Some(header_map);
        }
    }

    #[cfg(not(feature = "headers"))]
    {
        // Consume the arguments so they are not reported as unused.
        let _ = (chrome_http_req_res, page_response);
    }
}
3393
/// Read the cookies from `page` and record them on the page response.
///
/// Each cookie is stored by name in a map that is run through
/// `convert_headers`; a non-empty result is written to
/// `page_response.cookies`. When both `jar` and `scope_url` are provided,
/// every cookie is also added to the jar as `name=value; Path=/` scoped to
/// `scope_url`. If the cookies cannot be fetched nothing is changed.
#[cfg(all(feature = "chrome", feature = "cookies"))]
async fn set_page_response_cookies(
    page_response: &mut PageResponse,
    page: &chromiumoxide::Page,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    scope_url: Option<&url::Url>,
) {
    let cookies = match page.get_cookies().await {
        Ok(cookies) => cookies,
        Err(_) => return,
    };

    // The jar is only fed when there is both a jar and a URL to scope to.
    let jar_target = jar.zip(scope_url);

    let mut by_name: std::collections::HashMap<String, String> =
        std::collections::HashMap::new();

    for cookie in cookies {
        if let Some((jar, url)) = jar_target {
            jar.add_cookie_str(
                &format!("{}={}; Path=/", cookie.name, cookie.value),
                url,
            );
        }

        by_name.insert(cookie.name, cookie.value);
    }

    let converted = convert_headers(&by_name);

    if !converted.is_empty() {
        page_response.cookies = Some(converted);
    }
}
3421
/// Take a screenshot of `page`.
///
/// With a `ScreenShotConfig`, the capture runs through CDP using the
/// configured parameters (with `optimize_for_speed` forced on, and
/// `capture_beyond_viewport` enabled for full-page captures). The decoded
/// image is written to disk when `ss.save` is set and stored in
/// `page_response.screenshot_bytes` when `ss.bytes` is set. If
/// `omit_background` is requested, a fully transparent background override is
/// applied before the capture and cleared afterwards.
///
/// Without a config, a PNG is saved to the directory named by the
/// `SCREENSHOT_DIRECTORY` environment variable (default `./storage/`);
/// `SCREENSHOT_FULL_PAGE` and `SCREENSHOT_OMIT_BACKGROUND` default to `true`
/// and are only enabled otherwise when set to exactly `"true"`.
///
/// Failures are logged; nothing is returned to the caller.
#[cfg(feature = "chrome")]
pub async fn perform_screenshot(
    target_url: &str,
    page: &chromiumoxide::Page,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_response: &mut PageResponse,
) {
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;
    use chromiumoxide::cdp::browser_protocol::emulation::SetDefaultBackgroundColorOverrideParams;

    let ss = match screenshot {
        Some(ss) => ss,
        None => {
            // Environment-driven fallback: unset variables count as enabled.
            let env_flag = |key: &str| std::env::var(key).map_or(true, |v| v == "true");

            let output_path = create_output_path(
                &std::env::var("SCREENSHOT_DIRECTORY")
                    .unwrap_or_else(|_| "./storage/".to_string())
                    .into(),
                &target_url,
                &".png",
            )
            .await;

            let params = chromiumoxide::page::ScreenshotParams::builder()
                .format(chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png)
                .full_page(env_flag("SCREENSHOT_FULL_PAGE"))
                .omit_background(env_flag("SCREENSHOT_OMIT_BACKGROUND"))
                .build();

            match page.save_screenshot(params, &output_path).await {
                Ok(_) => log::debug!("saved screenshot: {:?}", output_path),
                Err(e) => log::error!("failed to save screenshot: {:?} - {:?}", e, output_path),
            };

            return;
        }
    };

    // File extension derived from the configured capture format (PNG by default).
    let output_format = string_concat!(
        ".",
        ss.params
            .cdp_params
            .format
            .as_ref()
            .unwrap_or_else(|| &crate::configuration::CaptureScreenshotFormat::Png)
            .to_string()
    );

    let configured = chromiumoxide::page::ScreenshotParams::from(ss.params.clone());
    let wants_full_page = configured.full_page.unwrap_or_default();
    let transparent = configured.omit_background.unwrap_or_default();

    let mut capture = configured.cdp_params;
    capture.optimize_for_speed = Some(true);
    if wants_full_page {
        capture.capture_beyond_viewport = Some(true);
    }

    if transparent {
        let _ = page
            .send_command(SetDefaultBackgroundColorOverrideParams {
                color: Some(chromiumoxide::cdp::browser_protocol::dom::Rgba {
                    r: 0,
                    g: 0,
                    b: 0,
                    a: Some(0.),
                }),
            })
            .await;
    }

    match page.execute(capture).await {
        Err(e) => {
            log::error!("failed to take screenshot: {:?} - {:?}", e, target_url)
        }
        Ok(captured) => {
            if let Ok(image) = STANDARD.decode(&captured.data) {
                if ss.save {
                    let output_path = create_output_path(
                        &ss.output_dir.clone().unwrap_or_else(|| "./storage/".into()),
                        &target_url,
                        &output_format,
                    )
                    .await;
                    let _ = tokio::fs::write(output_path, &image).await;
                }

                if ss.bytes {
                    page_response.screenshot_bytes = Some(image);
                }
            }
        }
    };

    // Clear the background override applied before the capture.
    if transparent {
        let _ = page
            .send_command(SetDefaultBackgroundColorOverrideParams { color: None })
            .await;
    }
}
3530
/// Resolve the URL the page ended up on after redirects.
///
/// When the request carries a redirect chain whose last entry has a URL, that
/// URL is returned if it differs from `target_url`, and `None` if it is the
/// same. In every other case the result is whatever `page.url()` reports
/// (`None` on error or when the page has no URL).
#[cfg(feature = "chrome")]
pub async fn get_last_redirect(
    target_url: &str,
    u: &Option<std::sync::Arc<chromiumoxide::handler::http::HttpRequest>>,
    page: &chromiumoxide::Page,
) -> Option<String> {
    let last_hop_url = u
        .as_ref()
        .and_then(|request| request.redirect_chain.last())
        .and_then(|hop| hop.url.as_ref());

    match last_hop_url {
        Some(url) if target_url != url => Some(url.clone()),
        Some(_) => None,
        None => page.url().await.ok()?,
    }
}
3551
3552#[cfg(feature = "cookies")]
3554pub fn get_cookies(res: &Response) -> Option<crate::client::header::HeaderMap> {
3555 use crate::client::header::{HeaderMap, HeaderName, HeaderValue};
3556
3557 let mut headers = HeaderMap::new();
3558
3559 for cookie in res.cookies() {
3560 if let Ok(h) = HeaderValue::from_str(cookie.value()) {
3561 if let Ok(n) = HeaderName::from_str(cookie.name()) {
3562 headers.insert(n, h);
3563 }
3564 }
3565 }
3566
3567 if !headers.is_empty() {
3568 Some(headers)
3569 } else {
3570 None
3571 }
3572}
3573
3574#[cfg(not(feature = "cookies"))]
3575pub fn get_cookies(_res: &Response) -> Option<crate::client::header::HeaderMap> {
3577 None
3578}
3579
3580pub(crate) fn block_streaming(res: &Response, only_html: bool) -> bool {
3582 let mut block_streaming = false;
3583
3584 if only_html {
3585 if let Some(content_type) = res.headers().get(crate::client::header::CONTENT_TYPE) {
3586 if let Ok(content_type_str) = content_type.to_str() {
3587 if IGNORE_CONTENT_TYPES.contains(content_type_str) {
3588 block_streaming = true;
3589 }
3590 }
3591 }
3592 }
3593
3594 block_streaming
3595}
3596
3597pub async fn handle_response_bytes(
3599 res: Response,
3600 target_url: &str,
3601 only_html: bool,
3602) -> PageResponse {
3603 let u = res.url().as_str();
3604
3605 let rd = if target_url != u {
3606 Some(u.into())
3607 } else {
3608 None
3609 };
3610
3611 let status_code: StatusCode = res.status();
3612 let headers = res.headers().clone();
3613 #[cfg(feature = "remote_addr")]
3614 let remote_addr = res.remote_addr();
3615 let cookies = get_cookies(&res);
3616
3617 let mut content: Option<Box<Vec<u8>>> = None;
3618 let mut anti_bot_tech = AntiBotTech::default();
3619
3620 let limit = *MAX_SIZE_BYTES;
3621
3622 if limit > 0 {
3623 let base = res
3624 .content_length()
3625 .and_then(|n| usize::try_from(n).ok())
3626 .unwrap_or(0);
3627
3628 let hdr = res
3629 .headers()
3630 .get(CONTENT_LENGTH)
3631 .and_then(|v| v.to_str().ok())
3632 .and_then(|s| s.parse::<usize>().ok())
3633 .unwrap_or(0);
3634
3635 let current_size = base + hdr.saturating_sub(base);
3636
3637 if current_size > limit {
3638 anti_bot_tech = detect_anti_bot_tech_response(
3639 target_url,
3640 &HeaderSource::HeaderMap(&headers),
3641 &Default::default(),
3642 None,
3643 );
3644 return PageResponse {
3645 headers: Some(headers),
3646 #[cfg(feature = "remote_addr")]
3647 remote_addr,
3648 #[cfg(feature = "cookies")]
3649 cookies,
3650 content: None,
3651 final_url: rd,
3652 status_code,
3653 anti_bot_tech,
3654 ..Default::default()
3655 };
3656 }
3657 }
3658
3659 if !block_streaming(&res, only_html) {
3660 let mut data = match res.content_length() {
3661 Some(cap) if cap >= MAX_PRE_ALLOCATED_HTML_PAGE_SIZE => {
3662 Vec::with_capacity(cap.max(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE) as usize)
3663 }
3664 _ => Vec::with_capacity(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE),
3665 };
3666 let mut stream = res.bytes_stream();
3667 let mut first_bytes = true;
3668
3669 while let Some(item) = stream.next().await {
3670 match item {
3671 Ok(text) => {
3672 if only_html && first_bytes {
3673 first_bytes = false;
3674 if is_binary_file(&text) {
3675 break;
3676 }
3677 }
3678
3679 if limit > 0 && data.len() + text.len() > limit {
3680 break;
3681 }
3682
3683 data.extend_from_slice(&text)
3684 }
3685 Err(e) => {
3686 log::error!("{e} in {}", target_url);
3687 break;
3688 }
3689 }
3690 }
3691
3692 anti_bot_tech = detect_anti_bot_tech_response(
3693 &target_url,
3694 &HeaderSource::HeaderMap(&headers),
3695 &data,
3696 None,
3697 );
3698 content.replace(Box::new(data.into()));
3699 }
3700
3701 PageResponse {
3702 headers: Some(headers),
3703 #[cfg(feature = "remote_addr")]
3704 remote_addr,
3705 #[cfg(feature = "cookies")]
3706 cookies,
3707 content,
3708 final_url: rd,
3709 status_code,
3710 anti_bot_tech,
3711 ..Default::default()
3712 }
3713}
3714
3715pub async fn handle_response_bytes_writer<'h, O>(
3717 res: Response,
3718 target_url: &str,
3719 only_html: bool,
3720 rewriter: &mut HtmlRewriter<'h, O>,
3721 collected_bytes: &mut Vec<u8>,
3722) -> (PageResponse, bool)
3723where
3724 O: OutputSink + Send + 'static,
3725{
3726 let u = res.url().as_str();
3727
3728 let final_url: Option<String> = if target_url != u {
3729 Some(u.into())
3730 } else {
3731 None
3732 };
3733
3734 let status_code: StatusCode = res.status();
3735 let headers = res.headers().clone();
3736 #[cfg(feature = "remote_addr")]
3737 let remote_addr = res.remote_addr();
3738 let cookies = get_cookies(&res);
3739 let mut anti_bot_tech = AntiBotTech::default();
3740
3741 let mut rewrite_error = false;
3742
3743 if !block_streaming(&res, only_html) {
3744 let mut stream = res.bytes_stream();
3745 let mut first_bytes = true;
3746 let mut data_len = 0;
3747
3748 while let Some(item) = stream.next().await {
3749 match item {
3750 Ok(res_bytes) => {
3751 if only_html && first_bytes {
3752 first_bytes = false;
3753 if is_binary_file(&res_bytes) {
3754 break;
3755 }
3756 }
3757 let limit = *MAX_SIZE_BYTES;
3758 let bytes_len = res_bytes.len();
3759
3760 if limit > 0 && data_len + bytes_len > limit {
3761 break;
3762 }
3763
3764 data_len += bytes_len;
3765
3766 if !rewrite_error {
3767 if rewriter.write(&res_bytes).is_err() {
3768 rewrite_error = true;
3769 }
3770 }
3771
3772 collected_bytes.extend_from_slice(&res_bytes);
3773 }
3774 Err(e) => {
3775 log::error!("{e} in {}", target_url);
3776 break;
3777 }
3778 }
3779 }
3780
3781 anti_bot_tech = detect_anti_bot_tech_response(
3782 &target_url,
3783 &HeaderSource::HeaderMap(&headers),
3784 &collected_bytes,
3785 None,
3786 );
3787 }
3788
3789 (
3790 PageResponse {
3791 #[cfg(feature = "headers")]
3792 headers: Some(headers),
3793 #[cfg(feature = "remote_addr")]
3794 remote_addr,
3795 #[cfg(feature = "cookies")]
3796 cookies,
3797 final_url,
3798 status_code,
3799 anti_bot_tech,
3800 ..Default::default()
3801 },
3802 rewrite_error,
3803 )
3804}
3805
3806pub(crate) fn valid_parsing_status(res: &Response) -> bool {
3808 res.status().is_success() || res.status() == 404
3809}
3810
3811fn build_error_page_response(target_url: &str, err: RequestError) -> PageResponse {
3813 log::info!("error fetching {}", target_url);
3814
3815 let mut page_response = PageResponse::default();
3816 if let Some(status_code) = err.status() {
3817 page_response.status_code = status_code;
3818 } else {
3819 page_response.status_code = crate::page::get_error_http_status_code(&err);
3820 }
3821 page_response.error_for_status = Some(Err(err));
3822 page_response
3823}
3824
3825fn error_chain_contains_handshake_failure(err: &RequestError) -> bool {
3827 if err.to_string().to_lowercase().contains("handshake failure") {
3828 return true;
3829 }
3830 let mut cur: Option<&(dyn std::error::Error + 'static)> = err.source();
3831
3832 while let Some(e) = cur {
3833 let s = e.to_string().to_lowercase();
3834 if s.contains("handshake failure") {
3835 return true;
3836 }
3837 cur = e.source();
3838 }
3839
3840 false
3841}
3842
3843async fn fetch_page_html_raw_base(
3845 target_url: &str,
3846 client: &Client,
3847 only_html: bool,
3848) -> PageResponse {
3849 async fn attempt_once(
3850 url: &str,
3851 client: &Client,
3852 only_html: bool,
3853 ) -> Result<PageResponse, RequestError> {
3854 let res = client.get(url).send().await?;
3855 Ok(handle_response_bytes(res, url, only_html).await)
3856 }
3857
3858 let duration = if cfg!(feature = "time") {
3859 Some(tokio::time::Instant::now())
3860 } else {
3861 None
3862 };
3863
3864 let mut page_response = match attempt_once(target_url, client, only_html).await {
3865 Ok(pr) => pr,
3866 Err(err) => {
3867 let should_retry = error_chain_contains_handshake_failure(&err);
3868 if should_retry {
3869 if let Some(flipped) = flip_http_https(target_url) {
3870 log::info!(
3871 "TLS handshake failure for {}; retrying with flipped scheme: {}",
3872 target_url,
3873 flipped
3874 );
3875 match attempt_once(&flipped, client, only_html).await {
3876 Ok(pr2) => pr2,
3877 Err(err2) => build_error_page_response(&flipped, err2),
3878 }
3879 } else {
3880 build_error_page_response(target_url, err)
3881 }
3882 } else {
3883 build_error_page_response(target_url, err)
3884 }
3885 }
3886 };
3887
3888 set_page_response_duration(&mut page_response, duration);
3889 page_response
3890}
3891
/// Perform a plain HTTP request for `target_url` and return the collected response.
///
/// Delegates to `fetch_page_html_raw_base` with `only_html` set to `false`.
pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageResponse {
    fetch_page_html_raw_base(target_url, client, false).await
}
3896
3897pub async fn fetch_page_html_raw_only_html(target_url: &str, client: &Client) -> PageResponse {
3899 fetch_page_html_raw_base(target_url, client, false).await
3900}
3901
#[cfg(feature = "decentralized")]
/// Fetch `target_url` and return the raw body bytes.
///
/// Returns `None` when the request fails, when the status is not accepted by
/// `valid_parsing_status`, or when the body cannot be read. Failures are logged.
pub async fn fetch_page(target_url: &str, client: &Client) -> Option<Vec<u8>> {
    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => match res.bytes().await {
            Ok(text) => Some(text.into()),
            Err(_) => {
                // The request succeeded but the body could not be read.
                log("- error parsing html bytes {}", &target_url);
                None
            }
        },
        Ok(_) => None,
        Err(_) => {
            // The request itself failed.
            log("- error fetching {}", &target_url);
            None
        }
    }
}
3920
#[cfg(all(feature = "decentralized", feature = "headers"))]
/// Outcome of [`fetch_page_and_headers`].
pub enum FetchPageResult {
    /// The status was accepted for parsing: response headers plus the body bytes
    /// (`None` when the body could not be read).
    Success(reqwest::header::HeaderMap, Option<Vec<u8>>),
    /// A response was received but its status was not accepted for parsing; only the
    /// headers are kept.
    NoSuccess(reqwest::header::HeaderMap),
    /// The request itself failed.
    FetchError,
}
3931
#[cfg(all(feature = "decentralized", feature = "headers"))]
/// Fetch `target_url` and return both the response headers and the body bytes.
///
/// - `Success(headers, body)` when the status passes `valid_parsing_status`; `body` is
///   `None` if reading it failed.
/// - `NoSuccess(headers)` for any other received response.
/// - `FetchError` when the request could not be sent or completed.
pub async fn fetch_page_and_headers(target_url: &str, client: &Client) -> FetchPageResult {
    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            let headers = res.headers().clone();
            // `FetchPageResult::Success` stores an owned `Vec<u8>`, so the streamed
            // bytes are converted here (same conversion `fetch_page` performs).
            let body: Option<Vec<u8>> = match res.bytes().await {
                Ok(text) => Some(text.into()),
                Err(_) => {
                    // The request succeeded but the body could not be read.
                    log("- error parsing html bytes {}", &target_url);
                    None
                }
            };
            FetchPageResult::Success(headers, body)
        }
        Ok(res) => FetchPageResult::NoSuccess(res.headers().clone()),
        Err(_) => {
            // The request itself failed.
            log("- error fetching {}", &target_url);
            FetchPageResult::FetchError
        }
    }
}
3954
#[cfg(all(not(feature = "fs"), not(feature = "chrome")))]
/// Fetch a page over plain HTTP.
///
/// In builds without the `fs` and `chrome` features this is a direct alias for
/// [`fetch_page_html_raw`].
pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
    fetch_page_html_raw(target_url, client).await
}
3960
#[cfg(all(feature = "fs", not(feature = "chrome")))]
/// Fetch a page over plain HTTP, spilling large bodies to a temporary file.
///
/// Chunks are accumulated in memory while the buffer's capacity is below 8192 bytes.
/// After that a file is created under `TMP_DIR` (named after the percent-encoded URL),
/// the buffered bytes are written to it and subsequent chunks are appended to the file.
/// When streaming ends the file is flushed, read back into memory as the response
/// content and removed.
///
/// Responses whose status is not accepted by `valid_parsing_status` are returned
/// without content; request errors are converted with `build_error_page_response`.
pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

    let duration = if cfg!(feature = "time") {
        Some(tokio::time::Instant::now())
    } else {
        None
    };

    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            let u = res.url().as_str();

            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            let status_code = res.status();
            let cookies = get_cookies(&res);
            #[cfg(feature = "headers")]
            let headers = res.headers().clone();
            #[cfg(feature = "remote_addr")]
            let remote_addr = res.remote_addr();
            let mut stream = res.bytes_stream();
            let mut data = Vec::new();
            let mut file: Option<tokio::fs::File> = None;
            let mut file_path = String::new();

            while let Some(item) = stream.next().await {
                match item {
                    Ok(text) => {
                        let wrote_disk = file.is_some();

                        if !wrote_disk && data.capacity() < 8192 {
                            // Small so far: keep building the resource in memory.
                            data.extend_from_slice(&text);
                        } else if !wrote_disk {
                            // First spill: create the temp file and move the buffer into it.
                            file_path = string_concat!(
                                TMP_DIR,
                                &utf8_percent_encode(target_url, NON_ALPHANUMERIC).to_string()
                            );
                            match tokio::fs::File::create(&file_path).await {
                                Ok(f) => {
                                    let file = file.insert(f);

                                    data.extend_from_slice(&text);

                                    if file.write_all(&data).await.is_ok() {
                                        data.clear();
                                    }
                                }
                                // Could not create the file: stay in memory.
                                _ => data.extend_from_slice(&text),
                            };
                        } else if let Some(f) = file.as_mut() {
                            if f.write_all(&text).await.is_err() {
                                data.extend_from_slice(&text)
                            }
                        }
                    }
                    Err(e) => {
                        log::error!("{e} in {}", target_url);
                        break;
                    }
                }
            }

            PageResponse {
                #[cfg(feature = "time")]
                duration,
                #[cfg(feature = "headers")]
                headers: Some(headers),
                #[cfg(feature = "remote_addr")]
                remote_addr,
                #[cfg(feature = "cookies")]
                cookies,
                content: Some(if let Some(mut spilled) = file.take() {
                    // tokio's `File` may still have a write in flight after `write_all`
                    // returns; flush so the read-back below sees every byte.
                    if let Err(e) = spilled.flush().await {
                        log::error!("{e} in {}", target_url);
                    }
                    drop(spilled);

                    let mut buffer = vec![];

                    if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
                        let _ = b.read_to_end(&mut buffer).await;
                    }

                    // Always clean up the temp file, even when reading it back failed.
                    let _ = tokio::fs::remove_file(&file_path).await;

                    Box::new(buffer.into())
                } else {
                    Box::new(data.into())
                }),
                status_code,
                final_url: rd,
                ..Default::default()
            }
        }
        Ok(res) => {
            let u = res.url().as_str();

            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            PageResponse {
                #[cfg(feature = "time")]
                duration,
                #[cfg(feature = "headers")]
                headers: Some(res.headers().clone()),
                #[cfg(feature = "remote_addr")]
                remote_addr: res.remote_addr(),
                #[cfg(feature = "cookies")]
                cookies: get_cookies(&res),
                status_code: res.status(),
                final_url: rd,
                ..Default::default()
            }
        }
        Err(err) => build_error_page_response(target_url, err),
    }
}
4101
#[cfg(all(feature = "fs", feature = "chrome"))]
/// Fetch a page with Chrome, falling back to a plain HTTP request that spills large
/// bodies to a temporary file.
///
/// A cached copy of the page is looked up first. When the cache options request
/// skipping the browser and a cached copy exists, it is returned directly with status
/// `200 OK`. Otherwise the page (or its cached HTML) is handed to
/// `fetch_page_html_chrome_base`. If that fails, the URL is requested over HTTP: chunks
/// are buffered in memory while the buffer capacity is below 8192 bytes and written to a
/// file under `TMP_DIR` afterwards; the file is flushed, read back and removed once
/// streaming ends.
///
/// NOTE(review): `jar` only exists with the `cookies` feature but is forwarded
/// unconditionally below, so this variant presumably requires `cookies` — confirm
/// against the signature of `fetch_page_html_chrome_base`.
pub async fn fetch_page_html(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    #[cfg(feature = "cookies")] jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
) -> PageResponse {
    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

    // Defined in every build (like the sibling fetchers) so the unconditional
    // `set_page_response_duration` call at the end always has a value to pass.
    let duration = if cfg!(feature = "time") {
        Some(tokio::time::Instant::now())
    } else {
        None
    };

    let skip_browser = cache_skip_browser(&cache_options);
    let cached_html = get_cached_url(target_url, cache_options.as_ref(), cache_policy).await;
    let cached = cached_html.is_some();

    if skip_browser {
        // Serve straight from the cache without touching the browser.
        if let Some(html) = cached_html {
            return PageResponse {
                content: Some(Box::new(html.into_bytes())),
                status_code: StatusCode::OK,
                final_url: Some(target_url.to_string()),
                #[cfg(feature = "time")]
                duration,
                ..Default::default()
            };
        }
    }

    let mut page_response = match fetch_page_html_chrome_base(
        if let Some(cached) = &cached_html {
            cached
        } else {
            target_url
        },
        page,
        cached,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        if cached { Some(target_url) } else { None },
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        &None,
        jar,
    )
    .await
    {
        Ok(page) => page,
        _ => {
            log::info!(
                "- error fetching chrome page defaulting to raw http request {}",
                &target_url,
            );

            match client.get(target_url).send().await {
                Ok(res) if valid_parsing_status(&res) => {
                    #[cfg(feature = "headers")]
                    let headers = res.headers().clone();
                    // Captured before `bytes_stream()` consumes the response.
                    #[cfg(feature = "remote_addr")]
                    let remote_addr = res.remote_addr();
                    let cookies = get_cookies(&res);
                    let status_code = res.status();
                    let mut stream = res.bytes_stream();
                    let mut data = Vec::new();

                    let mut file: Option<tokio::fs::File> = None;
                    let mut file_path = String::new();

                    while let Some(item) = stream.next().await {
                        match item {
                            Ok(text) => {
                                let wrote_disk = file.is_some();

                                if !wrote_disk && data.capacity() < 8192 {
                                    // Small so far: keep building the resource in memory.
                                    data.extend_from_slice(&text);
                                } else if !wrote_disk {
                                    // First spill: create the temp file and move the
                                    // buffer into it.
                                    file_path = string_concat!(
                                        TMP_DIR,
                                        &utf8_percent_encode(target_url, NON_ALPHANUMERIC)
                                            .to_string()
                                    );
                                    match tokio::fs::File::create(&file_path).await {
                                        Ok(f) => {
                                            let file = file.insert(f);

                                            data.extend_from_slice(&text);

                                            if file.write_all(&data).await.is_ok() {
                                                data.clear();
                                            }
                                        }
                                        // Could not create the file: stay in memory.
                                        _ => data.extend_from_slice(&text),
                                    };
                                } else if let Some(f) = file.as_mut() {
                                    // Keep the chunk in memory only when the disk
                                    // write failed.
                                    if f.write_all(&text).await.is_err() {
                                        data.extend_from_slice(&text)
                                    }
                                }
                            }
                            Err(e) => {
                                log::error!("{e} in {}", target_url);
                                break;
                            }
                        }
                    }

                    PageResponse {
                        #[cfg(feature = "headers")]
                        headers: Some(headers),
                        #[cfg(feature = "remote_addr")]
                        remote_addr,
                        #[cfg(feature = "cookies")]
                        cookies,
                        content: Some(if let Some(mut spilled) = file.take() {
                            // tokio's `File` may still have a write in flight after
                            // `write_all` returns; flush so the read-back sees every byte.
                            if let Err(e) = spilled.flush().await {
                                log::error!("{e} in {}", target_url);
                            }
                            drop(spilled);

                            let mut buffer = vec![];

                            if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
                                let _ = b.read_to_end(&mut buffer).await;
                            }

                            // Always clean up the temp file, even when reading it
                            // back failed.
                            let _ = tokio::fs::remove_file(&file_path).await;

                            Box::new(buffer.into())
                        } else {
                            Box::new(data.into())
                        }),
                        status_code,
                        ..Default::default()
                    }
                }

                Ok(res) => PageResponse {
                    #[cfg(feature = "headers")]
                    headers: Some(res.headers().clone()),
                    #[cfg(feature = "remote_addr")]
                    remote_addr: res.remote_addr(),
                    #[cfg(feature = "cookies")]
                    cookies: get_cookies(&res),
                    status_code: res.status(),
                    ..Default::default()
                },
                Err(err) => build_error_page_response(target_url, err),
            }
        }
    };

    set_page_response_duration(&mut page_response, duration);

    page_response
}
4302
4303#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Build the cache key for a request from its raw parts.
///
/// The key is `METHOD:URI`, or `METHOD:URI:AUTH` when an auth token is supplied.
/// `METHOD` defaults to `GET` when `override_method` is `None`.
pub fn create_cache_key_raw(
    uri: &str,
    override_method: Option<&str>,
    auth: Option<&str>,
) -> String {
    let method = override_method.unwrap_or("GET");

    match auth {
        Some(authentication) => format!("{method}:{uri}:{authentication}"),
        None => format!("{method}:{uri}"),
    }
}
4325
#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Build the cache key for a request from its `http` request parts.
///
/// Uses `override_method` when given, otherwise the method from `parts`, and delegates
/// to [`create_cache_key_raw`].
pub fn create_cache_key(
    parts: &http::request::Parts,
    override_method: Option<&str>,
    auth: Option<&str>,
) -> String {
    let method = match override_method {
        Some(method) => method,
        None => parts.method.as_str(),
    };
    let uri = parts.uri.to_string();

    create_cache_key_raw(&uri, Some(method), auth)
}
4339
/// Cache lookup options for a page fetch.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CacheOptions {
    /// Use the cache without an auth token.
    Yes,
    /// Use the cache; the token becomes part of the cache key.
    Authorized(String),
    /// Do not use the cache (the default).
    #[default]
    No,
    /// Use the cache without an auth token and skip the browser when a cached copy exists.
    SkipBrowser,
    /// Use the cache with an auth token and skip the browser when a cached copy exists.
    SkipBrowserAuthorized(String),
}
4356
4357#[inline]
4358pub fn cache_auth_token(cache_options: &std::option::Option<CacheOptions>) -> Option<&str> {
4360 cache_options.as_ref().and_then(|opt| match opt {
4361 CacheOptions::Authorized(token) | CacheOptions::SkipBrowserAuthorized(token) => {
4362 Some(token.as_str())
4363 }
4364 _ => None,
4365 })
4366}
4367
4368#[inline]
4369pub fn cache_skip_browser(cache_options: &std::option::Option<CacheOptions>) -> bool {
4371 matches!(
4372 cache_options,
4373 Some(CacheOptions::SkipBrowser) | Some(CacheOptions::SkipBrowserAuthorized(_))
4374 )
4375}
4376
/// Policy controlling how staleness of a cached entry is evaluated.
#[derive(Debug, Default, Clone, Hash, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BasicCachePolicy {
    /// Accept cached entries even when they are stale.
    AllowStale,
    /// Evaluate staleness as of the given point in time instead of the current time.
    Period(std::time::SystemTime),
    /// Evaluate staleness against the current time (the default).
    #[default]
    Normal,
}
4389
#[cfg(feature = "chrome_remote_cache")]
impl BasicCachePolicy {
    /// Convert this policy into the equivalent `chromiumoxide` cache policy.
    pub fn from_basic(&self) -> chromiumoxide::cache::BasicCachePolicy {
        use chromiumoxide::cache::BasicCachePolicy as RemotePolicy;

        match self {
            Self::AllowStale => RemotePolicy::AllowStale,
            Self::Period(at) => RemotePolicy::Period(*at),
            Self::Normal => RemotePolicy::Normal,
        }
    }
}
4401
#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Look up the cached body for exactly `target_url`.
///
/// Returns `None` when caching is disabled, the lookup does not finish within 60 ms,
/// no entry exists, the entry is stale (unless the policy allows stale entries) or the
/// stored body is binary. Otherwise the body is decoded to a `String`, using the stored
/// `accept-language` header as an encoding hint when it is present and non-empty.
pub async fn get_cached_url_base(
    target_url: &str,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Option<String> {
    use crate::http_cache_reqwest::CacheManager;

    let auth_opt = match cache_options {
        None | Some(CacheOptions::No) => return None,
        Some(CacheOptions::Yes) | Some(CacheOptions::SkipBrowser) => None,
        Some(CacheOptions::Authorized(token))
        | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
    };

    // Policy overrides:
    // - AllowStale: accept even stale entries.
    // - Period(t): use `t` as "now" for the staleness check.
    // - Normal / None: use the current system time.
    let (allow_stale, now) = match cache_policy {
        Some(BasicCachePolicy::AllowStale) => (true, std::time::SystemTime::now()),
        Some(BasicCachePolicy::Period(at)) => (false, *at),
        _ => (false, std::time::SystemTime::now()),
    };

    let cache_url = create_cache_key_raw(target_url, None, auth_opt.as_deref());

    let lookup = tokio::time::timeout(Duration::from_millis(60), async {
        crate::website::CACACHE_MANAGER.get(&cache_url).await
    })
    .await;

    // A timeout, a cache error and a missing entry are all treated as a miss.
    let (http_response, stored_policy) = match lookup {
        Ok(Ok(Some(entry))) => entry,
        _ => return None,
    };

    if !allow_stale && stored_policy.is_stale(now) {
        return None;
    }

    let body = http_response.body;

    if auto_encoder::is_binary_file(&body) {
        return None;
    }

    let accept_lang = http_response
        .headers
        .get("accept-language")
        .filter(|h| !h.is_empty())
        .map_or("", |v| v);

    let decoded = if accept_lang.is_empty() {
        auto_encoder::auto_encode_bytes(&body)
    } else {
        auto_encoder::encode_bytes_from_language(&body, accept_lang)
    };

    Some(decoded)
}
4458
#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Look up the cached body for `target_url`, tolerating a trailing-slash mismatch.
///
/// The exact URL is tried first. On a miss the alternate form is tried: with all
/// trailing slashes removed when the URL ends with `/`, or with a single `/` appended
/// otherwise.
pub async fn get_cached_url(
    target_url: &str,
    cache_options: Option<&CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Option<String> {
    let exact = get_cached_url_base(target_url, cache_options.cloned(), cache_policy).await;

    if exact.is_some() {
        return exact;
    }

    let alt_url: Option<String> = if !target_url.ends_with('/') {
        Some(format!("{target_url}/"))
    } else {
        let trimmed = target_url.trim_end_matches('/');
        // A URL made only of slashes has no meaningful alternate form.
        (!trimmed.is_empty() && trimmed != target_url).then(|| trimmed.to_string())
    };

    match alt_url {
        Some(alt) => get_cached_url_base(&alt, cache_options.cloned(), cache_policy).await,
        None => None,
    }
}
4493
#[cfg(all(not(feature = "cache"), not(feature = "cache_mem")))]
/// Cache lookup stub used when neither `cache` nor `cache_mem` is enabled.
///
/// All arguments are ignored and the result is always `None`.
pub async fn get_cached_url(
    _target_url: &str,
    _cache_options: Option<&CacheOptions>,
    _cache_policy: &Option<BasicCachePolicy>,
) -> Option<String> {
    None
}
4503
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page with Chrome, using a seeded or cached copy of the HTML when available.
///
/// `seeded_resource` takes precedence over the cache lookup. When the cache options ask
/// to skip the browser and HTML is available, it is returned directly with status
/// `200 OK`. Otherwise the request goes through `fetch_page_html_chrome_base`, and on
/// failure falls back to [`fetch_page_html_raw`].
pub async fn fetch_page_html_base(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    seeded_resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let skip_browser = cache_skip_browser(&cache_options);

    let available_html = match seeded_resource {
        Some(seed) => Some(seed),
        None => get_cached_url(target_url, cache_options.as_ref(), cache_policy).await,
    };
    let cached = available_html.is_some();

    // Serve straight from the cached/seeded HTML when the browser is to be skipped.
    let cached_html = match available_html {
        Some(html) if skip_browser => {
            return PageResponse {
                content: Some(Box::new(html.into_bytes())),
                status_code: StatusCode::OK,
                final_url: Some(target_url.to_string()),
                ..Default::default()
            };
        }
        other => other,
    };

    // Chrome receives the cached HTML when there is one, otherwise the URL itself.
    let chrome_source = cached_html.as_deref().unwrap_or(target_url);
    let source_url = Some(target_url).filter(|_| cached);

    let chrome_result = fetch_page_html_chrome_base(
        chrome_source,
        page,
        cached,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        source_url,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        &None,
        &None,
        jar,
        remote_multimodal,
    )
    .await;

    match chrome_result {
        Ok(page) => page,
        Err(err) => {
            log::error!("{:?}", err);
            fetch_page_html_raw(target_url, client).await
        }
    }
}
4584
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page with Chrome.
///
/// Forwards every argument to [`fetch_page_html_base`], passing `None` for both the
/// seeded resource and the cookie jar.
pub async fn fetch_page_html(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    fetch_page_html_base(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        // No seeded resource.
        None,
        // No cookie jar.
        None,
        remote_multimodal,
    )
    .await
}
4629
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page with Chrome, optionally seeding it with already-available HTML.
///
/// Forwards every argument unchanged to [`fetch_page_html_base`], including
/// `seeded_resource` and the cookie `jar`.
pub async fn fetch_page_html_seeded(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    seeded_resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    fetch_page_html_base(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        seeded_resource,
        jar,
        remote_multimodal,
    )
    .await
}
4676
#[cfg(feature = "chrome")]
/// Shared implementation behind the Chrome fetch entry points.
///
/// Uses `resource` as the page HTML when provided, otherwise looks the URL up in the
/// cache, then hands the page to `fetch_page_html_chrome_base`. If Chrome fails, the URL
/// is requested over plain HTTP and the body is collected in memory, stopping before it
/// would exceed `MAX_SIZE_BYTES` (a limit of `0` disables the check).
async fn _fetch_page_html_chrome(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let started = cfg!(feature = "time").then(tokio::time::Instant::now);

    let cached_html = match resource {
        Some(seed) => Some(seed),
        None => get_cached_url(target_url, cache_options.as_ref(), cache_policy).await,
    };
    let cached = cached_html.is_some();

    // Chrome receives the cached HTML when there is one, otherwise the URL itself.
    let chrome_source = cached_html.as_deref().unwrap_or(target_url);
    let source_url = Some(target_url).filter(|_| cached);

    let chrome_result = fetch_page_html_chrome_base(
        chrome_source,
        page,
        cached,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        source_url,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        &None,
        &None,
        jar,
        remote_multimodal,
    )
    .await;

    let mut page_response = match chrome_result {
        Ok(page) => page,
        Err(err) => {
            log::error!(
                "{:?}. Error requesting: {} - defaulting to raw http request",
                err,
                target_url
            );

            match client.get(target_url).send().await {
                Ok(res) if valid_parsing_status(&res) => {
                    // Metadata must be read before the body stream consumes `res`.
                    #[cfg(feature = "headers")]
                    let headers = res.headers().clone();
                    #[cfg(feature = "remote_addr")]
                    let remote_addr = res.remote_addr();
                    let cookies = get_cookies(&res);
                    let status_code = res.status();
                    let mut stream = res.bytes_stream();
                    let mut body = Vec::new();

                    while let Some(item) = stream.next().await {
                        let chunk = match item {
                            Ok(chunk) => chunk,
                            Err(e) => {
                                log::error!("{e} in {}", target_url);
                                break;
                            }
                        };

                        let limit = *MAX_SIZE_BYTES;

                        if limit > 0 && body.len() + chunk.len() > limit {
                            break;
                        }

                        body.extend_from_slice(&chunk)
                    }

                    PageResponse {
                        #[cfg(feature = "headers")]
                        headers: Some(headers),
                        #[cfg(feature = "remote_addr")]
                        remote_addr,
                        #[cfg(feature = "cookies")]
                        cookies,
                        content: Some(Box::new(body.into())),
                        status_code,
                        ..Default::default()
                    }
                }
                Ok(res) => PageResponse {
                    #[cfg(feature = "headers")]
                    headers: Some(res.headers().clone()),
                    #[cfg(feature = "remote_addr")]
                    remote_addr: res.remote_addr(),
                    #[cfg(feature = "cookies")]
                    cookies: get_cookies(&res),
                    status_code: res.status(),
                    ..Default::default()
                },
                Err(err) => {
                    log::info!("error fetching {}", target_url);

                    let status_code = match err.status() {
                        Some(code) => code,
                        None => crate::page::get_error_http_status_code(&err),
                    };

                    PageResponse {
                        status_code,
                        error_for_status: Some(Err(err)),
                        ..Default::default()
                    }
                }
            }
        }
    };

    set_page_response_duration(&mut page_response, started);

    page_response
}
4829
#[cfg(feature = "chrome")]
/// Fetch a page with Chrome, falling back to a plain HTTP request on failure.
///
/// Forwards every argument to `_fetch_page_html_chrome`, passing `None` for the seeded
/// resource.
pub async fn fetch_page_html_chrome(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    _fetch_page_html_chrome(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        // No seeded resource.
        None,
        jar,
        remote_multimodal,
    )
    .await
}
4875
#[cfg(feature = "chrome")]
/// Fetch a page with Chrome, optionally seeding it with already-available HTML.
///
/// Forwards every argument unchanged to `_fetch_page_html_chrome`, including `resource`.
pub async fn fetch_page_html_chrome_seeded(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    _fetch_page_html_chrome(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        resource,
        jar,
        remote_multimodal,
    )
    .await
}
4922
#[cfg(not(feature = "openai"))]
/// Stub used when the `openai` feature is disabled.
///
/// All arguments are ignored and a default `OpenAIReturn` is returned.
pub async fn openai_request(
    _gpt_configs: &crate::configuration::GPTConfigs,
    _resource: String,
    _url: &str,
    _prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    Default::default()
}
4933
#[cfg(feature = "openai")]
lazy_static! {
    /// `cl100k_base` tokenizer, used by `openai_request_base` to count tokens when no
    /// model-specific tokenizer can be resolved. Panics on first use if it fails to load.
    static ref CORE_BPE_TOKEN_COUNT: tiktoken_rs::CoreBPE = tiktoken_rs::cl100k_base().unwrap();
    /// Semaphore acquired by `openai_request_base` before each request.
    static ref SEM: tokio::sync::Semaphore = {
        let logical = num_cpus::get();
        let physical = num_cpus::get_physical();

        // More logical than physical CPUs: start from the logical-per-physical ratio,
        // otherwise start from the logical CPU count.
        let sem_limit = if logical > physical {
            (logical) / (physical)
        } else {
            logical
        };

        let (sem_limit, sem_max) = if logical == physical {
            (sem_limit * physical, 20)
        } else {
            (sem_limit * 4, 10)
        };
        let sem_limit = sem_limit / 3;
        // NOTE: despite its name, `sem_max` acts as a lower bound here because `max`
        // is used: the permit count is never below 20 (or 10 when the counts differ).
        tokio::sync::Semaphore::const_new(sem_limit.max(sem_max))
    };
    /// Shared OpenAI client with the library's default configuration; also the source
    /// of the base config when a per-request API key is supplied.
    static ref CLIENT: async_openai::Client<async_openai::config::OpenAIConfig> =
        async_openai::Client::new();
}
4958
#[cfg(feature = "openai")]
/// Send the page HTML and the user prompt to the OpenAI chat completion API.
///
/// Steps, in order:
/// 1. Acquire a permit from `SEM`.
/// 2. Build the request arguments from `gpt_configs` (model, max tokens, and the
///    optional user / temperature / top_p).
/// 3. Count the tokens of `resource` + `prompt` and, while they exceed the budget
///    returned by `calculate_max_tokens`, shrink the HTML with progressively more
///    aggressive cleaners (`clean_html` -> `clean_html_slim` -> `clean_html_full`).
/// 4. Choose the response format (text, JSON object, or JSON schema).
/// 5. Send the request and return the first choice's content plus token usage.
///
/// Returns an `OpenAIReturn` whose `error` is set only when the semaphore cannot be
/// acquired or the HTML message cannot be built. API failures and request-build
/// failures yield an empty `response` with `error: None` (API failures are logged).
pub async fn openai_request_base(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    match SEM.acquire().await {
        Ok(permit) => {
            // Base request builder; optional settings are applied only when configured.
            let mut chat_completion_defaults =
                async_openai::types::CreateChatCompletionRequestArgs::default();
            let gpt_base = chat_completion_defaults
                .max_tokens(gpt_configs.max_tokens)
                .model(&gpt_configs.model);
            let gpt_base = match &gpt_configs.user {
                Some(user) => gpt_base.user(user),
                _ => gpt_base,
            };
            let gpt_base = match gpt_configs.temperature {
                Some(temp) => gpt_base.temperature(temp),
                _ => gpt_base,
            };
            let gpt_base = match gpt_configs.top_p {
                Some(tp) => gpt_base.top_p(tp),
                _ => gpt_base,
            };

            // Model-specific tokenizer; `None` falls back to `CORE_BPE_TOKEN_COUNT` below.
            let core_bpe = match tiktoken_rs::get_bpe_from_model(&gpt_configs.model) {
                Ok(bpe) => Some(bpe),
                _ => None,
            };

            let (tokens, prompt_tokens) = match &core_bpe {
                Some(core_bpe) => (
                    core_bpe.encode_with_special_tokens(&resource),
                    core_bpe.encode_with_special_tokens(&prompt),
                ),
                _ => (
                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&resource),
                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                ),
            };

            // Despite the name, this is the token count of what is being sent
            // (HTML + prompt), not of the model output.
            let output_tokens_count = tokens.len() + prompt_tokens.len();

            // NOTE(review): presumably the tokens left once the system prompt, HTML and
            // user prompt are accounted for — confirm in `calculate_max_tokens`.
            let mut max_tokens = crate::features::openai::calculate_max_tokens(
                &gpt_configs.model,
                gpt_configs.max_tokens,
                &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                &resource,
                &prompt,
            );

            // Shrink the HTML step by step, re-measuring after each cleaning pass.
            let resource = if output_tokens_count > max_tokens {
                let r = clean_html(&resource);

                max_tokens = crate::features::openai::calculate_max_tokens(
                    &gpt_configs.model,
                    gpt_configs.max_tokens,
                    &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                    &r,
                    &prompt,
                );

                let (tokens, prompt_tokens) = match &core_bpe {
                    Some(core_bpe) => (
                        core_bpe.encode_with_special_tokens(&r),
                        core_bpe.encode_with_special_tokens(&prompt),
                    ),
                    _ => (
                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                    ),
                };

                let output_tokens_count = tokens.len() + prompt_tokens.len();

                if output_tokens_count > max_tokens {
                    let r = clean_html_slim(&r);

                    max_tokens = crate::features::openai::calculate_max_tokens(
                        &gpt_configs.model,
                        gpt_configs.max_tokens,
                        &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                        &r,
                        &prompt,
                    );

                    let (tokens, prompt_tokens) = match &core_bpe {
                        Some(core_bpe) => (
                            core_bpe.encode_with_special_tokens(&r),
                            core_bpe.encode_with_special_tokens(&prompt),
                        ),
                        _ => (
                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                        ),
                    };

                    let output_tokens_count = tokens.len() + prompt_tokens.len();

                    // Last resort; `max_tokens` is not recomputed after this pass.
                    if output_tokens_count > max_tokens {
                        clean_html_full(&r)
                    } else {
                        r
                    }
                } else {
                    r
                }
            } else {
                // Even when the input already fits, it is still run through `clean_html`.
                clean_html(&resource)
            };

            let mut tokens_used = crate::features::openai_common::OpenAIUsage::default();
            let json_mode = gpt_configs.extra_ai_data;

            let response_format = {
                let mut mode = if json_mode {
                    async_openai::types::ResponseFormat::JsonObject
                } else {
                    async_openai::types::ResponseFormat::Text
                };

                // A configured schema that parses as JSON overrides the mode chosen above.
                if let Some(structure) = &gpt_configs.json_schema {
                    if let Some(schema) = &structure.schema {
                        if let Ok(mut schema) =
                            crate::features::serde_json::from_str::<serde_json::Value>(&schema)
                        {
                            if json_mode {
                                if let Some(properties) = schema.get_mut("properties") {
                                    // In JSON mode, add a string `js` property to the schema.
                                    if let Some(properties_map) = properties.as_object_mut() {
                                        properties_map.insert(
                                            "js".to_string(),
                                            serde_json::json!({
                                                "type": "string"
                                            }),
                                        );
                                    }
                                }
                            }

                            mode = async_openai::types::ResponseFormat::JsonSchema {
                                json_schema: async_openai::types::ResponseFormatJsonSchema {
                                    description: structure.description.clone(),
                                    name: structure.name.clone(),
                                    schema: if schema.is_null() { None } else { Some(schema) },
                                    strict: structure.strict,
                                },
                            }
                        }
                    }
                }

                mode
            };

            // The page itself is sent as an assistant message: "URL: ...\nHTML: ...".
            match async_openai::types::ChatCompletionRequestAssistantMessageArgs::default()
                .content(string_concat!("URL: ", url, "\n", "HTML: ", resource))
                .build()
            {
                Ok(resource_completion) => {
                    // Message order: system prompt, optional extra prompt (JSON mode),
                    // the page, then the user prompt when it is non-empty.
                    let mut messages: Vec<async_openai::types::ChatCompletionRequestMessage> =
                        vec![crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT.clone()];

                    if json_mode {
                        messages.push(
                            crate::features::openai::BROWSER_ACTIONS_SYSTEM_EXTRA_PROMPT.clone(),
                        );
                    }

                    messages.push(resource_completion.into());

                    if !prompt.is_empty() {
                        messages.push(
                            match async_openai::types::ChatCompletionRequestUserMessageArgs::default()
                                .content(prompt)
                                .build()
                            {
                                Ok(o) => o,
                                _ => Default::default(),
                            }
                            .into()
                        )
                    }

                    // `max_tokens` is overridden with the computed budget (narrowing cast).
                    let v = match gpt_base
                        .max_tokens(max_tokens as u32)
                        .messages(messages)
                        .response_format(response_format)
                        .build()
                    {
                        Ok(request) => {
                            // A non-empty configured API key gets a client built from the
                            // shared client's config; otherwise the shared client is used.
                            let res = match &gpt_configs.api_key {
                                Some(key) => {
                                    if !key.is_empty() {
                                        let conf = CLIENT.config().to_owned();
                                        async_openai::Client::with_config(conf.with_api_key(key))
                                            .chat()
                                            .create(request)
                                            .await
                                    } else {
                                        CLIENT.chat().create(request).await
                                    }
                                }
                                _ => CLIENT.chat().create(request).await,
                            };

                            match res {
                                Ok(mut response) => {
                                    let mut choice = response.choices.first_mut();

                                    if let Some(usage) = response.usage.take() {
                                        tokens_used.prompt_tokens = usage.prompt_tokens;
                                        tokens_used.completion_tokens = usage.completion_tokens;
                                        tokens_used.total_tokens = usage.total_tokens;
                                    }

                                    // Only the first choice is used; missing content -> default.
                                    match choice.as_mut() {
                                        Some(c) => match c.message.content.take() {
                                            Some(content) => content,
                                            _ => Default::default(),
                                        },
                                        _ => Default::default(),
                                    }
                                }
                                Err(err) => {
                                    // API errors are logged only; the caller gets a default
                                    // response and no `error`.
                                    log::error!("{:?}", err);
                                    Default::default()
                                }
                            }
                        }
                        // A request that fails to build is silently mapped to the default.
                        _ => Default::default(),
                    };

                    // Release the permit as soon as the network round trip is done.
                    drop(permit);

                    crate::features::openai_common::OpenAIReturn {
                        response: v,
                        usage: tokens_used,
                        error: None,
                    }
                }
                Err(e) => {
                    // The HTML message could not be built.
                    let mut d = crate::features::openai_common::OpenAIReturn::default();

                    d.error = Some(e.to_string());

                    d
                }
            }
        }
        Err(e) => {
            // The semaphore could not be acquired.
            let mut d = crate::features::openai_common::OpenAIReturn::default();

            d.error = Some(e.to_string());

            d
        }
    }
}
5223
#[cfg(all(feature = "openai", not(feature = "cache_openai")))]
/// Perform an OpenAI request for the given page without any response caching.
///
/// Forwards all arguments directly to `openai_request_base`.
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    openai_request_base(gpt_configs, resource, url, prompt).await
}
5234
#[cfg(all(feature = "openai", feature = "cache_openai"))]
/// Perform an OpenAI request, consulting `gpt_configs.cache` first when one is set.
///
/// The cache key is a hash of, in order: `url`, `prompt`, the model, `max_tokens`,
/// `extra_ai_data` and the full `resource`. A hit is returned with `usage.cached`
/// set to `true`; a miss runs `openai_request_base` and stores a clone of its result.
/// Without a configured cache the request goes straight to `openai_request_base`.
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    use std::hash::{Hash, Hasher};

    // No cache configured: nothing to look up or store.
    let cache = match &gpt_configs.cache {
        Some(cache) => cache,
        None => return openai_request_base(gpt_configs, resource, url, prompt).await,
    };

    let key = {
        let mut hasher = ahash::AHasher::default();

        url.hash(&mut hasher);
        prompt.hash(&mut hasher);
        gpt_configs.model.hash(&mut hasher);
        gpt_configs.max_tokens.hash(&mut hasher);
        gpt_configs.extra_ai_data.hash(&mut hasher);
        resource.hash(&mut hasher);

        hasher.finish()
    };

    if let Some(mut hit) = cache.get(&key).await {
        hit.usage.cached = true;
        return hit;
    }

    let fresh = openai_request_base(gpt_configs, resource, url, prompt).await;
    let _ = cache.insert(key, fresh.clone()).await;
    fresh
}
5274
#[cfg(any(feature = "gemini", feature = "real_browser"))]
lazy_static! {
    /// Semaphore limiting concurrent Gemini requests.
    ///
    /// Holds twice the logical CPU count in permits, but never fewer than 8.
    pub static ref GEMINI_SEM: tokio::sync::Semaphore = {
        let sem_limit = (num_cpus::get() * 2).max(8);
        tokio::sync::Semaphore::const_new(sem_limit)
    };
}
5283
#[cfg(not(feature = "gemini"))]
/// No-op stand-in used when the `gemini` feature is disabled.
///
/// All arguments are ignored and a default `GeminiReturn` is returned.
pub async fn gemini_request(
    _gemini_configs: &crate::configuration::GeminiConfigs,
    _resource: String,
    _url: &str,
    _prompt: &str,
) -> crate::features::gemini_common::GeminiReturn {
    Default::default()
}
5294
#[cfg(feature = "gemini")]
/// Send the cleaned page HTML and the user prompt to the Gemini API.
///
/// A permit from `GEMINI_SEM` is held from the start until the request has
/// completed (or until an early error return). The API key comes from
/// `gemini_configs.api_key` when it is non-empty, otherwise from the
/// `GEMINI_API_KEY` environment variable. The HTML is passed through `clean_html`
/// and sent as a single user message containing the system prompt, the URL, the
/// HTML and the user request.
///
/// Every failure (semaphore, missing key, client creation, request) is reported
/// through the `error` field of an otherwise default `GeminiReturn`.
pub async fn gemini_request_base(
    gemini_configs: &crate::configuration::GeminiConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::gemini_common::GeminiReturn {
    use crate::features::gemini_common::{GeminiReturn, GeminiUsage, DEFAULT_GEMINI_MODEL};

    let permit = match GEMINI_SEM.acquire().await {
        Ok(permit) => permit,
        Err(e) => {
            return GeminiReturn {
                error: Some(e.to_string()),
                ..Default::default()
            };
        }
    };

    // Prefer an explicitly configured, non-empty key; fall back to the environment.
    let configured_key = gemini_configs.api_key.as_ref().filter(|key| !key.is_empty());
    let api_key = match configured_key {
        Some(key) => key.clone(),
        None => match std::env::var("GEMINI_API_KEY") {
            Ok(key) => key,
            Err(_) => {
                return GeminiReturn {
                    error: Some("GEMINI_API_KEY not set".to_string()),
                    ..Default::default()
                };
            }
        },
    };

    let model = if !gemini_configs.model.is_empty() {
        gemini_configs.model.clone()
    } else {
        DEFAULT_GEMINI_MODEL.to_string()
    };

    let client = match gemini_rust::Gemini::with_model(&api_key, model) {
        Ok(client) => client,
        Err(e) => {
            drop(permit);
            return GeminiReturn {
                error: Some(format!("Failed to create Gemini client: {}", e)),
                ..Default::default()
            };
        }
    };

    let resource = clean_html(&resource);

    // JSON mode appends the extra system prompt to the base one.
    let system_prompt = if gemini_configs.extra_ai_data {
        format!(
            "{}\n\n{}",
            *crate::features::gemini::BROWSER_ACTIONS_SYSTEM_PROMPT,
            *crate::features::gemini::BROWSER_ACTIONS_SYSTEM_EXTRA_PROMPT
        )
    } else {
        crate::features::gemini::BROWSER_ACTIONS_SYSTEM_PROMPT.clone()
    };

    let full_prompt = format!(
        "{}\n\nURL: {}\nHTML: {}\n\nUser Request: {}",
        system_prompt, url, resource, prompt
    );

    // A configured schema switches the response MIME type to JSON; the schema body
    // is only attached when it parses as JSON.
    let response_mime_type = gemini_configs
        .json_schema
        .as_ref()
        .map(|_| "application/json".to_string());
    let response_schema = gemini_configs.json_schema.as_ref().and_then(|schema| {
        schema
            .schema
            .as_ref()
            .and_then(|s| serde_json::from_str::<serde_json::Value>(s).ok())
    });

    let gen_config = gemini_rust::GenerationConfig {
        max_output_tokens: Some(gemini_configs.max_tokens as i32),
        temperature: gemini_configs.temperature,
        top_p: gemini_configs.top_p,
        top_k: gemini_configs.top_k,
        response_mime_type,
        response_schema,
        ..Default::default()
    };

    let result = client
        .generate_content()
        .with_user_message(&full_prompt)
        .with_generation_config(gen_config)
        .execute()
        .await;

    // The network round trip is over; free the slot before post-processing.
    drop(permit);

    let response = match result {
        Ok(response) => response,
        Err(e) => {
            log::error!("Gemini request failed: {:?}", e);
            return GeminiReturn {
                error: Some(e.to_string()),
                ..Default::default()
            };
        }
    };

    let text = response.text();

    let usage = match response.usage_metadata {
        Some(meta) => GeminiUsage {
            prompt_tokens: meta.prompt_token_count.unwrap_or(0) as u32,
            completion_tokens: meta.candidates_token_count.unwrap_or(0) as u32,
            total_tokens: meta.total_token_count.unwrap_or(0) as u32,
            cached: false,
        },
        None => GeminiUsage::default(),
    };

    GeminiReturn {
        response: text,
        usage,
        error: None,
    }
}
5427
#[cfg(all(feature = "gemini", not(feature = "cache_gemini")))]
/// Perform a Gemini request for the given page without any response caching.
///
/// Forwards all arguments directly to `gemini_request_base`.
pub async fn gemini_request(
    gemini_configs: &crate::configuration::GeminiConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::gemini_common::GeminiReturn {
    gemini_request_base(gemini_configs, resource, url, prompt).await
}
5438
#[cfg(all(feature = "gemini", feature = "cache_gemini"))]
/// Perform a Gemini request, consulting `gemini_configs.cache` first when one is set.
///
/// The cache key is a hash of, in order: `url`, `prompt`, the model, `max_tokens`,
/// `extra_ai_data` and the full `resource`. A hit is returned with `usage.cached`
/// set to `true`; a miss runs `gemini_request_base` and stores a clone of its result.
/// Without a configured cache the request goes straight to `gemini_request_base`.
pub async fn gemini_request(
    gemini_configs: &crate::configuration::GeminiConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::gemini_common::GeminiReturn {
    use std::hash::{Hash, Hasher};

    // No cache configured: nothing to look up or store.
    let cache = match &gemini_configs.cache {
        Some(cache) => cache,
        None => return gemini_request_base(gemini_configs, resource, url, prompt).await,
    };

    let key = {
        let mut hasher = ahash::AHasher::default();

        url.hash(&mut hasher);
        prompt.hash(&mut hasher);
        gemini_configs.model.hash(&mut hasher);
        gemini_configs.max_tokens.hash(&mut hasher);
        gemini_configs.extra_ai_data.hash(&mut hasher);
        resource.hash(&mut hasher);

        hasher.finish()
    };

    if let Some(mut hit) = cache.get(&key).await {
        hit.usage.cached = true;
        return hit;
    }

    let fresh = gemini_request_base(gemini_configs, resource, url, prompt).await;
    let _ = cache.insert(key, fresh.clone()).await;
    fresh
}
5477
#[inline]
/// Return the HTML unchanged as an owned `String` (no cleaning is applied).
pub fn clean_html_raw(html: &str) -> String {
    String::from(html)
}
5483
5484pub fn clean_html_base(html: &str) -> String {
5488 use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
5489
5490 match rewrite_str(
5491 html,
5492 RewriteStrSettings {
5493 element_content_handlers: vec![
5494 element!("script", |el| {
5495 el.remove();
5496 Ok(())
5497 }),
5498 element!("style", |el| {
5499 el.remove();
5500 Ok(())
5501 }),
5502 element!("link", |el| {
5503 el.remove();
5504 Ok(())
5505 }),
5506 element!("iframe", |el| {
5507 el.remove();
5508 Ok(())
5509 }),
5510 element!("[style*='display:none']", |el| {
5511 el.remove();
5512 Ok(())
5513 }),
5514 element!("[id*='ad']", |el| {
5515 el.remove();
5516 Ok(())
5517 }),
5518 element!("[class*='ad']", |el| {
5519 el.remove();
5520 Ok(())
5521 }),
5522 element!("[id*='tracking']", |el| {
5523 el.remove();
5524 Ok(())
5525 }),
5526 element!("[class*='tracking']", |el| {
5527 el.remove();
5528 Ok(())
5529 }),
5530 element!("meta", |el| {
5531 if let Some(attribute) = el.get_attribute("name") {
5532 if attribute != "title" && attribute != "description" {
5533 el.remove();
5534 }
5535 } else {
5536 el.remove();
5537 }
5538 Ok(())
5539 }),
5540 ],
5541 document_content_handlers: vec![doc_comments!(|c| {
5542 c.remove();
5543 Ok(())
5544 })],
5545 ..RewriteStrSettings::default()
5546 },
5547 ) {
5548 Ok(r) => r,
5549 _ => html.into(),
5550 }
5551}
5552
5553pub fn clean_html_slim(html: &str) -> String {
5555 use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
5556
5557 match rewrite_str(
5558 html,
5559 RewriteStrSettings {
5560 element_content_handlers: vec![
5561 element!("script", |el| {
5562 el.remove();
5563 Ok(())
5564 }),
5565 element!("style", |el| {
5566 el.remove();
5567 Ok(())
5568 }),
5569 element!("svg", |el| {
5570 el.remove();
5571 Ok(())
5572 }),
5573 element!("noscript", |el| {
5574 el.remove();
5575 Ok(())
5576 }),
5577 element!("link", |el| {
5578 el.remove();
5579 Ok(())
5580 }),
5581 element!("iframe", |el| {
5582 el.remove();
5583 Ok(())
5584 }),
5585 element!("canvas", |el| {
5586 el.remove();
5587 Ok(())
5588 }),
5589 element!("video", |el| {
5590 el.remove();
5591 Ok(())
5592 }),
5593 element!("img", |el| {
5594 if let Some(src) = el.get_attribute("src") {
5595 if src.starts_with("data:image") {
5596 el.remove();
5597 }
5598 }
5599 Ok(())
5600 }),
5601 element!("picture", |el| {
5602 if let Some(src) = el.get_attribute("src") {
5605 if src.starts_with("data:image") {
5606 el.remove();
5607 }
5608 }
5609 Ok(())
5610 }),
5611 element!("[style*='display:none']", |el| {
5612 el.remove();
5613 Ok(())
5614 }),
5615 element!("[id*='ad']", |el| {
5616 el.remove();
5617 Ok(())
5618 }),
5619 element!("[class*='ad']", |el| {
5620 el.remove();
5621 Ok(())
5622 }),
5623 element!("[id*='tracking']", |el| {
5624 el.remove();
5625 Ok(())
5626 }),
5627 element!("[class*='tracking']", |el| {
5628 el.remove();
5629 Ok(())
5630 }),
5631 element!("meta", |el| {
5632 if let Some(attribute) = el.get_attribute("name") {
5633 if attribute != "title" && attribute != "description" {
5634 el.remove();
5635 }
5636 } else {
5637 el.remove();
5638 }
5639 Ok(())
5640 }),
5641 ],
5642 document_content_handlers: vec![doc_comments!(|c| {
5643 c.remove();
5644 Ok(())
5645 })],
5646 ..RewriteStrSettings::default()
5647 },
5648 ) {
5649 Ok(r) => r,
5650 _ => html.into(),
5651 }
5652}
5653
5654pub fn clean_html_full(html: &str) -> String {
5657 use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
5658
5659 match rewrite_str(
5660 html,
5661 RewriteStrSettings {
5662 element_content_handlers: vec![
5663 element!("nav, footer", |el| {
5664 el.remove();
5665 Ok(())
5666 }),
5667 element!("meta", |el| {
5668 let name = el.get_attribute("name").map(|n| n.to_lowercase());
5669 if !matches!(name.as_deref(), Some("viewport") | Some("charset")) {
5670 el.remove();
5671 }
5672 Ok(())
5673 }),
5674 element!("*", |el| {
5675 let mut to_remove: Vec<String> = Vec::new();
5677 for attr in el.attributes().iter() {
5678 let n = attr.name();
5679 let keep = n == "id" || n == "class" || n.starts_with("data-");
5680 if !keep {
5681 to_remove.push(n);
5682 }
5683 }
5684 for attr in to_remove {
5685 el.remove_attribute(&attr);
5686 }
5687 Ok(())
5688 }),
5689 ],
5690 document_content_handlers: vec![doc_comments!(|c| {
5691 c.remove();
5692 Ok(())
5693 })],
5694 ..RewriteStrSettings::default()
5695 },
5696 ) {
5697 Ok(r) => r,
5698 _ => html.into(),
5699 }
5700}
5701
5702#[cfg(feature = "openai_slim_fit")]
5706#[inline]
5707pub fn clean_html(html: &str) -> String {
5708 clean_html_slim(html)
5709}
5710
5711#[cfg(not(feature = "openai_slim_fit"))]
5713#[inline]
5714pub fn clean_html(html: &str) -> String {
5715 clean_html_base(html)
5716}
5717
5718pub fn log(message: &'static str, data: impl AsRef<str>) {
5720 if log_enabled!(Level::Info) {
5721 info!("{message} - {}", data.as_ref());
5722 }
5723}
5724
#[cfg(feature = "control")]
#[derive(PartialEq, Debug)]
/// Control signal broadcast through `CONTROLLER`, paired with a target string.
pub enum Handler {
    /// Initial value of the control channel; also sent by `reset`.
    Start,
    /// Sent by `pause`.
    Pause,
    /// Sent by `resume`.
    Resume,
    /// Sent by `shutdown`.
    Shutdown,
}
5738
#[cfg(feature = "control")]
lazy_static! {
    /// Global control channel: the sender/receiver pair of a `tokio::sync::watch`
    /// channel carrying `(target, Handler)` messages, behind an `Arc<RwLock<..>>`.
    ///
    /// The channel starts out holding `("handles", Handler::Start)`.
    pub static ref CONTROLLER: std::sync::Arc<tokio::sync::RwLock<(tokio::sync::watch::Sender<(String, Handler)>,
        tokio::sync::watch::Receiver<(String, Handler)>)>> =
        std::sync::Arc::new(tokio::sync::RwLock::new(tokio::sync::watch::channel(("handles".to_string(), Handler::Start))));
}
5746
#[cfg(feature = "control")]
/// Broadcast `Handler::Pause` for `target` on the global control channel.
///
/// The message is sent while holding the `CONTROLLER` write lock; a send failure is
/// logged and otherwise ignored.
pub async fn pause(target: &str) {
    let channel = CONTROLLER.write().await;
    let outcome = channel.0.send((target.into(), Handler::Pause));

    if let Err(e) = outcome {
        log::error!("PAUSE: {:?}", e);
    }
}
5759
#[cfg(feature = "control")]
/// Broadcast `Handler::Resume` for `target` on the global control channel.
///
/// The message is sent while holding the `CONTROLLER` write lock; a send failure is
/// logged and otherwise ignored.
pub async fn resume(target: &str) {
    let channel = CONTROLLER.write().await;
    let outcome = channel.0.send((target.into(), Handler::Resume));

    if let Err(e) = outcome {
        log::error!("RESUME: {:?}", e);
    }
}
5772
#[cfg(feature = "control")]
/// Broadcast `Handler::Shutdown` for `target` on the global control channel.
///
/// The message is sent while holding the `CONTROLLER` write lock; a send failure is
/// logged and otherwise ignored.
pub async fn shutdown(target: &str) {
    let channel = CONTROLLER.write().await;
    let outcome = channel.0.send((target.into(), Handler::Shutdown));

    if let Err(e) = outcome {
        log::error!("SHUTDOWN: {:?}", e);
    }
}
5785
#[cfg(feature = "control")]
/// Broadcast `Handler::Start` for `target` on the global control channel.
///
/// The message is sent while holding the `CONTROLLER` write lock; a send failure is
/// logged and otherwise ignored.
pub async fn reset(target: &str) {
    let channel = CONTROLLER.write().await;
    let outcome = channel.0.send((target.into(), Handler::Start));

    if let Err(e) = outcome {
        log::error!("RESET: {:?}", e);
    }
}
5798
5799pub(crate) fn setup_website_selectors(url: &str, allowed: AllowedDomainTypes) -> RelativeSelectors {
5801 let subdomains = allowed.subdomains;
5802 let tld = allowed.tld;
5803
5804 crate::page::get_page_selectors_base(url, subdomains, tld)
5805}
5806
#[derive(Debug, Default, Clone, Copy)]
/// Flags passed to selector setup to control which related domains are allowed.
/// Both default to `false`.
pub struct AllowedDomainTypes {
    /// Forwarded as the `subdomains` flag when building selectors.
    pub subdomains: bool,
    /// Forwarded as the `tld` flag when building selectors.
    pub tld: bool,
}

impl AllowedDomainTypes {
    /// Create the flag set from explicit `subdomains` and `tld` values.
    pub fn new(subdomains: bool, tld: bool) -> Self {
        Self { subdomains, tld }
    }
}
5822
5823pub(crate) fn modify_selectors(
5825 prior_domain: &Option<Box<Url>>,
5826 domain: &str,
5827 domain_parsed: &mut Option<Box<Url>>,
5828 url: &mut Box<CaseInsensitiveString>,
5829 base: &mut RelativeSelectors,
5830 allowed: AllowedDomainTypes,
5831) {
5832 *domain_parsed = parse_absolute_url(domain);
5833 *url = Box::new(domain.into());
5834 let s = setup_website_selectors(url.inner(), allowed);
5835 base.0 = s.0;
5836 base.1 = s.1;
5837 if let Some(prior_domain) = prior_domain {
5838 if let Some(dname) = prior_domain.host_str() {
5839 base.2 = dname.into();
5840 }
5841 }
5842}
5843
/// Return everything after the last `/` in `path`.
///
/// A path without any `/` is returned whole; a path ending in `/` yields `""`.
pub fn get_last_segment(path: &str) -> &str {
    match path.rsplit_once('/') {
        Some((_, tail)) => tail,
        None => path,
    }
}
5857
/// Return the part of `url` starting at the first `/` after the `//` separator.
///
/// Yields `"/"` when the URL has no `//` or nothing path-like follows the host.
/// Any query string or fragment after the path is included in the result.
pub(crate) fn get_path_from_url(url: &str) -> &str {
    url.find("//")
        .and_then(|separator| {
            let host_start = separator + 2;
            url[host_start..]
                .find('/')
                .map(|offset| &url[host_start + offset..])
        })
        .unwrap_or("/")
}
5873
/// Return the host portion of `url`.
///
/// Everything up to and including the first `//` is skipped (if present); the
/// result then runs up to the next `/`, or to the end of the string.
pub(crate) fn get_domain_from_url(url: &str) -> &str {
    let after_scheme = match url.find("//") {
        Some(separator) => &url[separator + 2..],
        None => url,
    };

    match after_scheme.find('/') {
        Some(end) => &after_scheme[..end],
        None => after_scheme,
    }
}
5892
/// Report whether `url` starts with one of the supported schemes:
/// `https://`, `http://`, `file://` or `ftp://` (case-sensitive).
pub fn networking_capable(url: &str) -> bool {
    const SCHEMES: [&str; 4] = ["https://", "http://", "file://", "ftp://"];

    SCHEMES.iter().any(|scheme| url.starts_with(*scheme))
}
5900
/// Rewrite `u` so that it uses the `https://` scheme.
///
/// Everything up to and including the first `://` is replaced by `https://`; when
/// `u` contains no `://`, `https://` is simply prepended.
///
/// The split is done on the byte position of `://` directly. The previous version
/// fed that byte offset to `char_indices().nth()`, which counts characters, so any
/// multi-byte character before the separator shifted or lost the split point.
pub fn prepare_url(u: &str) -> String {
    match u.split_once("://") {
        Some((_, rest)) => format!("https://{rest}"),
        None => format!("https://{u}"),
    }
}
5915
/// Produce a normalized copy of `html`.
///
/// The rewrite drops `href` from anchors, removes `script`, `style`, `iframe`,
/// `base` and `noscript` elements, and strips every attribute except `id`,
/// `class` and `data-*`.
///
/// The input is fed to the rewriter in `STREAMING_CHUNK_SIZE` chunks. If a write
/// fails, processing stops and whatever output was produced so far is returned.
pub(crate) async fn normalize_html(html: &[u8]) -> Vec<u8> {
    use lol_html::{element, send::Settings};

    let mut output = Vec::new();

    let mut rewriter = HtmlRewriter::new(
        Settings {
            element_content_handlers: vec![
                element!("a[href]", |el| {
                    el.remove_attribute("href");
                    Ok(())
                }),
                element!("script, style, iframe, base, noscript", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("*", |el| {
                    // Collect the names first, then remove them in a second pass.
                    let mut remove_attr = vec![];

                    for attr in el.attributes() {
                        let name = attr.name();
                        let remove =
                            !(name.starts_with("data-") || name == "id" || name == "class");
                        if remove {
                            remove_attr.push(name);
                        }
                    }

                    for name in remove_attr {
                        el.remove_attribute(&name);
                    }

                    Ok(())
                }),
            ],
            ..Settings::new_send()
        },
        // Output sink: append each rewritten chunk to `output`.
        |c: &[u8]| output.extend_from_slice(c),
    );

    let chunks = html.chunks(*STREAMING_CHUNK_SIZE);
    let mut stream = tokio_stream::iter(chunks);
    let mut wrote_error = false;

    while let Some(chunk) = stream.next().await {
        if rewriter.write(chunk).is_err() {
            wrote_error = true;
            break;
        }
    }

    // Only finalize after a clean run; the result of `end()` is ignored.
    if !wrote_error {
        let _ = rewriter.end();
    }

    output
}
5974
5975pub(crate) async fn hash_html(html: &[u8]) -> u64 {
5977 let normalized_html = normalize_html(html).await;
5978
5979 if !normalized_html.is_empty() {
5980 use std::hash::{Hash, Hasher};
5981 let mut s = ahash::AHasher::default();
5982 normalized_html.hash(&mut s);
5983 let key = s.finish();
5984 key
5985 } else {
5986 Default::default()
5987 }
5988}
5989
#[cfg(feature = "tracing")]
/// Spawn `future` as a Tokio task named `task_name`.
///
/// # Panics
///
/// Panics with "failed to spawn task" if the task cannot be spawned.
pub(crate) fn spawn_task<F>(task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
where
    F: std::future::Future + Send + 'static,
    F::Output: Send + 'static,
{
    let named = tokio::task::Builder::new().name(task_name);

    named.spawn(future).expect("failed to spawn task")
}
6002
6003#[cfg(not(feature = "tracing"))]
6004#[allow(unused)]
6005pub(crate) fn spawn_task<F>(_task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
6007where
6008 F: std::future::Future + Send + 'static,
6009 F::Output: Send + 'static,
6010{
6011 tokio::task::spawn(future)
6012}
6013
#[cfg(feature = "tracing")]
/// Spawn `future` on `set` as a task named `task_name`, returning its abort handle.
///
/// # Panics
///
/// Panics with "set should spawn" if the task cannot be spawned.
pub(crate) fn spawn_set<F, T>(
    task_name: &str,
    set: &mut tokio::task::JoinSet<T>,
    future: F,
) -> tokio::task::AbortHandle
where
    F: Future<Output = T> + Send + 'static,
    T: Send + 'static,
{
    let named = set.build_task().name(task_name);

    named.spawn(future).expect("set should spawn")
}
6031
6032#[cfg(not(feature = "tracing"))]
6033pub(crate) fn spawn_set<F, T>(
6035 _task_name: &str,
6036 set: &mut tokio::task::JoinSet<T>,
6037 future: F,
6038) -> tokio::task::AbortHandle
6039where
6040 F: Future<Output = T>,
6041 F: Send + 'static,
6042 T: Send + 'static,
6043{
6044 set.spawn(future)
6045}
6046
#[cfg(feature = "balance")]
/// Pause (100 ms) applied by `get_semaphore` when the detected CPU state is `2`.
const REBALANCE_TIME: std::time::Duration = std::time::Duration::from_millis(100);
6050
#[cfg(feature = "balance")]
/// Pick the semaphore to use based on the current CPU state.
///
/// When `detect` is `false` the caller's `semaphore` is returned immediately.
/// Otherwise the global CPU state is read: a state of `2` first sleeps for
/// `REBALANCE_TIME`, and any state of `1` or above selects the shared
/// `SEM_SHARED` semaphore instead of the caller's.
pub async fn get_semaphore(semaphore: &Arc<Semaphore>, detect: bool) -> &Arc<Semaphore> {
    if !detect {
        return semaphore;
    }

    let cpu_state = detect_system::get_global_cpu_state().await;

    if cpu_state == 2 {
        tokio::time::sleep(REBALANCE_TIME).await;
    }

    if cpu_state < 1 {
        semaphore
    } else {
        &*crate::website::SEM_SHARED
    }
}
6070
/// Report whether the crawl has run for at least `crawl_timeout` since `start`.
///
/// Returns `false` when either the timeout or the start instant is missing.
pub fn crawl_duration_expired(crawl_timeout: &Option<Duration>, start: &Option<Instant>) -> bool {
    match (crawl_timeout, start) {
        (Some(limit), Some(began)) => began.elapsed() >= *limit,
        _ => false,
    }
}
6077
/// Byte markers that `is_html_content_check` searches for, compared
/// ASCII-case-insensitively.
static HTML_TAGS: phf::Set<&'static [u8]> = phf_set! {
    b"<!doctype html",
    b"<html",
    b"<document",
};
6084
6085pub fn is_html_content_check(bytes: &[u8]) -> bool {
6087 let check_bytes = if bytes.len() > 1024 {
6088 &bytes[..1024]
6089 } else {
6090 bytes
6091 };
6092
6093 for tag in HTML_TAGS.iter() {
6094 if check_bytes
6095 .windows(tag.len())
6096 .any(|window| window.eq_ignore_ascii_case(tag))
6097 {
6098 return true;
6099 }
6100 }
6101
6102 false
6103}
6104
#[cfg(not(feature = "balance"))]
/// Return the caller's `semaphore` unchanged.
///
/// Without the `balance` feature no CPU detection is performed, so `_detect` is
/// ignored.
pub async fn get_semaphore(semaphore: &Arc<Semaphore>, _detect: bool) -> &Arc<Semaphore> {
    semaphore
}
6110
/// Split `set` into `parts` buckets by dealing its elements out round-robin.
///
/// Bucket sizes differ by at most one. With `parts <= 1` the original set is
/// returned as the only bucket. Which element lands in which bucket follows the
/// set's iteration order and is therefore unspecified.
pub fn split_hashset_round_robin<T>(mut set: HashSet<T>, parts: usize) -> Vec<HashSet<T>>
where
    T: Eq + std::hash::Hash,
{
    if parts <= 1 {
        return vec![set];
    }

    let per_bucket = set.len() / parts + 1;
    let mut buckets: Vec<HashSet<T>> = Vec::with_capacity(parts);
    buckets.resize_with(parts, || HashSet::with_capacity(per_bucket));

    for (position, value) in set.drain().enumerate() {
        buckets[position % parts].insert(value);
    }

    buckets
}
6165#[cfg(feature = "tracing")]
6167pub fn emit_log(link: &str) {
6168 tracing::info!("fetch {}", &link);
6169}
6170#[cfg(not(feature = "tracing"))]
6172pub fn emit_log(link: &str) {
6173 log::info!("fetch {}", &link);
6174}
6175
6176#[cfg(feature = "tracing")]
6178pub fn emit_log_shutdown(link: &str) {
6179 tracing::info!("shutdown {}", &link);
6180}
6181#[cfg(not(feature = "tracing"))]
6183pub fn emit_log_shutdown(link: &str) {
6184 log::info!("shutdown {}", &link);
6185}