Skip to main content

spider/utils/
mod.rs

1/// Absolute path domain handling.
2pub mod abs;
3/// Connect layer for reqwest.
4pub mod connect;
5/// Generic CSS selectors.
6pub mod css_selectors;
7/// Fragment templates.
8pub mod templates;
9
10#[cfg(feature = "chrome")]
11pub(crate) mod detect_chrome;
12#[cfg(any(feature = "balance", feature = "disk"))]
13/// CPU and Memory detection to balance limitations.
14pub mod detect_system;
15/// Utils to modify the HTTP header.
16pub mod header_utils;
17/// String interner.
18pub mod interner;
19/// A trie struct.
20pub mod trie;
21/// Validate html false positives.
22pub mod validation;
23
24#[cfg(all(not(feature = "fs"), feature = "chrome"))]
25use crate::features::automation::RemoteMultimodalConfigs;
26use crate::{
27    page::{AntiBotTech, Metadata, STREAMING_CHUNK_SIZE},
28    RelativeSelectors,
29};
30use abs::parse_absolute_url;
31use aho_corasick::AhoCorasick;
32use auto_encoder::is_binary_file;
33use case_insensitive_string::CaseInsensitiveString;
34
35#[cfg(feature = "chrome")]
36use hashbrown::HashMap;
37use hashbrown::HashSet;
38
39use lol_html::{send::HtmlRewriter, OutputSink};
40use phf::phf_set;
41use reqwest::header::CONTENT_LENGTH;
42#[cfg(feature = "chrome")]
43use reqwest::header::{HeaderMap, HeaderValue};
44use std::{
45    error::Error,
46    future::Future,
47    str::FromStr,
48    sync::Arc,
49    time::{Duration, Instant},
50};
51use tokio::sync::Semaphore;
52use url::Url;
53
54#[cfg(feature = "chrome")]
55use crate::features::chrome_common::{AutomationScripts, ExecutionScripts};
56use crate::page::{MAX_PRE_ALLOCATED_HTML_PAGE_SIZE, MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE};
57use crate::tokio_stream::StreamExt;
58use crate::Client;
59
60#[cfg(feature = "cache_chrome_hybrid")]
61use http_cache_semantics::{RequestLike, ResponseLike};
62
63use log::{info, log_enabled, Level};
64
65#[cfg(not(feature = "wreq"))]
66use reqwest::{Response, StatusCode};
67#[cfg(feature = "wreq")]
68use wreq::{Response, StatusCode};
69
/// The request error (for plain `reqwest`, without caching middleware).
#[cfg(all(not(feature = "cache_request"), not(feature = "wreq")))]
pub(crate) type RequestError = reqwest::Error;

/// The request error (for `wreq`).
#[cfg(all(not(feature = "cache_request"), feature = "wreq"))]
pub(crate) type RequestError = wreq::Error;

/// The request error (for `reqwest_middleware` with caching).
#[cfg(feature = "cache_request")]
pub(crate) type RequestError = reqwest_middleware::Error;

/// The request response (`reqwest::Response`, or `wreq::Response` with the `wreq` feature).
pub(crate) type RequestResponse = Response;
83
/// Per-iteration sleep durations, in milliseconds, used by the chrome wait loops.
///
/// `wait_for_event` and `wait_for_selector` step through this table cyclically,
/// racing each sleep against the thing they are waiting for.
#[cfg(feature = "chrome")]
const WAIT_TIMEOUTS: [u64; 6] = [0, 20, 50, 100, 100, 500];
87// /// The wait for duration timeouts.
88// #[cfg(feature = "chrome")]
89// const DOM_WAIT_TIMEOUTS: [u64; 6] = [100, 200, 300, 300, 400, 500];
90
/// Content types (MIME types) to ignore.
///
/// The set lists non-HTML payloads: documents and archives, images, video,
/// audio, office formats, packages and flash.
pub static IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf_set! {
    "application/pdf",
    "application/zip",
    "application/x-rar-compressed",
    "application/x-tar",
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/bmp",
    "image/svg+xml",
    "video/mp4",
    "video/x-msvideo",
    "video/x-matroska",
    "video/webm",
    "audio/mpeg",
    "audio/ogg",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/x-7z-compressed",
    "application/x-rpm",
    "application/x-shockwave-flash",
};
117
lazy_static! {
    /// Apache server forbidden.
    ///
    /// The exact body that `detect_hard_forbidden_content` compares a response against.
    pub static ref APACHE_FORBIDDEN: &'static [u8; 317] = br#"<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
<p>Additionally, a 403 Forbidden
error was encountered while trying to use an ErrorDocument to handle the request.</p>
</body></html>"#;

    /// Open Resty forbidden.
    ///
    /// Only the leading part of the page; `detect_open_resty_forbidden` does a prefix match.
    pub static ref OPEN_RESTY_FORBIDDEN: &'static [u8; 125] = br#"<html><head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr><center>openresty</center>"#;

    /// Empty html.
    pub static ref EMPTY_HTML_BASIC: &'static [u8; 13] = b"<html></html>";

    /// Scan for error anti-bot pages.
    ///
    /// The order of the patterns is significant: `detect_anti_bot_from_body` maps the
    /// index of the matched pattern to an `AntiBotTech` variant.
    static ref AC_BODY_SCAN: AhoCorasick = AhoCorasick::new([
        "cf-error-code",
        "Access to this page has been denied",
        "DataDome",
        "perimeterx",
        "funcaptcha",
        "Request unsuccessful. Incapsula incident ID",
    ]).unwrap();

    /// Scan a URL for anti-bot vendor fingerprints.
    ///
    /// The order of the patterns is significant: `detect_antibot_from_url` maps the
    /// index noted beside each pattern to an `AntiBotTech` variant.
    static ref AC_URL_SCAN: AhoCorasick = AhoCorasick::builder()
        .match_kind(aho_corasick::MatchKind::LeftmostFirst) // leftmost match wins; ties go to the earlier-listed pattern
        .build([
            "/cdn-cgi/challenge-platform",       // 0
            "datadome.co",                       // 1
            "dd-api.io",                         // 2
            "perimeterx.net",                    // 3
            "px-captcha",                        // 4
            "arkoselabs.com",                    // 5
            "funcaptcha",                        // 6
            "kasada.io",                         // 7
            "fingerprint.com",                   // 8
            "fpjs.io",                           // 9
            "incapsula",                         // 10
            "imperva",                           // 11
            "radwarebotmanager",                 // 12
            "reblaze.com",                       // 13
            "cheq.ai",                           // 14
        ])
        .unwrap();
}
170
#[cfg(feature = "fs")]
lazy_static! {
    /// Temporary path prefix: `<system temp dir>/spider/` followed by the current
    /// unix time in seconds. Falls back to `/tmp/` when the `spider` directory
    /// cannot be created.
    static ref TMP_DIR: String = {
        let mut spider_tmp = std::env::temp_dir();

        spider_tmp.push("spider/");

        // Make sure the spider directory exists before handing out a path inside it.
        if std::fs::create_dir_all(&spider_tmp).is_ok() {
            let dir_name = spider_tmp.display().to_string();
            let since_epoch =
                std::time::SystemTime::now().duration_since(std::time::SystemTime::UNIX_EPOCH);

            match since_epoch {
                Ok(elapsed) => string_concat!(dir_name, elapsed.as_secs().to_string()),
                // Clock is before the epoch: use the bare directory name.
                Err(_) => dir_name,
            }
        } else {
            "/tmp/".to_string()
        }
    };
}
195
#[cfg(feature = "chrome")]
lazy_static! {
    /// Mask the chrome connection interception bytes from responses. Rejected responses send 17.0 bytes for the response.
    ///
    /// Enabled only when the `MASK_BYTES_INTERCEPTION` environment variable is exactly `true`.
    pub(crate) static ref MASK_BYTES_INTERCEPTION: bool =
        std::env::var("MASK_BYTES_INTERCEPTION").is_ok_and(|flag| flag == "true");
    /// Cloudflare turnstile wait: a 1000ms delay plus an 8s idle-network wait.
    pub(crate) static ref CF_WAIT_FOR: crate::features::chrome_common::WaitFor = {
        use crate::features::chrome_common::{WaitFor, WaitForDelay, WaitForIdleNetwork};

        WaitFor {
            delay: WaitForDelay::new(Some(core::time::Duration::from_millis(1000))).into(),
            idle_network: WaitForIdleNetwork::new(core::time::Duration::from_secs(8).into()).into(),
            // Every other wait condition keeps its default.
            ..Default::default()
        }
    };
}
211
212/// Detect if openresty hard 403 is forbidden and should not retry.
213#[inline(always)]
214pub fn detect_open_resty_forbidden(b: &[u8]) -> bool {
215    b.starts_with(*OPEN_RESTY_FORBIDDEN)
216}
217
218/// Detect if a page is forbidden and should not retry.
219#[inline(always)]
220pub fn detect_hard_forbidden_content(b: &[u8]) -> bool {
221    b == *APACHE_FORBIDDEN || detect_open_resty_forbidden(b)
222}
223
224lazy_static! {
225    /// Prevent fetching resources beyond the bytes limit.
226    pub(crate) static ref MAX_SIZE_BYTES: usize = {
227        match std::env::var("SPIDER_MAX_SIZE_BYTES") {
228            Ok(b) => {
229                const DEFAULT_MAX_SIZE_BYTES: usize = 1_073_741_824; // 1GB in bytes
230
231                let b = b.parse::<usize>().unwrap_or(DEFAULT_MAX_SIZE_BYTES);
232
233                if b == 0 {
234                    0
235                } else {
236                    b.max(1_048_576) // min 1mb
237                }
238            },
239            _ => 0
240        }
241    };
242}
243
/// The response of a web page.
///
/// Aggregates the body, headers, status and the optional per-feature extras
/// (screenshots, AI usage, chrome network maps, ...) collected while fetching a page.
#[derive(Debug, Default)]
pub struct PageResponse {
    /// The page response resource (raw body bytes).
    pub content: Option<Box<Vec<u8>>>,
    /// The headers of the response. (Always None if a webdriver protocol is used for fetching.).
    pub headers: Option<reqwest::header::HeaderMap>,
    #[cfg(feature = "remote_addr")]
    /// The remote address of the page.
    pub remote_addr: Option<core::net::SocketAddr>,
    #[cfg(feature = "cookies")]
    /// The cookies of the response.
    pub cookies: Option<reqwest::header::HeaderMap>,
    /// The status code of the request.
    pub status_code: StatusCode,
    /// The final url destination after any redirects.
    pub final_url: Option<String>,
    /// The message of the response error if any.
    pub error_for_status: Option<Result<Response, RequestError>>,
    #[cfg(feature = "chrome")]
    /// The screenshot bytes of the page. The ScreenShotConfig bytes boolean needs to be set to true.
    pub screenshot_bytes: Option<Vec<u8>>,
    #[cfg(feature = "openai")]
    /// The credits used from OpenAI in order.
    pub openai_credits_used: Option<Vec<crate::features::openai_common::OpenAIUsage>>,
    #[cfg(feature = "openai")]
    /// The extra data from the AI, example extracting data etc...
    pub extra_ai_data: Option<Vec<crate::page::AIResults>>,
    #[cfg(feature = "gemini")]
    /// The credits used from Gemini in order.
    pub gemini_credits_used: Option<Vec<crate::features::gemini_common::GeminiUsage>>,
    #[cfg(feature = "gemini")]
    /// The extra data from the Gemini AI.
    pub extra_gemini_data: Option<Vec<crate::page::AIResults>>,
    /// The usage from remote multimodal automation (extraction, etc.).
    /// Works with both Chrome and HTTP-only crawls.
    pub remote_multimodal_usage: Option<Vec<crate::features::automation::AutomationUsage>>,
    /// The extra data from the remote multimodal automation (extraction results, etc.).
    /// Works with both Chrome and HTTP-only crawls.
    pub extra_remote_multimodal_data: Option<Vec<crate::page::AutomationResults>>,
    /// A WAF was found on the page.
    pub waf_check: bool,
    /// The total bytes transferred for the page. Mainly used for chrome events. Inspect the content for bytes when using http instead.
    pub bytes_transferred: Option<f64>,
    /// The signature of the page to use for handling de-duplication.
    pub signature: Option<u64>,
    #[cfg(feature = "chrome")]
    /// All of the response events mapped with the amount of bytes used.
    pub response_map: Option<HashMap<String, f64>>,
    #[cfg(feature = "chrome")]
    /// All of the request events mapped with the time period of the event sent.
    pub request_map: Option<HashMap<String, f64>>,
    /// The anti-bot tech used.
    pub anti_bot_tech: crate::page::AntiBotTech,
    /// The metadata of the page.
    pub metadata: Option<Box<Metadata>>,
    /// The duration of the request.
    // NOTE(review): stored as an `Instant` rather than a `Duration`; presumably the
    // request start time from which the elapsed time is derived - confirm against callers.
    #[cfg(feature = "time")]
    pub duration: Option<tokio::time::Instant>,
}
304
/// Wait for the next event of type `T` on the page, optionally bounded by a timeout.
///
/// Subscribes to `T` through the page's event listener and returns when one of
/// the following happens:
/// - an event of type `T` is received;
/// - the event stream ends, since a closed stream can never deliver the event;
/// - `timeout` (when `Some`) elapses.
///
/// If the listener cannot be registered the function returns immediately.
#[cfg(feature = "chrome")]
pub async fn wait_for_event<T>(page: &chromiumoxide::Page, timeout: Option<core::time::Duration>)
where
    T: chromiumoxide::cdp::IntoEventKind + Unpin + std::fmt::Debug,
{
    if let Ok(mut events) = page.event_listener::<T>().await {
        let wait_until = async {
            let mut index = 0;

            loop {
                let current_timeout = WAIT_TIMEOUTS[index];
                let sleep = tokio::time::sleep(tokio::time::Duration::from_millis(current_timeout));

                tokio::select! {
                    _ = sleep => (),
                    // `Some(_)` means the event arrived. `None` means the stream is
                    // finished; continuing would poll an exhausted stream forever
                    // when no timeout is set, so stop waiting in both cases.
                    _ = events.next() => break,
                }

                index = (index + 1) % WAIT_TIMEOUTS.len();
            }
        };

        match timeout {
            Some(timeout) => {
                // Hitting the timeout is an expected outcome, not an error.
                let _ = tokio::time::timeout(timeout, wait_until).await;
            }
            None => wait_until.await,
        }
    }
}
337
/// Wait until `selector` can be found on the page.
///
/// Repeatedly races `page.find_element(selector)` against a sleep taken from
/// `WAIT_TIMEOUTS` (cycled). Returns `true` as soon as a lookup succeeds and
/// `false` when `timeout` elapses first. Without a timeout the function only
/// returns once the element is found.
#[cfg(feature = "chrome")]
pub async fn wait_for_selector(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) -> bool {
    let poll = async {
        let mut budgets = WAIT_TIMEOUTS.iter().copied().cycle();

        loop {
            let budget_ms = budgets.next().unwrap_or_default();
            let sleep = tokio::time::sleep(tokio::time::Duration::from_millis(budget_ms));

            let found = tokio::select! {
                _ = sleep => false,
                lookup = page.find_element(selector) => lookup.is_ok(),
            };

            if found {
                break true;
            }
        }
    };

    match timeout {
        // An elapsed timeout means the element was never found.
        Some(limit) => tokio::time::timeout(limit, poll).await.unwrap_or(false),
        None => poll.await,
    }
}
378
/// Wait for the dom to finish updating the target selector.
///
/// Generates a wait script with `generate_wait_for_dom_js_v2` and evaluates it
/// on the page. `timeout` defaults to 1200ms and is passed to the script in
/// milliseconds. The evaluation itself is bounded by a hard limit of
/// `timeout + 200ms`, and its result is ignored.
#[cfg(feature = "chrome")]
pub async fn wait_for_dom(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) {
    let max = timeout.unwrap_or(core::time::Duration::from_millis(1200));

    // Saturate instead of silently wrapping when the timeout exceeds `u32::MAX` ms.
    let max_ms = u32::try_from(max.as_millis()).unwrap_or(u32::MAX);

    let script = crate::features::chrome_common::generate_wait_for_dom_js_v2(
        max_ms,
        selector,
        500,
        2,
        true,
        false,
    );

    // Small grace period on top of the script's own budget; saturating so that a
    // huge caller-supplied timeout cannot panic on overflow.
    let hard = max.saturating_add(core::time::Duration::from_millis(200));

    // Both the script result and a hard-limit timeout are deliberately ignored.
    let _ = tokio::time::timeout(hard, page.evaluate(script)).await;
}
408
/// Get the output path of a screenshot and create any parent folders if needed.
///
/// The file name is `target_url` percent-encoded (every non-alphanumeric byte
/// escaped) followed by `format`, joined onto `base`. Failure to create the
/// parent directory is ignored.
#[cfg(feature = "chrome")]
pub async fn create_output_path(
    base: &std::path::PathBuf,
    target_url: &str,
    format: &str,
) -> String {
    let encoded_url = percent_encoding::percent_encode(
        target_url.as_bytes(),
        percent_encoding::NON_ALPHANUMERIC,
    )
    .to_string();

    let file_name = string_concat!(&encoded_url, format);
    let full_path = base.join(&file_name);

    if let Some(parent) = full_path.parent() {
        let _ = tokio::fs::create_dir_all(&parent).await;
    }

    full_path.display().to_string()
}
433
#[cfg(feature = "chrome")]
/// Wait for page events. Does nothing when `wait_for` is `None`.
///
/// Each configured wait runs in this order:
/// 1. Idle network (the next `EventLoadingFinished`).
/// 2. Almost-idle network.
/// 3. Fully idle network.
/// 4. Selector to appear.
/// 5. Dom element to finish updating.
/// 6. Hard delay.
pub async fn page_wait(
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
) {
    let Some(wait_for) = wait_for else {
        return;
    };

    if let Some(wait) = &wait_for.idle_network {
        wait_for_event::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
            page,
            wait.timeout,
        )
        .await;
    }

    if let Some(wait) = &wait_for.almost_idle_network0 {
        match wait.timeout {
            Some(timeout) => {
                let _ = page
                    .wait_for_network_almost_idle_with_timeout(timeout)
                    .await;
            }
            None => {
                let _ = page.wait_for_network_almost_idle().await;
            }
        }
    }

    if let Some(wait) = &wait_for.idle_network0 {
        match wait.timeout {
            Some(timeout) => {
                let _ = page.wait_for_network_idle_with_timeout(timeout).await;
            }
            None => {
                let _ = page.wait_for_network_idle().await;
            }
        }
    }

    if let Some(wait) = &wait_for.selector {
        wait_for_selector(page, wait.timeout, &wait.selector).await;
    }

    if let Some(wait) = &wait_for.dom {
        wait_for_dom(page, wait.timeout, &wait.selector).await;
    }

    // A delay entry without a timeout is a no-op.
    if let Some(pause) = wait_for.delay.as_ref().and_then(|wait| wait.timeout) {
        tokio::time::sleep(pause).await;
    }
}
486
#[derive(Debug, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg(feature = "openai")]
/// The json response from OpenAI.
///
/// Produced by `handle_ai_data` from the raw JSON text and consumed by
/// `handle_extra_ai_data`.
pub struct JsonResponse {
    /// The content returned.
    content: Vec<String>,
    /// The js script for the browser.
    js: String,
    #[cfg_attr(feature = "serde", serde(default))]
    /// The AI failed to parse the data. Defaults to `None` when absent from the JSON.
    error: Option<String>,
}
500
501/// Handle the OpenAI credits used. This does nothing without 'openai' feature flag.
502#[cfg(feature = "openai")]
503pub fn handle_openai_credits(
504    page_response: &mut PageResponse,
505    tokens_used: crate::features::openai_common::OpenAIUsage,
506) {
507    match page_response.openai_credits_used.as_mut() {
508        Some(v) => v.push(tokens_used),
509        None => page_response.openai_credits_used = Some(vec![tokens_used]),
510    };
511}
512
513#[cfg(not(feature = "openai"))]
514/// Handle the OpenAI credits used. This does nothing without 'openai' feature flag.
515pub fn handle_openai_credits(
516    _page_response: &mut PageResponse,
517    _tokens_used: crate::features::openai_common::OpenAIUsage,
518) {
519}
520
521#[cfg(feature = "gemini")]
522/// Handle the Gemini credits used.
523pub fn handle_gemini_credits(
524    page_response: &mut PageResponse,
525    tokens_used: crate::features::gemini_common::GeminiUsage,
526) {
527    match page_response.gemini_credits_used.as_mut() {
528        Some(v) => v.push(tokens_used),
529        None => page_response.gemini_credits_used = Some(vec![tokens_used]),
530    };
531}
532
533#[cfg(not(feature = "gemini"))]
534/// Handle the Gemini credits used. This does nothing without 'gemini' feature flag.
535pub fn handle_gemini_credits(
536    _page_response: &mut PageResponse,
537    _tokens_used: crate::features::gemini_common::GeminiUsage,
538) {
539}
540
/// Store an extra OpenAI result on the page response. Only available with the 'openai' feature flag.
///
/// Builds an `AIResults` entry from the prompt, the parsed response `x` (its
/// content items have leading whitespace trimmed), the optional screenshot and
/// the optional error, then appends it to `extra_ai_data`.
#[cfg(feature = "openai")]
pub fn handle_extra_ai_data(
    page_response: &mut PageResponse,
    prompt: &str,
    x: JsonResponse,
    screenshot_output: Option<Vec<u8>>,
    error: Option<String>,
) {
    let JsonResponse { content, js, .. } = x;

    let content_output = content
        .iter()
        .map(|entry| entry.trim_start().into())
        .collect::<Vec<_>>();

    let ai_response = crate::page::AIResults {
        input: prompt.into(),
        js_output: js,
        content_output,
        screenshot_output,
        error,
    };

    page_response
        .extra_ai_data
        .get_or_insert_with(Vec::new)
        .push(ai_response);
}
567
/// Accepts different header types (for flexibility).
///
/// Lets the anti-bot header detection work on either a typed header map or
/// the plain string map built from chrome responses.
pub enum HeaderSource<'a> {
    /// From reqwest or internal HeaderMap.
    HeaderMap(&'a crate::client::header::HeaderMap),
    /// From a string-based HashMap.
    Map(&'a std::collections::HashMap<String, String>),
}
575
576#[inline(always)]
577/// Has the header value.
578fn header_value<'a>(headers: &'a HeaderSource, key: &str) -> Option<&'a str> {
579    match headers {
580        HeaderSource::HeaderMap(hm) => hm.get(key).and_then(|v| v.to_str().ok()),
581        HeaderSource::Map(map) => map.get(key).map(|s| s.as_str()),
582    }
583}
584
585#[inline(always)]
586/// Has the header key.
587fn has_key(headers: &HeaderSource, key: &str) -> bool {
588    match headers {
589        HeaderSource::HeaderMap(hm) => hm.contains_key(key),
590        HeaderSource::Map(map) => map.contains_key(key),
591    }
592}
593
#[inline(always)]
/// Compare `a` with `b`, ignoring ASCII case and any whitespace surrounding `a`.
///
/// Only `a` is trimmed; `b` is compared as given.
fn eq_icase_trim(a: &str, b: &str) -> bool {
    let candidate = a.trim();

    candidate.eq_ignore_ascii_case(b)
}
599
600/// Detect from headers (optimized: minimal lookups, no allocations).
601#[inline]
602pub fn detect_anti_bot_from_headers(headers: &HeaderSource) -> Option<AntiBotTech> {
603    // Cloudflare
604    if has_key(headers, "cf-chl-bypass") || has_key(headers, "cf-ray") {
605        return Some(AntiBotTech::Cloudflare);
606    }
607
608    // DataDome
609    if has_key(headers, "x-captcha-endpoint") {
610        return Some(AntiBotTech::DataDome);
611    }
612
613    // PerimeterX
614    if has_key(headers, "x-perimeterx") || has_key(headers, "pxhd") {
615        return Some(AntiBotTech::PerimeterX);
616    }
617
618    // Akamai
619    if has_key(headers, "x-akamaibot") {
620        return Some(AntiBotTech::AkamaiBotManager);
621    }
622
623    // Imperva (strong signals first)
624    if has_key(headers, "x-imperva-id") || has_key(headers, "x-iinfo") {
625        return Some(AntiBotTech::Imperva);
626    }
627
628    // Reblaze
629    if has_key(headers, "x-reblaze-uuid") {
630        return Some(AntiBotTech::Reblaze);
631    }
632
633    if header_value(headers, "x-cdn").is_some_and(|v| eq_icase_trim(v, "imperva")) {
634        return Some(AntiBotTech::Imperva);
635    }
636
637    None
638}
639
640/// Detect the anti-bot technology.
641pub fn detect_anti_bot_from_body(body: &Vec<u8>) -> Option<AntiBotTech> {
642    // Scan body for anti-bot fingerprints (only for small pages)
643    if body.len() < 30_000 {
644        if let Ok(finder) = AC_BODY_SCAN.try_find_iter(body) {
645            for mat in finder {
646                match mat.pattern().as_usize() {
647                    0 => return Some(AntiBotTech::Cloudflare),
648                    1 | 2 => return Some(AntiBotTech::DataDome),
649                    3 => return Some(AntiBotTech::PerimeterX),
650                    4 => return Some(AntiBotTech::ArkoseLabs),
651                    5 => return Some(AntiBotTech::Imperva),
652                    _ => (),
653                }
654            }
655        }
656    }
657
658    None
659}
660
661/// Detect antibot from url
662pub fn detect_antibot_from_url(url: &str) -> Option<AntiBotTech> {
663    if let Some(mat) = AC_URL_SCAN.find(url) {
664        let tech = match mat.pattern().as_usize() {
665            0 => AntiBotTech::Cloudflare,
666            1 | 2 => AntiBotTech::DataDome,
667            3 | 4 => AntiBotTech::PerimeterX,
668            5 | 6 => AntiBotTech::ArkoseLabs,
669            7 => AntiBotTech::Kasada,
670            8 | 9 => AntiBotTech::FingerprintJS,
671            10 | 11 => AntiBotTech::Imperva,
672            12 => AntiBotTech::RadwareBotManager,
673            13 => AntiBotTech::Reblaze,
674            14 => AntiBotTech::CHEQ,
675            _ => return None,
676        };
677        Some(tech)
678    } else {
679        None
680    }
681}
682
/// Flip the protocol of a url: `http://` becomes `https://` and vice versa.
///
/// The scheme is matched ASCII case-insensitively (URL schemes are
/// case-insensitive), so `HTTP://example.com` is flipped too. The emitted scheme
/// is always lowercase and the rest of the url is left untouched.
///
/// Returns `None` when the url starts with neither scheme.
pub fn flip_http_https(url: &str) -> Option<String> {
    /// Return the remainder of `url` after `scheme` when it starts with it, ignoring ASCII case.
    fn strip_scheme<'a>(url: &'a str, scheme: &str) -> Option<&'a str> {
        // `get` yields `None` for short inputs and for non-char-boundary cuts,
        // so the slice below can never panic.
        let head = url.get(..scheme.len())?;

        if head.eq_ignore_ascii_case(scheme) {
            Some(&url[scheme.len()..])
        } else {
            None
        }
    }

    if let Some(rest) = strip_scheme(url, "http://") {
        Some(format!("https://{rest}"))
    } else {
        strip_scheme(url, "https://").map(|rest| format!("http://{rest}"))
    }
}
693
694/// Detect the anti-bot used from the request.
695pub fn detect_anti_bot_tech_response(
696    url: &str,
697    headers: &HeaderSource,
698    body: &Vec<u8>,
699    subject_name: Option<&str>,
700) -> AntiBotTech {
701    // Check by TLS subject (Chrome/CDP TLS details)
702    if let Some(subject) = subject_name {
703        if subject == "challenges.cloudflare.com" {
704            return AntiBotTech::Cloudflare;
705        }
706    }
707
708    if let Some(tech) = detect_anti_bot_from_headers(headers) {
709        return tech;
710    }
711
712    if let Some(tech) = detect_antibot_from_url(url) {
713        return tech;
714    }
715
716    if let Some(anti_bot) = detect_anti_bot_from_body(body) {
717        return anti_bot;
718    }
719
720    AntiBotTech::None
721}
722
/// Parse the raw JSON text into a `JsonResponse`. Only available with the 'openai' feature flag.
///
/// Returns `None` when `js` is not valid JSON for that struct.
#[cfg(feature = "openai")]
pub fn handle_ai_data(js: &str) -> Option<JsonResponse> {
    serde_json::from_str::<JsonResponse>(js).ok()
}
731
#[cfg(feature = "chrome")]
#[derive(Default, Clone, Debug)]
/// The chrome HTTP response.
///
/// Summary of a chrome navigation, built by `perform_chrome_http_request`.
pub struct ChromeHTTPReqRes {
    /// Is the request blocked by a firewall?
    pub waf_check: bool,
    /// The HTTP status code.
    pub status_code: StatusCode,
    /// The HTTP method of the request.
    pub method: String,
    /// The HTTP response headers for the request.
    pub response_headers: std::collections::HashMap<String, String>,
    /// The HTTP request headers for the request.
    pub request_headers: std::collections::HashMap<String, String>,
    /// The HTTP protocol of the request.
    pub protocol: String,
    /// The anti-bot tech used.
    pub anti_bot_tech: crate::page::AntiBotTech,
}
751
#[cfg(feature = "chrome")]
impl ChromeHTTPReqRes {
    /// Is this an empty default?
    ///
    /// True when the method, protocol and both header maps are empty and no
    /// anti-bot tech was detected. `status_code` and `waf_check` are not considered.
    pub fn is_empty(&self) -> bool {
        let no_headers = self.request_headers.is_empty() && self.response_headers.is_empty();
        let no_request_line = self.method.is_empty() && self.protocol.is_empty();

        no_request_line
            && no_headers
            && matches!(self.anti_bot_tech, crate::page::AntiBotTech::None)
    }
}
763
#[cfg(feature = "chrome")]
/// Is the error a cipher mismatch?
///
/// Checks whether the error message contains
/// `net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH`, reading the chrome message directly
/// when available and the error's display text otherwise.
fn is_cipher_mismatch(err: &chromiumoxide::error::CdpError) -> bool {
    const CIPHER_MISMATCH: &str = "net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH";

    if let chromiumoxide::error::CdpError::ChromeMessage(msg) = err {
        msg.contains(CIPHER_MISMATCH)
    } else {
        err.to_string().contains(CIPHER_MISMATCH)
    }
}
776
#[cfg(feature = "chrome")]
/// Perform a chrome http request.
///
/// Navigates `page` to `source` (with the optional `referrer`) and summarises
/// the outcome as a [`ChromeHTTPReqRes`]: method, protocol, request/response
/// headers, status code, detected anti-bot tech and whether a firewall (WAF)
/// appears to have blocked the request.
///
/// If the navigation fails with `net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH`, the
/// request is retried once with the scheme flipped (`http` <-> `https`).
///
/// # Errors
///
/// Returns the `CdpError` of the navigation. When a flipped retry was made, the
/// retry's result is returned instead.
pub async fn perform_chrome_http_request(
    page: &chromiumoxide::Page,
    source: &str,
    referrer: Option<String>,
) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
    /// Run a single navigation to `source` and collect the response details.
    async fn attempt_once(
        page: &chromiumoxide::Page,
        source: &str,
        referrer: Option<String>,
    ) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
        // Defaults reported when the navigation yields no request/response details.
        let mut waf_check = false;
        let mut status_code = StatusCode::OK;
        let mut method = String::from("GET");
        let mut response_headers: std::collections::HashMap<String, String> =
            std::collections::HashMap::default();
        let mut request_headers = std::collections::HashMap::default();
        let mut protocol = String::from("http/1.1");
        let mut anti_bot_tech = AntiBotTech::default();

        let frame_id = page.mainframe().await?;

        let page_base =
            page.http_future(chromiumoxide::cdp::browser_protocol::page::NavigateParams {
                url: source.to_string(),
                transition_type: Some(
                    chromiumoxide::cdp::browser_protocol::page::TransitionType::Other,
                ),
                frame_id,
                referrer,
                referrer_policy: None,
            })?;

        if let Some(http_request) = page_base.await? {
            if let Some(http_method) = http_request.method.as_deref() {
                method = http_method.into();
            }

            request_headers.clone_from(&http_request.headers);

            if let Some(response) = &http_request.response {
                if let Some(p) = &response.protocol {
                    protocol.clone_from(p);
                }

                if let Some(res_headers) = response.headers.inner().as_object() {
                    for (k, v) in res_headers {
                        // NOTE: `v` is a JSON value, so `to_string()` keeps the JSON
                        // encoding; string header values are stored with quotes.
                        response_headers.insert(k.to_string(), v.to_string());
                    }
                }

                let mut firewall = false;

                waf_check = detect_antibot_from_url(&response.url).is_some();

                // IMPORTANT: compare against the attempted URL (source param),
                // so retries behave correctly.
                if !response.url.starts_with(source) {
                    match &response.security_details {
                        Some(security_details) => {
                            anti_bot_tech = detect_anti_bot_tech_response(
                                &response.url,
                                &HeaderSource::Map(&response_headers),
                                &Default::default(),
                                Some(&security_details.subject_name),
                            );
                            firewall = true;
                        }
                        _ => {
                            anti_bot_tech = detect_anti_bot_tech_response(
                                &response.url,
                                &HeaderSource::Map(&response_headers),
                                &Default::default(),
                                None,
                            );
                            if anti_bot_tech == AntiBotTech::Cloudflare {
                                if let Some(xframe_options) =
                                    response_headers.get("x-frame-options")
                                {
                                    // The stored value is JSON-encoded (`"DENY"`). The
                                    // previous check compared against the raw literal
                                    // `\"DENY\"` (with backslashes) and could never
                                    // match. Strip the quoting before comparing; the
                                    // old escaped form is still accepted.
                                    let value = xframe_options
                                        .trim_matches(|c: char| c == '"' || c == '\\');

                                    if value.eq_ignore_ascii_case("DENY") {
                                        firewall = true;
                                    }
                                } else if let Some(encoding) =
                                    response_headers.get("Accept-Encoding")
                                {
                                    if encoding == r#"cf-ray"# {
                                        firewall = true;
                                    }
                                }
                            } else {
                                firewall = true;
                            }
                        }
                    };

                    waf_check =
                        waf_check || (firewall && !matches!(anti_bot_tech, AntiBotTech::None));

                    if !waf_check {
                        waf_check = match &response.protocol {
                            Some(protocol) => protocol == "blob",
                            _ => false,
                        }
                    }
                }

                status_code = StatusCode::from_u16(response.status as u16)
                    .unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
            } else if let Some(failure_text) = &http_request.failure_text {
                // No response at all: a generic network failure is treated as a block.
                if failure_text == "net::ERR_FAILED" {
                    waf_check = true;
                }
            }
        }

        Ok(ChromeHTTPReqRes {
            waf_check,
            status_code,
            method,
            response_headers,
            request_headers,
            protocol,
            anti_bot_tech,
        })
    }

    match attempt_once(page, source, referrer.clone()).await {
        Ok(ok) => Ok(ok),
        Err(e) => {
            // Retry once on the other scheme when the TLS handshake is rejected.
            if is_cipher_mismatch(&e) {
                if let Some(flipped) = flip_http_https(source) {
                    return attempt_once(page, &flipped, referrer).await;
                }
            }
            Err(e)
        }
    }
}
921
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
/// Perform a http future with chrome cached.
///
/// Navigates `page` to `source` through
/// `Page::http_future_with_cache_intercept_enabled` and folds the outcome into a
/// [`ChromeHTTPReqRes`]: request method, request/response headers, protocol,
/// status code, the detected anti-bot technology and a `waf_check` flag.
///
/// * `page` - the chrome page used for the navigation.
/// * `source` - the url to navigate to.
/// * `referrer` - optional referrer forwarded in the navigate command.
/// * `cache_options` - passed to `cache_auth_token` to derive the auth value.
/// * `cache_policy` - converted with `from_basic()` before being handed to the page.
///
/// # Errors
///
/// Returns the `CdpError` of the navigation. When the first attempt fails with an
/// error for which `is_cipher_mismatch` is true and `flip_http_https(source)`
/// yields a url, the navigation is retried once against that url and the result
/// of the retry is returned instead.
pub async fn perform_chrome_http_request_cache(
    page: &chromiumoxide::Page,
    source: &str,
    referrer: Option<String>,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
    /// A single navigation attempt against `source`, without any scheme fallback.
    async fn attempt_once(
        page: &chromiumoxide::Page,
        source: &str,
        referrer: Option<String>,
        cache_options: &Option<CacheOptions>,
        cache_policy: &Option<BasicCachePolicy>,
    ) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
        // Defaults that are reported when the navigation yields no response details.
        let mut waf_check = false;
        let mut status_code = StatusCode::OK;
        let mut method = String::from("GET");
        let mut response_headers: std::collections::HashMap<String, String> =
            std::collections::HashMap::default();
        let mut request_headers = std::collections::HashMap::default();
        let mut protocol = String::from("http/1.1");
        let mut anti_bot_tech = AntiBotTech::default();

        let frame_id = page.mainframe().await?;

        let cmd = chromiumoxide::cdp::browser_protocol::page::NavigateParams {
            url: source.to_string(),
            transition_type: Some(
                chromiumoxide::cdp::browser_protocol::page::TransitionType::Other,
            ),
            frame_id,
            referrer,
            referrer_policy: None,
        };

        let auth_opt = cache_auth_token(cache_options);
        let cache_policy = cache_policy.as_ref().map(|f| f.from_basic());
        // No cache strategy or remote is supplied by this entry point.
        let cache_strategy = None;
        let remote = None;

        let page_base = page.http_future_with_cache_intercept_enabled(
            cmd,
            auth_opt,
            cache_policy,
            cache_strategy,
            remote,
        );

        match page_base.await {
            Ok(http_request) => {
                if let Some(http_method) = http_request.method.as_deref() {
                    method = http_method.into();
                }

                request_headers.clone_from(&http_request.headers);

                if let Some(response) = &http_request.response {
                    if let Some(p) = &response.protocol {
                        protocol.clone_from(p);
                    }

                    // Header values are stored using the value's `to_string()` output.
                    // NOTE(review): if the values are JSON values, string headers are
                    // presumably stored with their surrounding quotes - confirm.
                    if let Some(res_headers) = response.headers.inner().as_object() {
                        for (k, v) in res_headers {
                            response_headers.insert(k.to_string(), v.to_string());
                        }
                    }

                    let mut firewall = false;

                    // The final url alone may already identify an anti-bot page.
                    waf_check = detect_antibot_from_url(&response.url).is_some();

                    // Only inspect the response further when the final url no longer
                    // starts with the requested url.
                    if !response.url.starts_with(source) {
                        match &response.security_details {
                            Some(security_details) => {
                                // With security details the certificate subject name is
                                // included in the detection and the firewall flag is set.
                                anti_bot_tech = detect_anti_bot_tech_response(
                                    &response.url,
                                    &HeaderSource::Map(&response_headers),
                                    &Default::default(),
                                    Some(&security_details.subject_name),
                                );
                                firewall = true;
                            }
                            _ => {
                                anti_bot_tech = detect_anti_bot_tech_response(
                                    &response.url,
                                    &HeaderSource::Map(&response_headers),
                                    &Default::default(),
                                    None,
                                );
                                if anti_bot_tech == AntiBotTech::Cloudflare {
                                    if let Some(xframe_options) =
                                        response_headers.get("x-frame-options")
                                    {
                                        // NOTE(review): the raw string below contains literal
                                        // backslashes (`\"DENY\"`); verify that stored header
                                        // values can actually take this form.
                                        if xframe_options == r#"\"DENY\""# {
                                            firewall = true;
                                        }
                                    } else if let Some(encoding) =
                                        response_headers.get("Accept-Encoding")
                                    {
                                        // NOTE(review): comparing `Accept-Encoding` against
                                        // `cf-ray` looks unusual - confirm the intent.
                                        if encoding == r#"cf-ray"# {
                                            firewall = true;
                                        }
                                    }
                                } else {
                                    firewall = true;
                                }
                            }
                        };

                        // `&&` binds tighter than `||`: keep an earlier positive check, or
                        // flag when the firewall hint coincides with a detected technology.
                        waf_check =
                            waf_check || firewall && !matches!(anti_bot_tech, AntiBotTech::None);

                        if !waf_check {
                            // A `blob` protocol on the response is also flagged.
                            waf_check = match &response.protocol {
                                Some(protocol) => protocol == "blob",
                                _ => false,
                            }
                        }
                    }

                    // Statuses that are not valid HTTP status codes fall back to 417.
                    status_code = StatusCode::from_u16(response.status as u16)
                        .unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
                } else if let Some(failure_text) = &http_request.failure_text {
                    // No response at all: a generic network failure is flagged.
                    if failure_text == "net::ERR_FAILED" {
                        waf_check = true;
                    }
                }
            }
            Err(e) => return Err(e),
        }

        Ok(ChromeHTTPReqRes {
            waf_check,
            status_code,
            method,
            response_headers,
            request_headers,
            protocol,
            anti_bot_tech,
        })
    }

    // First attempt with the url as given; on a cipher mismatch retry once with the
    // url produced by `flip_http_https`.
    match attempt_once(page, source, referrer.clone(), cache_options, cache_policy).await {
        Ok(ok) => Ok(ok),
        Err(e) => {
            if is_cipher_mismatch(&e) {
                if let Some(flipped) = flip_http_https(source) {
                    return attempt_once(page, &flipped, referrer, cache_options, cache_policy)
                        .await;
                }
            }
            Err(e)
        }
    }
}
1079
/// Use OpenAI to extend the crawl. This does nothing without 'openai' feature flag.
///
/// This is the variant compiled when `chrome` is enabled and `openai` is not:
/// every argument is ignored and the body is empty.
#[cfg(all(feature = "chrome", not(feature = "openai")))]
pub async fn run_openai_request(
    _source: &str,
    _page: &chromiumoxide::Page,
    _wait_for: &Option<crate::configuration::WaitFor>,
    _openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    _page_response: &mut PageResponse,
    _ok: bool,
) {
}
1091
/// Use OpenAI to extend the crawl. This does nothing without 'openai' feature flag.
///
/// For every prompt of the selected configuration the page content is sent to
/// `openai_request`, the returned script is evaluated on `page`, and
/// `page_response` is updated with the resulting HTML, the token usage and,
/// when `extra_ai_data` is set, the extra data and an optional screenshot.
///
/// * `source` - the url of the page; also the key used for `prompt_url_map`.
/// * `page` - the chrome page the generated script is evaluated on.
/// * `wait_for` - forwarded to `page_wait` after a script produced HTML.
/// * `openai_config` - the configuration; nothing happens when it is `None`.
/// * `page_response` - the response that is updated in place.
/// * `ok` - when `false` no request is sent and a default result is used.
#[cfg(all(feature = "chrome", feature = "openai"))]
pub async fn run_openai_request(
    source: &str,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    mut page_response: &mut PageResponse,
    ok: bool,
) {
    if let Some(gpt_configs) = openai_config {
        // Pick the configuration: with a url map, look up the full url first and,
        // when `paths_map` is set, fall back to the path of the url. Without a map
        // the top-level configuration is used as is.
        let gpt_configs = match &gpt_configs.prompt_url_map {
            Some(h) => {
                let c = h.get::<case_insensitive_string::CaseInsensitiveString>(&source.into());

                if !c.is_some() && gpt_configs.paths_map {
                    h.get::<case_insensitive_string::CaseInsensitiveString>(
                        &get_path_from_url(&source).into(),
                    )
                } else {
                    c
                }
            }
            _ => Some(gpt_configs),
        };

        if let Some(gpt_configs) = gpt_configs {
            // Work on a copy so that stepping through the prompts leaves the config untouched.
            let mut prompts = gpt_configs.prompt.clone();

            while let Some(prompt) = prompts.next() {
                // Only send a request when a model is configured and the page is ok;
                // otherwise continue with a default (empty) result.
                let gpt_results = if !gpt_configs.model.is_empty() && ok {
                    openai_request(
                        gpt_configs,
                        match page_response.content.as_ref() {
                            Some(html) => auto_encoder::auto_encode_bytes(html),
                            _ => Default::default(),
                        },
                        &source,
                        &prompt,
                    )
                    .await
                } else {
                    Default::default()
                };

                let js_script = gpt_results.response;
                let tokens_used = gpt_results.usage;
                let gpt_error = gpt_results.error;

                // set the credits used for the request
                handle_openai_credits(&mut page_response, tokens_used);

                // With `extra_ai_data` the response goes through `handle_ai_data`;
                // otherwise the raw response is treated as the script to run.
                let json_res = if gpt_configs.extra_ai_data {
                    match handle_ai_data(&js_script) {
                        Some(jr) => jr,
                        _ => {
                            let mut jr = JsonResponse::default();
                            jr.error = Some("An issue occured with serialization.".into());

                            jr
                        }
                    }
                } else {
                    let mut x = JsonResponse::default();
                    x.js = js_script;
                    x
                };

                // perform the js script on the page.
                if !json_res.js.is_empty() {
                    // The script is wrapped in an async function that returns the
                    // document's outer HTML once the script has run.
                    let html: Option<Box<Vec<u8>>> = match page
                        .evaluate_function(string_concat!(
                            "async function() { ",
                            json_res.js,
                            "; return document.documentElement.outerHTML; }"
                        ))
                        .await
                    {
                        Ok(h) => match h.into_value() {
                            Ok(hh) => Some(hh),
                            _ => None,
                        },
                        _ => None,
                    };

                    if html.is_some() {
                        page_wait(&page, &wait_for).await;
                        // Short scripts that mention `window.location` re-read the HTML
                        // from the page instead of using the evaluated result.
                        // NOTE(review): presumably because such scripts navigate away - confirm.
                        if json_res.js.len() <= 400 && json_res.js.contains("window.location") {
                            if let Ok(b) = page.outer_html_bytes().await {
                                page_response.content = Some(b.into());
                            }
                        } else {
                            page_response.content = html;
                        }
                    }
                }

                // attach the data to the page
                if gpt_configs.extra_ai_data {
                    // A screenshot is only taken when requested and a script was produced.
                    let screenshot_bytes = if gpt_configs.screenshot && !json_res.js.is_empty() {
                        let format = chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png;

                        let screenshot_configs = chromiumoxide::page::ScreenshotParams::builder()
                            .format(format)
                            .full_page(true)
                            .quality(45)
                            .omit_background(false);

                        match page.screenshot(screenshot_configs.build()).await {
                            Ok(b) => {
                                log::debug!("took screenshot: {:?}", source);
                                Some(b)
                            }
                            Err(e) => {
                                log::error!("failed to take screenshot: {:?} - {:?}", e, source);
                                None
                            }
                        }
                    } else {
                        None
                    };

                    handle_extra_ai_data(
                        page_response,
                        &prompt,
                        json_res,
                        screenshot_bytes,
                        gpt_error,
                    );
                }
            }
        }
    }
}
1226
/// Represents an HTTP version.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum HttpVersion {
    /// HTTP Version 0.9
    Http09,
    /// HTTP Version 1.0
    Http10,
    /// HTTP Version 1.1
    Http11,
    /// HTTP Version 2.0
    H2,
    /// HTTP Version 3.0
    H3,
}
1242
/// A basic generic type that represents an HTTP response.
///
/// All parts are owned by the value: the raw body bytes, the headers as a
/// string map, the numeric status, the parsed url and the HTTP version.
#[derive(Debug, Clone)]
pub struct HttpResponse {
    /// HTTP response body
    pub body: Vec<u8>,
    /// HTTP response headers
    pub headers: std::collections::HashMap<String, String>,
    /// HTTP response status code
    pub status: u16,
    /// HTTP response url
    pub url: url::Url,
    /// HTTP response version
    pub version: HttpVersion,
}
1257
/// An HTTP request type for caching.
///
/// Carries the request parts exposed through the `RequestLike` trait.
#[cfg(feature = "cache_chrome_hybrid")]
pub struct HttpRequestLike {
    /// The URI component of a request.
    pub uri: http::uri::Uri,
    /// The http method.
    pub method: reqwest::Method,
    /// The http headers.
    pub headers: http::HeaderMap,
}
1268
#[cfg(feature = "cache_chrome_hybrid")]
/// An HTTP response type for caching.
///
/// Carries the response parts exposed through the `ResponseLike` trait.
pub struct HttpResponseLike {
    /// The http status code.
    pub status: StatusCode,
    /// The http headers.
    pub headers: http::HeaderMap,
}
1277
/// Exposes the stored request parts through the `RequestLike` trait.
#[cfg(feature = "cache_chrome_hybrid")]
impl RequestLike for HttpRequestLike {
    /// Returns a clone of the stored uri.
    fn uri(&self) -> http::uri::Uri {
        self.uri.clone()
    }
    /// Returns `true` when `other` equals the stored uri.
    fn is_same_uri(&self, other: &http::Uri) -> bool {
        &self.uri == other
    }
    /// Returns the stored method.
    fn method(&self) -> &reqwest::Method {
        &self.method
    }
    /// Returns the stored headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
}
1293
/// Exposes the stored response parts through the `ResponseLike` trait.
#[cfg(feature = "cache_chrome_hybrid")]
impl ResponseLike for HttpResponseLike {
    /// Returns the stored status code.
    fn status(&self) -> StatusCode {
        self.status
    }
    /// Returns the stored headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
}
1303
/// Convert a string keyed header table into a `reqwest` header map.
///
/// Entries whose name or value cannot be parsed as an HTTP header are skipped.
/// As a guard against malformed header sets only the first 1002 entries yielded
/// by the map iterator are looked at.
#[cfg(any(
    feature = "cache_chrome_hybrid",
    feature = "headers",
    feature = "cookies"
))]
pub fn convert_headers(
    headers: &std::collections::HashMap<String, String>,
) -> reqwest::header::HeaderMap {
    use reqwest::header::{HeaderMap, HeaderName, HeaderValue};
    use std::str::FromStr;

    // Entries with index 0..=1001 are processed before the guard stops the walk.
    const MAX_ENTRIES: usize = 1002;

    let mut converted = HeaderMap::new();

    for (raw_name, raw_value) in headers.iter().take(MAX_ENTRIES) {
        let parsed = (
            HeaderValue::from_str(raw_value),
            HeaderName::from_str(raw_name),
        );

        if let (Ok(value), Ok(name)) = parsed {
            converted.insert(name, value);
        }
    }

    converted
}
1330
#[cfg(feature = "cache_chrome_hybrid")]
/// Store the page to cache to be re-used across HTTP request.
///
/// Builds a `CachePolicy` from the request (uri of the response url, `method`,
/// `http_request_headers`) and the response (status and headers of
/// `http_response`) and writes the response under `cache_key` through
/// `CACACHE_MANAGER`.
///
/// * `cache_key` - the key the entry is stored under.
/// * `http_response` - the response to store; consumed by the call.
/// * `method` - the request method; falls back to `GET` when it cannot be parsed.
/// * `http_request_headers` - the headers that were sent with the request.
///
/// Nothing is stored when the response url cannot be parsed as an `http::Uri`.
/// An error reported by the cache manager is deliberately ignored (best effort).
pub async fn put_hybrid_cache(
    cache_key: &str,
    http_response: HttpResponse,
    method: &str,
    http_request_headers: std::collections::HashMap<String, String>,
) {
    use crate::http_cache_reqwest::CacheManager;
    use http_cache_semantics::CachePolicy;

    if let Ok(u) = http_response.url.as_str().parse::<http::uri::Uri>() {
        // The request side of the policy carries the headers that were sent ...
        let req = HttpRequestLike {
            uri: u,
            method: reqwest::Method::from_bytes(method.as_bytes())
                .unwrap_or(reqwest::Method::GET),
            headers: convert_headers(&http_request_headers),
        };

        // ... and the response side the headers that were received, so that the
        // response directives (cache-control, expires, ...) drive the policy.
        let res = HttpResponseLike {
            status: StatusCode::from_u16(http_response.status)
                .unwrap_or(StatusCode::EXPECTATION_FAILED),
            headers: convert_headers(&http_response.headers),
        };

        let policy = CachePolicy::new(&req, &res);

        let _ = crate::website::CACACHE_MANAGER
            .put(
                cache_key.into(),
                http_cache_reqwest::HttpResponse {
                    url: http_response.url,
                    body: http_response.body,
                    headers: http_response.headers,
                    version: match http_response.version {
                        HttpVersion::H2 => http_cache::HttpVersion::H2,
                        HttpVersion::Http10 => http_cache::HttpVersion::Http10,
                        HttpVersion::H3 => http_cache::HttpVersion::H3,
                        HttpVersion::Http09 => http_cache::HttpVersion::Http09,
                        HttpVersion::Http11 => http_cache::HttpVersion::Http11,
                    },
                    status: http_response.status,
                },
                policy,
            )
            .await;
    }
}
1382
#[cfg(not(feature = "cache_chrome_hybrid"))]
/// Store the page to cache to be re-used across HTTP request.
///
/// This is the variant compiled without the `cache_chrome_hybrid` feature:
/// every argument is ignored and the body is empty.
pub async fn put_hybrid_cache(
    _cache_key: &str,
    _http_response: HttpResponse,
    _method: &str,
    _http_request_headers: std::collections::HashMap<String, String>,
) {
}
1392
/// Remaining time of `base_timeout` once `elapsed` has passed.
///
/// The result is clamped: when `elapsed` exceeds `base_timeout` a zero
/// duration is returned instead of underflowing.
#[cfg(feature = "chrome")]
fn sub_duration(
    base_timeout: std::time::Duration,
    elapsed: std::time::Duration,
) -> std::time::Duration {
    base_timeout.saturating_sub(elapsed)
}
1404
/// Navigate `page` to `url` and record the outcome of the request.
///
/// On success `chrome_http_req_res` is overwritten with the fresh result of
/// `perform_chrome_http_request`; on failure it is left untouched and the
/// error is returned.
#[cfg(feature = "chrome")]
async fn navigate(
    page: &chromiumoxide::Page,
    url: &str,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
) -> Result<(), chromiumoxide::error::CdpError> {
    perform_chrome_http_request(page, url, referrer)
        .await
        .map(|fresh| {
            *chrome_http_req_res = fresh;
        })
}
1416
/// Navigate `page` to `url` through the remote-cache aware request and record
/// the outcome.
///
/// On success `chrome_http_req_res` is overwritten with the fresh result of
/// `perform_chrome_http_request_cache`; on failure it is left untouched and the
/// error is returned.
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
async fn navigate_cache(
    page: &chromiumoxide::Page,
    url: &str,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Result<(), chromiumoxide::error::CdpError> {
    perform_chrome_http_request_cache(page, url, referrer, cache_options, cache_policy)
        .await
        .map(|fresh| {
            *chrome_http_req_res = fresh;
        })
}
1431
#[cfg(all(feature = "real_browser", feature = "chrome"))]
/// Generate random mouse movement. This does nothing without the 'real_browser' flag enabled.
///
/// Moves the mouse over the coordinates produced by
/// `GaussianMouse::generate_random_coordinates` for the viewport (800x600 when
/// no viewport is given). Failed moves are ignored, and roughly a quarter of
/// the moves are followed by a short sleep.
pub async fn perform_smart_mouse_movement(
    page: &chromiumoxide::Page,
    viewport: &Option<crate::configuration::Viewport>,
) {
    use chromiumoxide::layout::Point;
    use fastrand::Rng;
    use spider_fingerprint::spoof_mouse_movement::GaussianMouse;
    use tokio::time::{sleep, Duration};

    let (width, height) = viewport
        .as_ref()
        .map(|vp| (vp.width as f64, vp.height as f64))
        .unwrap_or((800.0, 600.0));

    let mut rng = Rng::new();

    for (x, y) in GaussianMouse::generate_random_coordinates(width, height) {
        let _ = page.move_mouse(Point::new(x, y)).await;

        // Most moves (~75%) continue straight away without a pause.
        if rng.f32() >= 0.25 {
            continue;
        }

        let pause_micros = if rng.f32() < 0.9 {
            // common case: 0.3–1.2 ms
            rng.u64(300..=1200)
        } else {
            // rare 2–8 ms (real hesitation)
            rng.u64(2000..=8000)
        };

        sleep(Duration::from_micros(pause_micros)).await;
    }
}
1464
#[cfg(all(not(feature = "real_browser"), feature = "chrome"))]
/// Generate random mouse movement. This does nothing without the 'real_browser' flag enabled.
///
/// This is the variant compiled without `real_browser`: both arguments are
/// ignored and the body is empty.
async fn perform_smart_mouse_movement(
    _page: &chromiumoxide::Page,
    _viewport: &Option<crate::configuration::Viewport>,
) {
}
1472
/// Cache the chrome response.
///
/// Builds an [`HttpResponse`] from the page content and the collected request
/// details and stores it through `put_hybrid_cache`. The cache key is derived
/// from `target_url`, the request method and, for the authorized cache options,
/// the auth token.
///
/// * `target_url` - the url of the page; nothing is cached when it cannot be parsed.
/// * `page_response` - the page whose content becomes the cached body (empty when absent).
/// * `chrome_http_req_res` - status, protocol, method and headers of the request.
/// * `cache_options` - selects the auth token that is mixed into the cache key.
///
/// The signature matches the other `cache_chrome_response` variants so that the
/// same call sites compile with and without `cache_chrome_hybrid_mem`.
#[cfg(all(
    feature = "chrome",
    feature = "cache_chrome_hybrid",
    feature = "cache_chrome_hybrid_mem"
))]
pub async fn cache_chrome_response(
    target_url: &str,
    page_response: &PageResponse,
    chrome_http_req_res: ChromeHTTPReqRes,
    cache_options: &Option<CacheOptions>,
) {
    if let Ok(u) = url::Url::parse(target_url) {
        let http_response = HttpResponse {
            url: u,
            body: match page_response.content.as_ref() {
                Some(b) => b.to_vec(),
                _ => Default::default(),
            },
            status: chrome_http_req_res.status_code.into(),
            // Unknown protocol strings are stored as HTTP/1.1.
            version: match chrome_http_req_res.protocol.as_str() {
                "http/0.9" => HttpVersion::Http09,
                "http/1" | "http/1.0" => HttpVersion::Http10,
                "http/1.1" => HttpVersion::Http11,
                "http/2.0" | "http/2" => HttpVersion::H2,
                "http/3.0" | "http/3" => HttpVersion::H3,
                _ => HttpVersion::Http11,
            },
            headers: chrome_http_req_res.response_headers,
        };

        // Only the authorized options contribute a token to the cache key.
        let auth_opt = match cache_options {
            Some(CacheOptions::Yes) | Some(CacheOptions::SkipBrowser) => None,
            Some(CacheOptions::Authorized(token))
            | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
            Some(CacheOptions::No) | None => None,
        };
        let cache_key = create_cache_key_raw(
            target_url,
            Some(&chrome_http_req_res.method),
            auth_opt.as_deref().map(|x| x.as_str()),
        );

        put_hybrid_cache(
            &cache_key,
            http_response,
            &chrome_http_req_res.method,
            chrome_http_req_res.request_headers,
        )
        .await;
    }
}
1523
/// Cache the chrome response.
///
/// Turns the page content and the collected request details into an
/// [`HttpResponse`] and hands it to `put_hybrid_cache`. The cache key is built
/// from `target_url`, the request method and, for the authorized cache
/// options, the auth token. Nothing happens when `target_url` cannot be parsed.
#[cfg(all(
    feature = "chrome",
    feature = "cache_chrome_hybrid",
    not(feature = "cache_chrome_hybrid_mem")
))]
pub async fn cache_chrome_response(
    target_url: &str,
    page_response: &PageResponse,
    chrome_http_req_res: ChromeHTTPReqRes,
    cache_options: &Option<CacheOptions>,
) {
    let parsed_url = match url::Url::parse(target_url) {
        Ok(parsed) => parsed,
        Err(_) => return,
    };

    let ChromeHTTPReqRes {
        status_code,
        method,
        response_headers,
        request_headers,
        protocol,
        ..
    } = chrome_http_req_res;

    // Unknown protocol strings are stored as HTTP/1.1.
    let version = match protocol.as_str() {
        "http/0.9" => HttpVersion::Http09,
        "http/1" | "http/1.0" => HttpVersion::Http10,
        "http/2.0" | "http/2" => HttpVersion::H2,
        "http/3.0" | "http/3" => HttpVersion::H3,
        _ => HttpVersion::Http11,
    };

    let body = page_response
        .content
        .as_ref()
        .map(|bytes| bytes.to_vec())
        .unwrap_or_default();

    let http_response = HttpResponse {
        url: parsed_url,
        body,
        status: status_code.into(),
        version,
        headers: response_headers,
    };

    // Only the authorized options contribute a token to the cache key.
    let token = match cache_options {
        Some(CacheOptions::Authorized(token))
        | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
        Some(CacheOptions::Yes)
        | Some(CacheOptions::SkipBrowser)
        | Some(CacheOptions::No)
        | None => None,
    };

    let cache_key = create_cache_key_raw(target_url, Some(&method), token.map(|t| t.as_str()));

    put_hybrid_cache(&cache_key, http_response, &method, request_headers).await;
}
1575
/// Cache the chrome response.
///
/// This is the variant compiled with `chrome` but without `cache_chrome_hybrid`:
/// every argument is ignored and the body is empty.
#[cfg(all(feature = "chrome", not(feature = "cache_chrome_hybrid")))]
pub async fn cache_chrome_response(
    _target_url: &str,
    _page_response: &PageResponse,
    _chrome_http_req_res: ChromeHTTPReqRes,
    _cache_options: &Option<CacheOptions>,
) {
}
1585
/// Five minutes expressed in milliseconds.
pub(crate) const FIVE_MINUTES: u32 = 300_000;

/// Max page timeout for events (five minutes).
#[cfg(feature = "chrome")]
const MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64);
/// Half of the max page timeout (two and a half minutes).
#[cfg(feature = "chrome")]
const HALF_MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64 / 2);
1597
#[cfg(all(feature = "chrome", feature = "headers"))]
/// Store the page headers. This does nothing without the 'headers' flag enabled.
///
/// When `page_response` carries headers they replace the response headers of
/// `chrome_http_req_res`, converted with `header_map_to_hash_map`; otherwise
/// `chrome_http_req_res` is left as it is.
fn store_headers(page_response: &PageResponse, chrome_http_req_res: &mut ChromeHTTPReqRes) {
    if let Some(header_map) = page_response.headers.as_ref() {
        let as_hash_map = crate::utils::header_utils::header_map_to_hash_map(header_map);

        chrome_http_req_res.response_headers = as_hash_map;
    }
}
1606
#[cfg(all(feature = "chrome", not(feature = "headers")))]
/// Store the page headers. This does nothing without the 'headers' flag enabled.
///
/// This is the variant compiled without `headers`: both arguments are ignored.
fn store_headers(_page_response: &PageResponse, _chrome_http_req_res: &mut ChromeHTTPReqRes) {}
1610
#[inline]
/// Convert an `f64` to a `u64`, dropping the fractional part.
///
/// Non-finite values (NaN and both infinities) and values at or below zero
/// map to `0`; values at or above `u64::MAX` map to `u64::MAX`.
#[cfg(feature = "chrome")]
fn f64_to_u64_floor(x: f64) -> u64 {
    match x {
        v if !v.is_finite() || v <= 0.0 => 0,
        v if v >= u64::MAX as f64 => u64::MAX,
        v => v as u64,
    }
}
1623
#[cfg(all(feature = "chrome", feature = "cache_request"))]
/// Cache a chrome response from CDP body.
///
/// Copies `body` and the request details of `chrome_http_req_res` into an
/// [`HttpResponse`] and hands it to `put_hybrid_cache`. The cache key is built
/// from `target_url`, the request method and, for the authorized cache
/// options, the auth token. Nothing happens when `target_url` cannot be parsed.
async fn cache_chrome_response_from_cdp_body(
    target_url: &str,
    body: &[u8],
    chrome_http_req_res: &ChromeHTTPReqRes,
    cache_options: &Option<CacheOptions>,
) {
    use crate::utils::create_cache_key_raw;

    let parsed_url = match url::Url::parse(target_url) {
        Ok(parsed) => parsed,
        Err(_) => return,
    };

    // Unknown protocol strings are stored as HTTP/1.1.
    let version = match chrome_http_req_res.protocol.as_str() {
        "http/0.9" => HttpVersion::Http09,
        "http/1" | "http/1.0" => HttpVersion::Http10,
        "http/2.0" | "http/2" => HttpVersion::H2,
        "http/3.0" | "http/3" => HttpVersion::H3,
        _ => HttpVersion::Http11,
    };

    let http_response = HttpResponse {
        url: parsed_url,
        body: body.to_vec(),
        status: chrome_http_req_res.status_code.into(),
        version,
        headers: chrome_http_req_res.response_headers.clone(),
    };

    // Only the authorized options contribute a token to the cache key.
    let token = match cache_options {
        Some(CacheOptions::Authorized(token))
        | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
        Some(CacheOptions::Yes)
        | Some(CacheOptions::SkipBrowser)
        | Some(CacheOptions::No)
        | None => None,
    };

    let method = &chrome_http_req_res.method;
    let cache_key = create_cache_key_raw(target_url, Some(method), token.map(|t| t.as_str()));

    put_hybrid_cache(
        &cache_key,
        http_response,
        method,
        chrome_http_req_res.request_headers.clone(),
    )
    .await;
}
1671
#[derive(Debug, Clone, Default)]
#[cfg(feature = "chrome")]
/// Map of the response.
///
/// Holds the per-request details listed below; all fields default to their
/// empty/zero value through the derived `Default`.
struct ResponseMap {
    /// The url of the request
    url: String,
    /// The network request was skipped.
    skipped: bool,
    /// The bytes transferred
    bytes_transferred: f64,
}
1683
#[derive(Debug, Clone, Default)]
#[cfg(feature = "chrome")]
/// Response details gathered for a page: the optional map of [`ResponseMap`]
/// entries, the optional headers and status code and, with `cache_request`,
/// whether the main document came from the cache.
struct ResponseBase {
    /// The map of the response, keyed by a string.
    /// NOTE(review): the meaning of the key is not visible here - confirm at the call sites.
    response_map: Option<hashbrown::HashMap<String, ResponseMap>>,
    /// The headers of request.
    headers: Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    /// The status code.
    status_code: Option<i64>,
    #[cfg(feature = "cache_request")]
    /// Is the main document cached?
    main_doc_from_cache: bool,
}
1697
#[cfg(feature = "chrome")]
#[inline]
/// Pick the string to show in log lines: the explicit `url_target` when one
/// is given, otherwise `source`.
fn log_target<'a>(source: &'a str, url_target: Option<&'a str>) -> &'a str {
    match url_target {
        Some(target) => target,
        None => source,
    }
}
1704
#[cfg(feature = "chrome")]
#[inline]
/// Returns `true` when the CDP error is the `Timeout` variant.
fn is_timeout(e: &chromiumoxide::error::CdpError) -> bool {
    use chromiumoxide::error::CdpError;

    matches!(*e, CdpError::Timeout)
}
1711
#[cfg(feature = "chrome")]
/// Navigate `page` to `target_url` while answering the document request with
/// the already-fetched `html` instead of letting it hit the network.
///
/// The function listens for `Fetch.requestPaused` events and fulfills the
/// first paused `Document` request whose URL starts with `target_url` with a
/// `200` response carrying `html` (base64 encoded) and headers derived from
/// `resp_headers`.
///
/// When an intercept configuration is supplied with `enabled == false`, the
/// Fetch domain is enabled here for `Document` requests and disabled again
/// before returning. Otherwise no Fetch domain is enabled by this function and
/// `set_request_interception(true)` is called on the page before returning.
///
/// `block_bytes` is set to `true` when the navigation or the fulfill command
/// fails with a timeout.
///
/// # Errors
///
/// Returns the `CdpError` from registering the listener, enabling the Fetch
/// domain, navigating, or fulfilling the request.
async fn goto_with_html_once(
    page: &chromiumoxide::Page,
    target_url: &str,
    html: &str,
    block_bytes: &mut bool,
    resp_headers: &Option<reqwest::header::HeaderMap<reqwest::header::HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) -> Result<(), chromiumoxide::error::CdpError> {
    use base64::Engine;
    use chromiumoxide::cdp::browser_protocol::fetch::{
        DisableParams, EnableParams, EventRequestPaused, FulfillRequestParams, RequestPattern,
        RequestStage,
    };
    use chromiumoxide::cdp::browser_protocol::network::ResourceType;
    use tokio_stream::StreamExt;

    /// Best-effort restore of the interception state once this function is done.
    async fn restore_interception(page: &chromiumoxide::Page, interception_required: bool) {
        if interception_required {
            let _ = page.execute(DisableParams {}).await;
        } else {
            let _ = page.set_request_interception(true).await;
        }
    }

    let mut paused = page.event_listener::<EventRequestPaused>().await?;

    let fulfill_headers =
        chrome_fulfill_headers_from_reqwest(resp_headers.as_ref(), "text/html; charset=utf-8");

    let interception_required = chrome_intercept.map(|c| !c.enabled).unwrap_or(false);

    if interception_required {
        page.execute(EnableParams {
            patterns: Some(vec![RequestPattern {
                url_pattern: Some("*".into()),
                resource_type: Some(ResourceType::Document),
                request_stage: Some(RequestStage::Request),
            }]),
            handle_auth_requests: Some(false),
        })
        .await?;
    }

    // Create the navigation future once and poll it by reference. Building it
    // inside `select!` would drop and re-issue the navigation every time the
    // paused-event branch `continue`s on an unrelated request.
    let goto = page.goto(target_url);
    tokio::pin!(goto);

    let mut did_goto = false;

    loop {
        tokio::select! {
            biased;
            // The precondition keeps the completed future from being polled again.
            res = &mut goto, if !did_goto => {
                did_goto = true;
                if let Err(e) = res {
                    if is_timeout(&e) {
                        *block_bytes = true;
                    }
                    restore_interception(page, interception_required).await;
                    return Err(e);
                }
            }
            maybe_ev = paused.next() => {
                let Some(ev) = maybe_ev else {
                    break;
                };

                // Only the document request for the target URL is fulfilled.
                if ev.resource_type != ResourceType::Document
                    || !ev.request.url.starts_with(target_url)
                {
                    continue;
                }

                let body_b64 = base64::engine::general_purpose::STANDARD.encode(html.as_bytes());

                let res = page.execute(FulfillRequestParams {
                    request_id: ev.request_id.clone(),
                    response_code: 200,
                    response_phrase: None,
                    response_headers: Some(fulfill_headers.clone()),
                    body: Some(chromiumoxide::Binary(body_b64)),
                    binary_response_headers: None,
                }).await;

                restore_interception(page, interception_required).await;

                return match res {
                    Ok(_) => Ok(()),
                    Err(e) => {
                        if is_timeout(&e) {
                            *block_bytes = true;
                        }
                        Err(e)
                    }
                };
            }
        }
    }

    // The paused-event stream ended without a matching document request.
    restore_interception(page, interception_required).await;

    Ok(())
}
1819
#[cfg(feature = "chrome")]
/// Serve `source` as the document for `url_target`, when a target is given.
///
/// Does nothing when `url_target` is `None`. Any error from the underlying
/// navigation is discarded; a timeout is still reflected through
/// `block_bytes`, which the callee sets.
async fn set_document_content_if_requested(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    block_bytes: &mut bool,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) {
    let Some(target_url) = url_target else {
        return;
    };

    // Best effort: the result is intentionally ignored.
    let _ = goto_with_html_once(
        page,
        target_url,
        source,
        block_bytes,
        resp_headers,
        chrome_intercept,
    )
    .await;
}
1842
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
/// Cached variant of serving `source` as the document for `url_target`.
///
/// Sets the page cache key, then concurrently spawns the cache listener,
/// seeds the cache, and serves the document. Afterwards the local cache for
/// the site key is cleared. When `url_target` is `None`, the cache calls run
/// against an empty target URL and no document is served.
async fn set_document_content_if_requested_cached(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    block_bytes: &mut bool,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) {
    let token = cache_auth_token(cache_options);
    let policy = cache_policy.as_ref().map(|p| p.from_basic());
    let strategy = None;
    let remote = Some("true");
    let seed_url = url_target.unwrap_or_default();
    let site_key = chromiumoxide::cache::manager::site_key_for_target_url(&seed_url, token);

    let _ = page.set_cache_key((Some(site_key.clone()), policy)).await;

    // Serving the document is a no-op when `url_target` is `None`.
    let serve_document = set_document_content_if_requested(
        page,
        source,
        url_target,
        block_bytes,
        resp_headers,
        chrome_intercept,
    );

    let _ = tokio::join!(
        page.spawn_cache_listener(
            &site_key,
            token.map(|f| f.into()),
            strategy,
            remote.map(|f| f.into())
        ),
        page.seed_cache(&seed_url, token, remote),
        serve_document
    );

    // NOTE(review): the result is not awaited — confirm `clear_local_cache` is synchronous.
    let _ = page.clear_local_cache(&site_key);
}
1893
#[cfg(feature = "chrome")]
/// Navigate `page` to `source`, logging and propagating any failure.
///
/// On error the failure is logged at info level against `url_target` (or
/// `source` when no target is given), and `block_bytes` is set when the error
/// is a timeout.
async fn navigate_if_requested(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
) -> Result<(), chromiumoxide::error::CdpError> {
    match navigate(page, source, chrome_http_req_res, referrer).await {
        Ok(_) => Ok(()),
        Err(err) => {
            log::info!(
                "Navigation Error({:?}) - {:?}",
                err,
                log_target(source, url_target)
            );
            *block_bytes |= is_timeout(&err);
            Err(err)
        }
    }
}
1916
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
/// Navigate `page` to `source` through the cache-aware navigation path,
/// logging and propagating any failure.
///
/// On error the failure is logged at info level against `url_target` (or
/// `source` when no target is given), and `block_bytes` is set when the error
/// is a timeout.
async fn navigate_if_requested_cache(
    page: &chromiumoxide::Page,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Result<(), chromiumoxide::error::CdpError> {
    let outcome = navigate_cache(
        page,
        source,
        chrome_http_req_res,
        referrer,
        cache_options,
        cache_policy,
    )
    .await;

    match outcome {
        Ok(_) => Ok(()),
        Err(err) => {
            log::info!(
                "Navigation Error({:?}) - {:?}",
                err,
                log_target(source, url_target)
            );
            *block_bytes |= is_timeout(&err);
            Err(err)
        }
    }
}
1951
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
/// Returns `true` only for the `Yes` and `Authorized` cache options; every
/// other option, and `None`, yields `false`.
fn cache_enabled(cache_options: &Option<CacheOptions>) -> bool {
    matches!(
        cache_options.as_ref(),
        Some(CacheOptions::Yes) | Some(CacheOptions::Authorized(_))
    )
}
1960
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
/// Convert the optional crate cache policy into the chromiumoxide one,
/// falling back to `Normal` when no policy is configured.
fn chrome_cache_policy(
    cache_policy: &Option<BasicCachePolicy>,
) -> chromiumoxide::cache::BasicCachePolicy {
    match cache_policy {
        Some(policy) => policy.from_basic(),
        None => chromiumoxide::cache::BasicCachePolicy::Normal,
    }
}
1971
#[cfg(all(feature = "chrome", not(feature = "chrome_remote_cache")))]
/// Core logic: either serve pre-fetched content into the page or navigate.
///
/// Behavior:
/// - `page_set == true`: nothing is done and `Ok(())` is returned.
/// - `content == true`: `source` is treated as HTML. A Cloudflare turnstile
///   check is run on it (recording the result in `chrome_http_req_res`), then
///   the HTML is served as the document for `url_target`. Failures in this
///   step are discarded and `Ok(())` is always returned; `block_bytes` is
///   still set on timeout.
/// - otherwise: navigates to `source`, returning `Err` on failure and setting
///   `block_bytes` on timeout.
///
/// The cache parameters are unused in this build configuration.
pub async fn run_navigate_or_content_set_core(
    page: &chromiumoxide::Page,
    page_set: bool,
    content: bool,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
    _cache_options: &Option<CacheOptions>,
    _cache_policy: &Option<BasicCachePolicy>,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) -> Result<(), chromiumoxide::error::CdpError> {
    if page_set {
        return Ok(());
    }

    if !content {
        return navigate_if_requested(
            page,
            source,
            url_target,
            chrome_http_req_res,
            referrer,
            block_bytes,
        )
        .await;
    }

    // Flag Cloudflare anti-bot pages before the content is served.
    if crate::features::solvers::detect_cf_turnstyle(source.as_bytes()) {
        chrome_http_req_res.anti_bot_tech = AntiBotTech::Cloudflare;
    }

    set_document_content_if_requested(
        page,
        source,
        url_target,
        block_bytes,
        resp_headers,
        chrome_intercept,
    )
    .await;

    Ok(())
}
2024
#[cfg(all(feature = "chrome", feature = "chrome_remote_cache"))]
/// Core logic: either serve pre-fetched content into the page or navigate,
/// using the cache-aware paths when caching is enabled.
///
/// Behavior:
/// - `page_set == true`: nothing is done and `Ok(())` is returned.
/// - `content == true`: `source` is treated as HTML. A Cloudflare turnstile
///   check is run on it (recording the result in `chrome_http_req_res`), then
///   the HTML is served as the document for `url_target`. Failures in this
///   step are discarded and `Ok(())` is always returned; `block_bytes` is
///   still set on timeout.
/// - otherwise: navigates to `source`, returning `Err` on failure and setting
///   `block_bytes` on timeout.
///
/// Whether the cached or plain variant of each step runs is decided by
/// `cache_enabled(cache_options)`.
pub async fn run_navigate_or_content_set_core(
    page: &chromiumoxide::Page,
    page_set: bool,
    content: bool,
    source: &str,
    url_target: Option<&str>,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    referrer: Option<String>,
    block_bytes: &mut bool,
    cache_options: &Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resp_headers: &Option<HeaderMap<HeaderValue>>,
    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
) -> Result<(), chromiumoxide::error::CdpError> {
    if page_set {
        return Ok(());
    }

    let use_cache = cache_enabled(cache_options);

    if !content {
        return if use_cache {
            navigate_if_requested_cache(
                page,
                source,
                url_target,
                chrome_http_req_res,
                referrer,
                block_bytes,
                cache_options,
                cache_policy,
            )
            .await
        } else {
            navigate_if_requested(
                page,
                source,
                url_target,
                chrome_http_req_res,
                referrer,
                block_bytes,
            )
            .await
        };
    }

    // Flag Cloudflare anti-bot pages before the content is served.
    if crate::features::solvers::detect_cf_turnstyle(source.as_bytes()) {
        chrome_http_req_res.anti_bot_tech = AntiBotTech::Cloudflare;
    }

    if use_cache {
        set_document_content_if_requested_cached(
            page,
            source,
            url_target,
            block_bytes,
            cache_options,
            cache_policy,
            resp_headers,
            chrome_intercept,
        )
        .await;
    } else {
        set_document_content_if_requested(
            page,
            source,
            url_target,
            block_bytes,
            resp_headers,
            chrome_intercept,
        )
        .await;
    }

    Ok(())
}
2108
#[cfg(feature = "chrome")]
/// Resolve the final URL the page ended up on after navigation.
///
/// Waits up to `base_timeout` for the navigation response and the redirect
/// lookup. Returns `None` when the wait times out, the navigation response
/// cannot be obtained, no redirect is found, or the resolved URL is one of the
/// browser placeholders `about:blank` / `chrome-error://chromewebdata/`.
pub async fn get_final_redirect(
    page: &chromiumoxide::Page,
    source: &str,
    base_timeout: Duration,
) -> Option<String> {
    let lookup = async {
        let Ok(response) = page.wait_for_navigation_response().await else {
            return None;
        };
        get_last_redirect(&source, &response, &page).await
    };

    tokio::time::timeout(base_timeout, lookup)
        .await
        .ok()
        .flatten()
        .filter(|url| {
            !matches!(
                url.as_str(),
                "about:blank" | "chrome-error://chromewebdata/"
            )
        })
}
2137
#[cfg(feature = "chrome")]
/// Build the header list for a synthetic fulfilled response from an optional
/// reqwest header map.
///
/// - Hop-by-hop and length/framing headers are dropped, since they describe
///   the original connection rather than the synthetic body.
/// - Header values that are not valid visible ASCII are converted lossily
///   from their raw bytes.
/// - A `Content-Type` of `default_content_type` is appended when none is
///   present.
/// - `Cache-Control: no-store` is appended when no `Cache-Control` is present.
pub fn chrome_fulfill_headers_from_reqwest(
    headers: Option<&reqwest::header::HeaderMap<reqwest::header::HeaderValue>>,
    default_content_type: &'static str,
) -> Vec<chromiumoxide::cdp::browser_protocol::fetch::HeaderEntry> {
    use chromiumoxide::cdp::browser_protocol::fetch::HeaderEntry;

    // Hop-by-hop / unsafe in synthetic fulfill responses. `trailer` is the
    // actual header name; `trailers` is kept for backward compatibility.
    const SKIPPED_HEADERS: [&str; 9] = [
        "content-length",
        "transfer-encoding",
        "connection",
        "keep-alive",
        "proxy-connection",
        "te",
        "trailer",
        "trailers",
        "upgrade",
    ];

    // Room for every incoming header plus the two possible defaults.
    let mut out: Vec<HeaderEntry> = Vec::with_capacity(headers.map_or(0, |hm| hm.len()) + 2);
    let mut has_content_type = false;
    let mut has_cache_control = false;

    // Convert reqwest headers -> CDP HeaderEntry (filter hop-by-hop)
    if let Some(hm) = headers {
        for (name, value) in hm.iter() {
            let k = name.as_str();

            if SKIPPED_HEADERS
                .iter()
                .any(|skipped| k.eq_ignore_ascii_case(skipped))
            {
                continue;
            }

            has_content_type |= k.eq_ignore_ascii_case("content-type");
            has_cache_control |= k.eq_ignore_ascii_case("cache-control");

            let v = match value.to_str() {
                Ok(s) => s.to_string(),
                Err(_) => String::from_utf8_lossy(value.as_bytes()).into_owned(),
            };

            out.push(HeaderEntry {
                name: k.to_string(),
                value: v,
            });
        }
    }

    // Ensure Content-Type exists
    if !has_content_type {
        out.push(HeaderEntry {
            name: "Content-Type".into(),
            value: default_content_type.into(),
        });
    }

    // Good default for synthetic responses (avoid caching weirdness)
    if !has_cache_control {
        out.push(HeaderEntry {
            name: "Cache-Control".into(),
            value: "no-store".into(),
        });
    }

    out
}
2196
/// Upper bound on a response's encoded data length for it to be marked as
/// skipped when byte masking for intercepted requests is active.
#[cfg(feature = "chrome")]
const SKIP_BYTES_AMOUNT: f64 = 17.0;
2200
2201#[cfg(feature = "chrome")]
2202/// Perform a network request to a resource extracting all content as text streaming via chrome.
2203pub async fn fetch_page_html_chrome_base(
2204    source: &str,
2205    page: &chromiumoxide::Page,
2206    content: bool,
2207    wait_for_navigation: bool,
2208    wait_for: &Option<crate::configuration::WaitFor>,
2209    screenshot: &Option<crate::configuration::ScreenShotConfig>,
2210    page_set: bool,
2211    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
2212    url_target: Option<&str>,
2213    execution_scripts: &Option<ExecutionScripts>,
2214    automation_scripts: &Option<AutomationScripts>,
2215    viewport: &Option<crate::configuration::Viewport>,
2216    request_timeout: &Option<Box<Duration>>,
2217    track_events: &Option<crate::configuration::ChromeEventTracker>,
2218    referrer: Option<String>,
2219    max_page_bytes: Option<f64>,
2220    cache_options: Option<CacheOptions>,
2221    cache_policy: &Option<BasicCachePolicy>,
2222    resp_headers: &Option<HeaderMap<HeaderValue>>,
2223    chrome_intercept: &Option<&crate::features::chrome_common::RequestInterceptConfiguration>,
2224    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
2225    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
2226) -> Result<PageResponse, chromiumoxide::error::CdpError> {
2227    use crate::page::{is_asset_url, DOWNLOADABLE_MEDIA_TYPES, UNKNOWN_STATUS_ERROR};
2228    use chromiumoxide::{
2229        cdp::browser_protocol::network::{
2230            EventDataReceived, EventLoadingFailed, EventRequestWillBeSent, EventResponseReceived,
2231            GetResponseBodyParams, RequestId, ResourceType,
2232        },
2233        error::CdpError,
2234    };
2235    use tokio::{
2236        sync::{oneshot, OnceCell},
2237        time::Instant,
2238    };
2239
2240    let duration = if cfg!(feature = "time") {
2241        Some(tokio::time::Instant::now())
2242    } else {
2243        None
2244    };
2245
2246    let mut chrome_http_req_res = ChromeHTTPReqRes::default();
2247    let mut metadata: Option<Vec<crate::page::AutomationResults>> = None;
2248    let mut block_bytes = false;
2249
2250    // the base networking timeout to prevent any hard hangs.
2251    let mut base_timeout = match request_timeout {
2252        Some(timeout) => **timeout.min(&Box::new(MAX_PAGE_TIMEOUT)),
2253        _ => MAX_PAGE_TIMEOUT,
2254    };
2255
2256    // track the initial base without modifying.
2257    let base_timeout_measurement = base_timeout;
2258    let target_url = url_target.unwrap_or(source);
2259    let asset = is_asset_url(target_url);
2260
2261    let (tx1, rx1) = if asset {
2262        let c = oneshot::channel::<Option<RequestId>>();
2263
2264        (Some(c.0), Some(c.1))
2265    } else {
2266        (None, None)
2267    };
2268
2269    let should_block = max_page_bytes.is_some();
2270
2271    let (track_requests, track_responses, track_automation) = match track_events {
2272        Some(tracker) => (tracker.requests, tracker.responses, tracker.automation),
2273        _ => (false, false, false),
2274    };
2275
2276    let (
2277        event_loading_listener,
2278        cancel_listener,
2279        received_listener,
2280        event_sent_listener,
2281        event_data_received,
2282    ) = tokio::join!(
2283        page.event_listener::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
2284        ),
2285        page.event_listener::<EventLoadingFailed>(),
2286        page.event_listener::<EventResponseReceived>(),
2287        async {
2288            if track_requests {
2289                page.event_listener::<EventRequestWillBeSent>().await
2290            } else {
2291                Err(CdpError::NotFound)
2292            }
2293        },
2294        async {
2295            if should_block {
2296                page.event_listener::<EventDataReceived>().await
2297            } else {
2298                Err(CdpError::NotFound)
2299            }
2300        }
2301    );
2302
2303    #[cfg(feature = "cache_request")]
2304    let cache_request = match cache_options {
2305        Some(CacheOptions::No) => false,
2306        _ => true,
2307    };
2308
2309    let (tx, rx) = oneshot::channel::<bool>();
2310
2311    #[cfg(feature = "cache_request")]
2312    let (main_tx, main_rx) = if cache_request {
2313        let c = oneshot::channel::<RequestId>();
2314        (Some(c.0), Some(c.1))
2315    } else {
2316        (None, None)
2317    };
2318
2319    let page_clone = if should_block {
2320        Some(page.clone())
2321    } else {
2322        None
2323    };
2324
2325    let html_source_size = source.len();
2326
2327    // Listen for network events. todo: capture the last values endtime to track period.
2328    // TODO: optional check if spawn required.
2329    let bytes_collected_handle = tokio::spawn(async move {
2330        let finished_media: Option<OnceCell<RequestId>> =
2331            if asset { Some(OnceCell::new()) } else { None };
2332
2333        let f1 = async {
2334            let mut total = 0.0;
2335
2336            let mut response_map: Option<HashMap<String, f64>> = if track_responses {
2337                Some(HashMap::new())
2338            } else {
2339                None
2340            };
2341
2342            if let Ok(mut listener) = event_loading_listener {
2343                while let Some(event) = listener.next().await {
2344                    total += event.encoded_data_length;
2345                    if let Some(response_map) = response_map.as_mut() {
2346                        response_map
2347                            .entry(event.request_id.inner().clone())
2348                            .and_modify(|e| *e += event.encoded_data_length)
2349                            .or_insert(event.encoded_data_length);
2350                    }
2351                    if asset {
2352                        if let Some(once) = &finished_media {
2353                            if let Some(request_id) = once.get() {
2354                                if request_id == &event.request_id {
2355                                    if let Some(tx1) = tx1 {
2356                                        let _ = tx1.send(Some(request_id.clone()));
2357                                        break;
2358                                    }
2359                                }
2360                            }
2361                        }
2362                    }
2363                }
2364            }
2365
2366            (total, response_map)
2367        };
2368
2369        let f2 = async {
2370            if let Ok(mut listener) = cancel_listener {
2371                let mut net_aborted = false;
2372
2373                while let Some(event) = listener.next().await {
2374                    if event.r#type == ResourceType::Document
2375                        && event.error_text == "net::ERR_ABORTED"
2376                        && event.canceled.unwrap_or_default()
2377                    {
2378                        net_aborted = true;
2379                        break;
2380                    }
2381                }
2382
2383                if net_aborted {
2384                    let _ = tx.send(true);
2385                }
2386            }
2387        };
2388
2389        let f3 = async {
2390            let mut response_map: Option<HashMap<String, ResponseMap>> = if track_responses {
2391                Some(HashMap::new())
2392            } else {
2393                None
2394            };
2395
2396            let mut status_code = None;
2397            let mut headers = None;
2398            #[cfg(feature = "cache_request")]
2399            let mut main_doc_request_id: Option<RequestId> = None;
2400            #[cfg(feature = "cache_request")]
2401            let mut main_doc_from_cache = false;
2402
2403            let persist_event = asset || track_responses;
2404
2405            if let Ok(mut listener) = received_listener {
2406                let mut initial_asset = false;
2407                let mut allow_download = false;
2408                let mut intial_request = false;
2409
2410                while let Some(event) = listener.next().await {
2411                    let document = event.r#type == ResourceType::Document;
2412
2413                    if !intial_request && document {
2414                        // todo: capture the redirect code.
2415                        let redirect = event.response.status >= 300 && event.response.status <= 399;
2416
2417                        if !redirect {
2418                            intial_request = true;
2419                            status_code = Some(event.response.status);
2420                            headers = Some(event.response.headers.clone());
2421                            #[cfg(feature = "cache_request")]
2422                            {
2423                                main_doc_request_id = Some(event.request_id.clone());
2424                                // DevTools cache flags
2425                                let from_disk = event.response.from_disk_cache.unwrap_or(false);
2426                                let from_prefetch =
2427                                    event.response.from_prefetch_cache.unwrap_or(false);
2428                                let from_sw = event.response.from_service_worker.unwrap_or(false);
2429                                main_doc_from_cache = from_disk || from_prefetch || from_sw;
2430                            }
2431
2432                            if !persist_event {
2433                                break;
2434                            }
2435
2436                            if content {
2437                                if let Some(response_map) = response_map.as_mut() {
2438                                    response_map.insert(
2439                                        event.request_id.inner().clone(),
2440                                        ResponseMap {
2441                                            url: event.response.url.clone(),
2442                                            // encoded length should add 78.0 via chrome
2443                                            bytes_transferred: (html_source_size as f64)
2444                                                + event.response.encoded_data_length,
2445                                            skipped: true,
2446                                        },
2447                                    );
2448                                    continue;
2449                                }
2450                            }
2451                        }
2452                    }
2453                    // check if media asset needs to be downloaded ( this will trigger after the inital document )
2454                    else if asset {
2455                        if !initial_asset && document {
2456                            allow_download =
2457                                DOWNLOADABLE_MEDIA_TYPES.contains(&event.response.mime_type);
2458                        }
2459                        if event.r#type == ResourceType::Media && allow_download {
2460                            if let Some(once) = &finished_media {
2461                                let _ = once.set(event.request_id.clone());
2462                            }
2463                        }
2464                        initial_asset = true;
2465                    }
2466
2467                    if let Some(response_map) = response_map.as_mut() {
2468                        response_map.insert(
2469                            event.request_id.inner().clone(),
2470                            ResponseMap {
2471                                url: event.response.url.clone(),
2472                                bytes_transferred: event.response.encoded_data_length,
2473                                skipped: *MASK_BYTES_INTERCEPTION
2474                                    && event.response.connection_id == 0.0
2475                                    && event.response.encoded_data_length <= SKIP_BYTES_AMOUNT,
2476                            },
2477                        );
2478                    }
2479                }
2480            }
2481
2482            #[cfg(feature = "cache_request")]
2483            if let Some(request_id) = &main_doc_request_id {
2484                if let Some(tx) = main_tx {
2485                    let _ = tx.send(request_id.clone());
2486                }
2487            }
2488
2489            ResponseBase {
2490                response_map,
2491                status_code,
2492                headers,
2493                #[cfg(feature = "cache_request")]
2494                main_doc_from_cache,
2495            }
2496        };
2497
2498        let f4 = async {
2499            let mut request_map: Option<HashMap<String, f64>> = if track_requests {
2500                Some(HashMap::new())
2501            } else {
2502                None
2503            };
2504
2505            if request_map.is_some() {
2506                if let Some(response_map) = request_map.as_mut() {
2507                    if let Ok(mut listener) = event_sent_listener {
2508                        while let Some(event) = listener.next().await {
2509                            response_map
2510                                .insert(event.request.url.clone(), *event.timestamp.inner());
2511                        }
2512                    }
2513                }
2514            }
2515
2516            request_map
2517        };
2518
2519        let f5 = async {
2520            if let Some(page_clone) = &page_clone {
2521                if let Ok(mut listener) = event_data_received {
2522                    let mut total_bytes: u64 = 0;
2523                    let total_max = f64_to_u64_floor(max_page_bytes.unwrap_or_default());
2524                    while let Some(event) = listener.next().await {
2525                        let encoded = event.encoded_data_length.max(0) as u64;
2526                        total_bytes = total_bytes.saturating_add(encoded);
2527                        if total_bytes > total_max {
2528                            let _ = page_clone.force_stop_all().await;
2529                            break;
2530                        }
2531                    }
2532                }
2533            }
2534        };
2535
2536        let (t1, _, res_map, req_map, __) = tokio::join!(f1, f2, f3, f4, f5);
2537
2538        (t1.0, t1.1, res_map, req_map)
2539    });
2540
2541    let page_navigation = async {
2542        run_navigate_or_content_set_core(
2543            page,
2544            page_set,
2545            content,
2546            source,
2547            url_target,
2548            &mut chrome_http_req_res,
2549            referrer,
2550            &mut block_bytes,
2551            &cache_options,
2552            &cache_policy,
2553            resp_headers,
2554            chrome_intercept,
2555        )
2556        .await
2557    };
2558
2559    let start_time = Instant::now();
2560
2561    let mut request_cancelled = false;
2562
2563    let page_navigate = async {
2564        if cfg!(feature = "real_browser") {
2565            let notify = tokio::sync::Notify::new();
2566
2567            let mouse_loop = async {
2568                let mut index = 0;
2569
2570                loop {
2571                    tokio::select! {
2572                        _ = notify.notified() => {
2573                            break;
2574                        }
2575                        _ = perform_smart_mouse_movement(&page, &viewport) => {
2576                            tokio::time::sleep(std::time::Duration::from_millis(WAIT_TIMEOUTS[index])).await;
2577                        }
2578                    }
2579
2580                    index = (index + 1) % WAIT_TIMEOUTS.len();
2581                }
2582            };
2583
2584            let navigation_loop = async {
2585                let result = page_navigation.await;
2586                notify.notify_waiters();
2587                result
2588            };
2589
2590            let (result, _) = tokio::join!(navigation_loop, mouse_loop);
2591
2592            result
2593        } else {
2594            page_navigation.await
2595        }
2596    };
2597
2598    tokio::select! {
2599        v = tokio::time::timeout(base_timeout + Duration::from_millis(50), page_navigate) => {
2600            if v.is_err() {
2601                request_cancelled = true;
2602            }
2603        }
2604        v = rx => {
2605            if let Ok(v) = v {
2606                request_cancelled = !v;
2607            }
2608        }
2609    };
2610
2611    base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2612
2613    // we do not need to wait for navigation if content is assigned. The method set_content already handles this.
2614    let final_url = if wait_for_navigation && !request_cancelled && !block_bytes {
2615        let last_redirect = get_final_redirect(page, &source, base_timeout).await;
2616        base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2617        last_redirect
2618    } else {
2619        None
2620    };
2621
2622    let chrome_http_req_res1 = if asset {
2623        Some(chrome_http_req_res.clone())
2624    } else {
2625        None
2626    };
2627
2628    let run_events = !base_timeout.is_zero()
2629        && !block_bytes
2630        && !request_cancelled
2631        && !(chrome_http_req_res.is_empty() && !content)
2632        && (!chrome_http_req_res.status_code.is_server_error()
2633            && !chrome_http_req_res.status_code.is_client_error()
2634            || chrome_http_req_res.status_code == *UNKNOWN_STATUS_ERROR
2635            || chrome_http_req_res.status_code == 404
2636            || chrome_http_req_res.status_code == 403
2637            || chrome_http_req_res.status_code == 524
2638            || chrome_http_req_res.status_code.is_redirection()
2639            || chrome_http_req_res.status_code.is_success());
2640
2641    block_bytes = chrome_http_req_res.status_code == StatusCode::REQUEST_TIMEOUT;
2642
2643    let waf_check = chrome_http_req_res.waf_check;
2644    let mut status_code = chrome_http_req_res.status_code;
2645    let mut anti_bot_tech = chrome_http_req_res.anti_bot_tech;
2646    let mut validate_cf = false;
2647
2648    let run_page_response = async move {
2649        let mut page_response = if run_events {
2650            if waf_check {
2651                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2652                if let Err(elasped) = tokio::time::timeout(
2653                    base_timeout,
2654                    perform_smart_mouse_movement(&page, &viewport),
2655                )
2656                .await
2657                {
2658                    log::warn!("mouse movement timeout exceeded {elasped}");
2659                }
2660            }
2661
2662            if wait_for.is_some() {
2663                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2664                if let Err(elasped) =
2665                    tokio::time::timeout(base_timeout, page_wait(&page, &wait_for)).await
2666                {
2667                    log::warn!("max wait for timeout {elasped}");
2668                }
2669            }
2670
2671            base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2672
2673            if execution_scripts.is_some() || automation_scripts.is_some() {
2674                let target_url = final_url
2675                    .as_deref()
2676                    .or(url_target)
2677                    .unwrap_or(source)
2678                    .to_string();
2679
2680                if let Err(elasped) = tokio::time::timeout(base_timeout, async {
2681                    let mut _metadata = Vec::new();
2682
2683                    if track_automation {
2684                        tokio::join!(
2685                            crate::features::chrome_common::eval_execution_scripts(
2686                                &page,
2687                                &target_url,
2688                                &execution_scripts
2689                            ),
2690                            crate::features::chrome_common::eval_automation_scripts_tracking(
2691                                &page,
2692                                &target_url,
2693                                &automation_scripts,
2694                                &mut _metadata
2695                            )
2696                        );
2697                        metadata = Some(_metadata);
2698                    } else {
2699                        tokio::join!(
2700                            crate::features::chrome_common::eval_execution_scripts(
2701                                &page,
2702                                &target_url,
2703                                &execution_scripts
2704                            ),
2705                            crate::features::chrome_common::eval_automation_scripts(
2706                                &page,
2707                                &target_url,
2708                                &automation_scripts
2709                            )
2710                        );
2711                    }
2712                })
2713                .await
2714                {
2715                    log::warn!("eval scripts timeout exceeded {elasped}");
2716                }
2717            }
2718
2719            let xml_target = match &final_url {
2720                Some(f) => f.ends_with(".xml"),
2721                _ => target_url.ends_with(".xml"),
2722            };
2723
2724            let page_fn = async {
2725                if !xml_target {
2726                    return page.outer_html_bytes().await;
2727                }
2728                match page.content_bytes_xml().await {
2729                    Ok(b) if !b.is_empty() => Ok(b),
2730                    _ => page.outer_html_bytes().await,
2731                }
2732            };
2733
2734            let results = tokio::time::timeout(base_timeout.max(HALF_MAX_PAGE_TIMEOUT), page_fn);
2735
2736            let mut res: Box<Vec<u8>> = match results.await {
2737                Ok(v) => v.map(Box::new).unwrap_or_default(),
2738                _ => Default::default(),
2739            };
2740
2741            let forbidden = waf_check && res.starts_with(b"<html><head>\n    <style global=") && res.ends_with(b";</script><iframe height=\"1\" width=\"1\" style=\"position: absolute; top: 0px; left: 0px; border: none; visibility: hidden;\"></iframe>\n\n</body></html>");
2742
2743            #[cfg(feature = "real_browser")]
2744            {
2745                // guard entry to real pages.
2746                if res.len() <= crate::page::TURNSTILE_WALL_PAGE_SIZE {
2747                    if anti_bot_tech == AntiBotTech::Cloudflare || waf_check {
2748                        if crate::features::solvers::detect_cf_turnstyle(&res) {
2749                            if let Err(_e) = tokio::time::timeout(base_timeout, async {
2750                                if let Ok(success) = crate::features::solvers::cf_handle(
2751                                    &mut res,
2752                                    &page,
2753                                    &target_url,
2754                                    &viewport,
2755                                )
2756                                .await
2757                                {
2758                                    if success {
2759                                        status_code = StatusCode::OK;
2760                                    }
2761                                }
2762                            })
2763                            .await
2764                            {
2765                                validate_cf = true;
2766                            }
2767                        }
2768                    } else if anti_bot_tech == AntiBotTech::Imperva {
2769                        if crate::features::solvers::looks_like_imperva_verify(res.len(), &*res) {
2770                            if let Err(_e) = tokio::time::timeout(base_timeout, async {
2771                                if let Ok(success) = crate::features::solvers::imperva_handle(
2772                                    &mut res,
2773                                    &page,
2774                                    &target_url,
2775                                    &viewport,
2776                                )
2777                                .await
2778                                {
2779                                    if success {
2780                                        status_code = StatusCode::OK;
2781                                    }
2782                                }
2783                            })
2784                            .await
2785                            {
2786                                validate_cf = true;
2787                            }
2788                        }
2789                    } else if crate::features::solvers::detect_recaptcha(&res) {
2790                        if let Err(_e) = tokio::time::timeout(base_timeout, async {
2791                            if let Ok(solved) = crate::features::solvers::recaptcha_handle(
2792                                &mut res, &page, &viewport,
2793                            )
2794                            .await
2795                            {
2796                                if solved {
2797                                    status_code = StatusCode::OK;
2798                                }
2799                            }
2800                        })
2801                        .await
2802                        {
2803                            validate_cf = true;
2804                        }
2805                    } else if crate::features::solvers::detect_geetest(&res) {
2806                        if let Err(_e) = tokio::time::timeout(base_timeout, async {
2807                            if let Ok(solved) =
2808                                crate::features::solvers::geetest_handle(&mut res, &page, &viewport)
2809                                    .await
2810                            {
2811                                if solved {
2812                                    status_code = StatusCode::OK;
2813                                }
2814                            }
2815                        })
2816                        .await
2817                        {
2818                            validate_cf = true;
2819                        }
2820                    } else if crate::features::solvers::detect_lemin(&res) {
2821                        if let Err(_e) = tokio::time::timeout(base_timeout, async {
2822                            if let Ok(solved) =
2823                                crate::features::solvers::lemin_handle(&mut res, &page, &viewport)
2824                                    .await
2825                            {
2826                                if solved {
2827                                    status_code = StatusCode::OK;
2828                                }
2829                            }
2830                        })
2831                        .await
2832                        {
2833                            validate_cf = true;
2834                        }
2835                    }
2836                }
2837            }
2838
2839            let ok = !res.is_empty();
2840
2841            #[cfg(feature = "real_browser")]
2842            if validate_cf && ok {
2843                if !crate::features::solvers::detect_cf_turnstyle(&res)
2844                    && status_code == StatusCode::FORBIDDEN
2845                {
2846                    status_code = StatusCode::OK;
2847                }
2848            }
2849
2850            let mut page_response = set_page_response(
2851                ok,
2852                res,
2853                if forbidden {
2854                    StatusCode::FORBIDDEN
2855                } else {
2856                    status_code
2857                },
2858                final_url,
2859            );
2860
2861            base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2862
2863            let scope_url = if jar.is_some() {
2864                let scope_url = page_response
2865                    .final_url
2866                    .as_deref()
2867                    .filter(|u| !u.is_empty())
2868                    .or(url_target)
2869                    .unwrap_or(source);
2870
2871                url::Url::parse(scope_url).ok()
2872            } else {
2873                None
2874            };
2875
2876            let _ = tokio::time::timeout(
2877                base_timeout,
2878                set_page_response_cookies(&mut page_response, &page, jar, scope_url.as_ref()),
2879            )
2880            .await;
2881
2882            if openai_config.is_some() && !base_timeout.is_zero() {
2883                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2884
2885                let openai_request = run_openai_request(
2886                    match &url_target {
2887                        Some(ut) => ut,
2888                        _ => source,
2889                    },
2890                    page,
2891                    wait_for,
2892                    openai_config,
2893                    &mut page_response,
2894                    ok,
2895                );
2896
2897                let _ = tokio::time::timeout(base_timeout, openai_request).await;
2898            }
2899
2900            if remote_multimodal.is_some() && !base_timeout.is_zero() {
2901                use crate::features::automation::run_remote_multimodal_if_enabled;
2902
2903                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
2904
2905                let multi_modal_request = run_remote_multimodal_if_enabled(
2906                    remote_multimodal,
2907                    page,
2908                    match &url_target {
2909                        Some(ut) => ut,
2910                        _ => source,
2911                    },
2912                );
2913
2914                let multimodal_success =
2915                    match tokio::time::timeout(base_timeout, multi_modal_request).await {
2916                        Ok(Ok(Some(result))) => {
2917                            let success = result.success;
2918
2919                            // Store usage on page_response
2920                            match page_response.remote_multimodal_usage.as_mut() {
2921                                Some(v) => v.push(result.usage.clone()),
2922                                None => page_response.remote_multimodal_usage = Some(vec![result.usage.clone()]),
2923                            }
2924
2925                            // Store extracted data if available
2926                            if result.extracted.is_some() || result.screenshot.is_some() {
2927                                let automation_result = result.to_automation_results();
2928                                match page_response.extra_remote_multimodal_data.as_mut() {
2929                                    Some(v) => v.push(automation_result),
2930                                    None => page_response.extra_remote_multimodal_data = Some(vec![automation_result]),
2931                                }
2932                            }
2933
2934                            success
2935                        }
2936                        Ok(Ok(None)) => false,
2937                        Ok(Err(_e)) => false,
2938                        Err(_elapsed) => false,
2939                    };
2940
2941                if multimodal_success {
2942                    let next_content = tokio::time::timeout(base_timeout, page.outer_html_bytes())
2943                        .await
2944                        .ok()
2945                        .and_then(Result::ok)
2946                        .filter(|b| !b.is_empty())
2947                        .map(|b| Box::new(b.into()));
2948
2949                    if next_content.is_some() {
2950                        page_response.content = next_content;
2951                    }
2952                }
2953            }
2954
2955            if cfg!(feature = "chrome_screenshot") || screenshot.is_some() {
2956                let _ = tokio::time::timeout(
2957                    base_timeout + tokio::time::Duration::from_secs(30),
2958                    perform_screenshot(source, page, screenshot, &mut page_response),
2959                )
2960                .await;
2961            }
2962
2963            if metadata.is_some() {
2964                let mut default_metadata = Metadata::default();
2965                default_metadata.automation = metadata;
2966                page_response.metadata = Some(Box::new(default_metadata));
2967            }
2968
2969            page_response
2970        } else {
2971            let res = if !block_bytes {
2972                let results = tokio::time::timeout(
2973                    base_timeout.max(HALF_MAX_PAGE_TIMEOUT),
2974                    page.outer_html_bytes(),
2975                );
2976
2977                match results.await {
2978                    Ok(v) => v.map(Box::new).unwrap_or_default(),
2979                    _ => Default::default(),
2980                }
2981            } else {
2982                Default::default()
2983            };
2984
2985            let mut page_response = set_page_response(!res.is_empty(), res, status_code, final_url);
2986
2987            if !block_bytes {
2988                let scope_url = if jar.is_some() {
2989                    let scope_url = page_response
2990                        .final_url
2991                        .as_deref()
2992                        .filter(|u| !u.is_empty())
2993                        .or(url_target)
2994                        .unwrap_or(source);
2995
2996                    url::Url::parse(scope_url).ok()
2997                } else {
2998                    None
2999                };
3000
3001                let _ = tokio::time::timeout(
3002                    base_timeout,
3003                    set_page_response_cookies(&mut page_response, &page, jar, scope_url.as_ref()),
3004                )
3005                .await;
3006            }
3007
3008            if base_timeout.is_zero() && page_response.content.is_none() {
3009                page_response.status_code = StatusCode::REQUEST_TIMEOUT;
3010            }
3011
3012            page_response
3013        };
3014
3015        if content {
3016            if let Some(final_url) = &page_response.final_url {
3017                if final_url.starts_with("about:blank") {
3018                    page_response.final_url = None;
3019                }
3020            }
3021        }
3022
3023        page_response
3024    };
3025
3026    let mut content: Option<Box<Vec<u8>>> = None;
3027
3028    let page_response = match rx1 {
3029        Some(rx1) => {
3030            tokio::select! {
3031                v = tokio::time::timeout(base_timeout, run_page_response) => {
3032                    v.map_err(|_| CdpError::Timeout)
3033                }
3034                c = rx1 => {
3035                    if let Ok(c) = c {
3036                        if let Some(c) = c {
3037                            let params = GetResponseBodyParams::new(c);
3038
3039                            if let Ok(command_response) = page.execute(params).await {
3040                              let body_response = command_response;
3041
3042                              let media_file = if body_response.base64_encoded {
3043                                  chromiumoxide::utils::base64::decode(
3044                                      &body_response.body,
3045                                  )
3046                                  .unwrap_or_default()
3047                              } else {
3048                                  body_response.body.as_bytes().to_vec()
3049                              };
3050
3051                              if !media_file.is_empty() {
3052                                  content = Some(media_file.into());
3053                              }
3054                          }
3055                        }
3056                    }
3057
3058            let mut page_response = PageResponse::default();
3059
3060            let scope_url = if jar.is_some() {
3061            let scope_url = page_response
3062                .final_url
3063                .as_deref()
3064                .filter(|u| !u.is_empty())
3065                .or(url_target)
3066                .unwrap_or(source);
3067
3068              url::Url::parse(scope_url).ok()
3069            } else {
3070                None
3071            };
3072
3073                let _ = tokio::time::timeout(
3074                    base_timeout,
3075                    set_page_response_cookies(&mut page_response, &page, jar, scope_url.as_ref()),
3076                )
3077                .await;
3078
3079                    if let Some(mut chrome_http_req_res1) = chrome_http_req_res1 {
3080                        set_page_response_headers(&mut chrome_http_req_res1, &mut page_response);
3081
3082                        page_response.status_code = chrome_http_req_res1.status_code;
3083                        page_response.waf_check = chrome_http_req_res1.waf_check;
3084
3085                        #[cfg(feature = "cache_request")]
3086                        if !page_set && cache_request {
3087                            let _ = tokio::time::timeout(
3088                                base_timeout,
3089                                cache_chrome_response(&source, &page_response, chrome_http_req_res1, &cache_options),
3090                            )
3091                            .await;
3092                        }
3093
3094                    }
3095
3096                    Ok(page_response)
3097                }
3098            }
3099        }
3100        _ => Ok(run_page_response.await),
3101    };
3102
3103    let mut page_response = page_response.unwrap_or_default();
3104
3105    set_page_response_headers(&mut chrome_http_req_res, &mut page_response);
3106    page_response.status_code = chrome_http_req_res.status_code;
3107    page_response.waf_check = chrome_http_req_res.waf_check;
3108    page_response.content = match content {
3109        Some(c) if !c.is_empty() => Some(c.into()),
3110        _ => {
3111            let needs_fill = page_response
3112                .content
3113                .as_ref()
3114                .map_or(true, |b| b.is_empty());
3115
3116            if needs_fill {
3117                tokio::time::timeout(base_timeout, page.outer_html_bytes())
3118                    .await
3119                    .ok()
3120                    .and_then(Result::ok)
3121                    .filter(|b| !b.is_empty())
3122                    .map(Into::into)
3123            } else {
3124                page_response.content
3125            }
3126        }
3127    };
3128    if page_response.status_code == *UNKNOWN_STATUS_ERROR && page_response.content.is_some() {
3129        page_response.status_code = StatusCode::OK;
3130    }
3131
3132    // run initial handling hidden anchors
3133    // if let Ok(new_links) = page.evaluate(crate::features::chrome::ANCHOR_EVENTS).await {
3134    //     if let Ok(results) = new_links.into_value::<hashbrown::HashSet<CaseInsensitiveString>>() {
3135    //         links.extend(page.extract_links_raw(&base, &results).await);
3136    //     }
3137    // }
3138
3139    #[cfg(feature = "cache_request")]
3140    let mut modified_cache = false;
3141
3142    #[cfg(feature = "cache_request")]
3143    if cache_request {
3144        if let Some(mut main_rx) = main_rx {
3145            if let Ok(doc_req_id) = &main_rx.try_recv() {
3146                let cache_url = match &page_response.final_url {
3147                    Some(final_url) if !final_url.is_empty() => final_url.as_str(),
3148                    _ => target_url,
3149                };
3150
3151                match page
3152                    .execute(GetResponseBodyParams::new(doc_req_id.clone()))
3153                    .await
3154                {
3155                    Ok(body_result) => {
3156                        let raw_body: Vec<u8> = if body_result.base64_encoded {
3157                            chromiumoxide::utils::base64::decode(&body_result.body)
3158                                .unwrap_or_default()
3159                        } else {
3160                            body_result.body.clone().into_bytes()
3161                        };
3162
3163                        if !raw_body.is_empty() {
3164                            let _ = tokio::time::timeout(
3165                                base_timeout,
3166                                cache_chrome_response_from_cdp_body(
3167                                    cache_url,
3168                                    &raw_body,
3169                                    &chrome_http_req_res,
3170                                    &cache_options,
3171                                ),
3172                            )
3173                            .await;
3174                            modified_cache = true;
3175                        }
3176                    }
3177                    Err(e) => {
3178                        log::error!("{:?}", e)
3179                    }
3180                }
3181            }
3182        }
3183    }
3184
3185    if cfg!(not(feature = "chrome_store_page")) {
3186        let _ = page
3187            .send_command(chromiumoxide::cdp::browser_protocol::page::CloseParams::default())
3188            .await;
3189
3190        if let Ok((mut transferred, bytes_map, mut rs, request_map)) = bytes_collected_handle.await
3191        {
3192            let response_map = rs.response_map;
3193
3194            if response_map.is_some() {
3195                let mut _response_map = HashMap::new();
3196
3197                if let Some(response_map) = response_map {
3198                    if let Some(bytes_map) = bytes_map {
3199                        let detect_anti_bots =
3200                            response_map.len() <= 4 && anti_bot_tech == AntiBotTech::None;
3201
3202                        for item in response_map {
3203                            if detect_anti_bots && item.1.url.starts_with("/_Incapsula_Resource?") {
3204                                anti_bot_tech = AntiBotTech::Imperva;
3205                            }
3206
3207                            let b = if item.1.skipped {
3208                                0.0
3209                            } else {
3210                                match bytes_map.get(&item.0) {
3211                                    Some(f) => *f,
3212                                    _ => 0.0,
3213                                }
3214                            };
3215
3216                            if item.1.skipped {
3217                                transferred -= item.1.bytes_transferred;
3218                            }
3219
3220                            _response_map.insert(item.1.url, b);
3221                        }
3222                    }
3223                }
3224
3225                page_response.response_map = Some(_response_map);
3226
3227                if let Some(status) = rs
3228                    .status_code
3229                    .and_then(|s| s.try_into().ok())
3230                    .and_then(|u: u16| StatusCode::from_u16(u).ok())
3231                {
3232                    page_response.status_code = status;
3233                }
3234
3235                set_page_response_headers_raw(&mut rs.headers, &mut page_response);
3236                store_headers(&page_response, &mut chrome_http_req_res);
3237
3238                if anti_bot_tech == AntiBotTech::None {
3239                    let final_url = match &page_response.final_url {
3240                        Some(final_url)
3241                            if !final_url.is_empty()
3242                                && !final_url.starts_with("about:blank")
3243                                && !final_url.starts_with("chrome-error://chromewebdata") =>
3244                        {
3245                            final_url
3246                        }
3247                        _ => target_url,
3248                    };
3249                    if let Some(h) = &page_response.headers {
3250                        if let Some(content) = &page_response.content {
3251                            anti_bot_tech = detect_anti_bot_tech_response(
3252                                &final_url,
3253                                &HeaderSource::HeaderMap(h),
3254                                &content,
3255                                None,
3256                            );
3257                        }
3258                    }
3259                }
3260
3261                #[cfg(feature = "real_browser")]
3262                if let Some(content) = &page_response.content {
3263                    // validate if the turnstile page is still open.
3264                    if anti_bot_tech == AntiBotTech::Cloudflare
3265                        && page_response.status_code == StatusCode::FORBIDDEN
3266                    {
3267                        let cf_turnstile = crate::features::solvers::detect_cf_turnstyle(&content);
3268
3269                        if !cf_turnstile {
3270                            page_response.status_code = StatusCode::OK;
3271                        }
3272                    }
3273                }
3274                #[cfg(feature = "cache_request")]
3275                if cache_request && !page_set && !rs.main_doc_from_cache && !modified_cache {
3276                    let _ = tokio::time::timeout(
3277                        base_timeout,
3278                        cache_chrome_response(
3279                            &source,
3280                            &page_response,
3281                            chrome_http_req_res,
3282                            &cache_options,
3283                        ),
3284                    )
3285                    .await;
3286                }
3287            }
3288            if request_map.is_some() {
3289                page_response.request_map = request_map;
3290            }
3291
3292            page_response.bytes_transferred = Some(transferred);
3293        }
3294    }
3295
3296    page_response.anti_bot_tech = anti_bot_tech;
3297
3298    set_page_response_duration(&mut page_response, duration);
3299
3300    Ok(page_response)
3301}
3302
/// Record the request start instant on the page response.
///
/// Compiled only with the `time` feature. The value is stored unchanged in
/// `page_response.duration`.
#[cfg(feature = "time")]
pub(crate) fn set_page_response_duration(
    page_response: &mut PageResponse,
    duration: Option<tokio::time::Instant>,
) {
    page_response.duration = duration;
}
3311
3312#[cfg(not(feature = "time"))]
3313/// Set the duration of time took for the page.
3314pub(crate) fn set_page_response_duration(
3315    _page_response: &mut PageResponse,
3316    _duration: Option<tokio::time::Instant>,
3317) {
3318}
3319
/// Build a [`PageResponse`] from the outcome of a chrome fetch.
///
/// The body in `res` is attached only when `ok` is true. Apart from the content,
/// status code and final url, every field keeps its default value.
#[cfg(feature = "chrome")]
fn set_page_response(
    ok: bool,
    res: Box<Vec<u8>>,
    status_code: StatusCode,
    final_url: Option<String>,
) -> PageResponse {
    PageResponse {
        content: ok.then(|| res.into()),
        status_code,
        final_url,
        ..Default::default()
    }
}
3335
/// Attach the chrome response headers to the page response.
///
/// The headers recorded in `chrome_http_req_res` are run through `convert_headers`.
/// `page_response.headers` is left untouched when the converted map is empty.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn set_page_response_headers(
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    page_response: &mut PageResponse,
) {
    let converted = convert_headers(&chrome_http_req_res.response_headers);

    if converted.is_empty() {
        return;
    }

    page_response.headers = Some(converted);
}
3348
/// No-op variant of the header setter, compiled when `chrome` is enabled without `headers`.
///
/// Both arguments are ignored.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn set_page_response_headers(
    _chrome_http_req_res: &mut ChromeHTTPReqRes,
    _page_response: &mut PageResponse,
) {
}
3356
/// Copy the raw CDP response headers onto the page response.
///
/// `chrome_http_req_res` optionally holds the JSON object of headers reported by
/// Chrome. Every entry that forms a valid header name/value pair is inserted into
/// a fresh `HeaderMap`; entries that fail validation are skipped.
/// `page_response.headers` is only overwritten when at least one header was
/// converted.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn set_page_response_headers_raw(
    chrome_http_req_res: &mut Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    page_response: &mut PageResponse,
) {
    if let Some(chrome_headers) = chrome_http_req_res {
        let mut header_map = reqwest::header::HeaderMap::new();

        if let Some(obj) = chrome_headers.inner().as_object() {
            for (index, (key, value)) in obj.iter().enumerate() {
                use std::str::FromStr;

                // JSON strings must be read through `as_str`: serializing the value
                // with `to_string` would wrap it in literal double quotes, and those
                // quotes would end up inside the stored header value. Non-string
                // values keep their JSON rendering.
                let raw_value = match value.as_str() {
                    Some(text) => std::borrow::Cow::Borrowed(text),
                    None => std::borrow::Cow::Owned(value.to_string()),
                };

                if let (Ok(header_name), Ok(header_value)) = (
                    reqwest::header::HeaderName::from_str(key),
                    reqwest::header::HeaderValue::from_str(&raw_value),
                ) {
                    header_map.insert(header_name, header_value);
                }
                // Stop after a bounded number of entries.
                if index > 1000 {
                    break;
                }
            }
        }
        if !header_map.is_empty() {
            page_response.headers = Some(header_map);
        }
    }
}
3385
/// No-op variant of the raw header setter, compiled when `chrome` is enabled without `headers`.
///
/// Both arguments are ignored.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn set_page_response_headers_raw(
    _chrome_http_req_res: &mut Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    _page_response: &mut PageResponse,
) {
}
3393
/// Collect the cookies of the chrome `page` into the page response.
///
/// Each cookie is recorded as a name/value pair. When both `jar` and `scope_url`
/// are provided, the cookie is also added to the jar for that url. The collected
/// pairs are passed through `convert_headers`, and `page_response.cookies` is set
/// only when the result is non-empty. Nothing happens if the cookies cannot be
/// read from the page.
#[cfg(all(feature = "chrome", feature = "cookies"))]
async fn set_page_response_cookies(
    page_response: &mut PageResponse,
    page: &chromiumoxide::Page,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    scope_url: Option<&url::Url>,
) {
    let browser_cookies = match page.get_cookies().await {
        Ok(list) => list,
        Err(_) => return,
    };

    // Only mirror cookies into the jar when both the jar and its scope are known.
    let jar_target = jar.zip(scope_url);
    let mut cookies_map = std::collections::HashMap::<String, String>::new();

    for cookie in browser_cookies {
        if let Some((jar, scope_url)) = jar_target {
            let sc = format!("{}={}; Path=/", cookie.name, cookie.value);
            jar.add_cookie_str(&sc, scope_url);
        }
        cookies_map.insert(cookie.name, cookie.value);
    }

    let converted = convert_headers(&cookies_map);
    if converted.is_empty() {
        return;
    }
    page_response.cookies = Some(converted);
}
3421
/// Take a screenshot of `page`.
///
/// With a `screenshot` config, the capture runs through a CDP command: the image
/// is optionally written to disk (`save`) and/or stored on `page_response`
/// (`bytes`). Without a config, a PNG is saved to disk using the
/// `SCREENSHOT_DIRECTORY`, `SCREENSHOT_FULL_PAGE` and `SCREENSHOT_OMIT_BACKGROUND`
/// environment variables. Failures are logged and never propagated.
#[cfg(feature = "chrome")]
pub async fn perform_screenshot(
    target_url: &str,
    page: &chromiumoxide::Page,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_response: &mut PageResponse,
) {
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;
    use chromiumoxide::cdp::browser_protocol::emulation::SetDefaultBackgroundColorOverrideParams;

    if let Some(ss) = screenshot {
        // File extension derived from the configured format, PNG when unset.
        let output_format = string_concat!(
            ".",
            ss.params
                .cdp_params
                .format
                .as_ref()
                .unwrap_or_else(|| &crate::configuration::CaptureScreenshotFormat::Png)
                .to_string()
        );
        let ss_params = chromiumoxide::page::ScreenshotParams::from(ss.params.clone());

        let full_page = ss_params.full_page.unwrap_or_default();
        let omit_background = ss_params.omit_background.unwrap_or_default();

        let mut cdp_params = ss_params.cdp_params;
        cdp_params.optimize_for_speed = Some(true);
        if full_page {
            cdp_params.capture_beyond_viewport = Some(true);
        }

        if omit_background {
            // Fully transparent override for the duration of the capture.
            let transparent = chromiumoxide::cdp::browser_protocol::dom::Rgba {
                r: 0,
                g: 0,
                b: 0,
                a: Some(0.),
            };
            let _ = page
                .send_command(SetDefaultBackgroundColorOverrideParams {
                    color: Some(transparent),
                })
                .await;
        }

        match page.execute(cdp_params).await {
            Ok(captured) => {
                // The capture arrives base64 encoded; undecodable payloads are ignored.
                if let Ok(image) = STANDARD.decode(&captured.data) {
                    if ss.save {
                        let output_path = create_output_path(
                            &ss.output_dir.clone().unwrap_or_else(|| "./storage/".into()),
                            &target_url,
                            &output_format,
                        )
                        .await;
                        let _ = tokio::fs::write(output_path, &image).await;
                    }
                    if ss.bytes {
                        page_response.screenshot_bytes = Some(image);
                    }
                }
            }
            Err(e) => {
                log::error!("failed to take screenshot: {:?} - {:?}", e, target_url)
            }
        };

        if omit_background {
            // Restore the default background once the capture is done.
            let _ = page
                .send_command(SetDefaultBackgroundColorOverrideParams { color: None })
                .await;
        }
    } else {
        let output_path = create_output_path(
            &std::env::var("SCREENSHOT_DIRECTORY")
                .unwrap_or_else(|_| "./storage/".to_string())
                .into(),
            &target_url,
            &".png",
        )
        .await;

        // A flag is on unless the variable is set to something other than "true".
        let env_flag = |name: &str| match std::env::var(name) {
            Ok(t) => t == "true",
            _ => true,
        };

        let params = chromiumoxide::page::ScreenshotParams::builder()
            .format(chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png)
            .full_page(env_flag("SCREENSHOT_FULL_PAGE"))
            .omit_background(env_flag("SCREENSHOT_OMIT_BACKGROUND"))
            .build();

        match page.save_screenshot(params, &output_path).await {
            Ok(_) => log::debug!("saved screenshot: {:?}", output_path),
            Err(e) => log::error!("failed to save screenshot: {:?} - {:?}", e, output_path),
        };
    }
}
3530
/// Resolve the final url of a chrome CDP navigation.
///
/// When the request carries a redirect chain whose last hop has a url, that url
/// is returned if it differs from `target_url`, and `None` if it is the same.
/// Without such a hop, the url currently reported by `page` is returned (`None`
/// if it cannot be read).
#[cfg(feature = "chrome")]
pub async fn get_last_redirect(
    target_url: &str,
    u: &Option<std::sync::Arc<chromiumoxide::handler::http::HttpRequest>>,
    page: &chromiumoxide::Page,
) -> Option<String> {
    let last_hop_url = u
        .as_ref()
        .and_then(|http_request| http_request.redirect_chain.last())
        .and_then(|redirect| redirect.url.as_ref());

    match last_hop_url {
        Some(url) if target_url != url => Some(url.clone()),
        Some(_) => None,
        None => page.url().await.ok()?,
    }
}
3551
/// Map the cookies of a response into a header map keyed by cookie name.
///
/// Cookies whose name or value cannot be represented as a header name/value are
/// skipped. Returns `None` when no cookie could be mapped. Only compiled with the
/// `cookies` feature.
#[cfg(feature = "cookies")]
pub fn get_cookies(res: &Response) -> Option<crate::client::header::HeaderMap> {
    use crate::client::header::{HeaderMap, HeaderName, HeaderValue};

    let mut mapped = HeaderMap::new();

    for cookie in res.cookies() {
        let entry = HeaderValue::from_str(cookie.value())
            .ok()
            .and_then(|value| {
                HeaderName::from_str(cookie.name())
                    .ok()
                    .map(|name| (name, value))
            });

        if let Some((name, value)) = entry {
            mapped.insert(name, value);
        }
    }

    if mapped.is_empty() {
        None
    } else {
        Some(mapped)
    }
}
3573
3574#[cfg(not(feature = "cookies"))]
3575/// The response cookies mapped. This does nothing without the cookies feature flag enabled.
3576pub fn get_cookies(_res: &Response) -> Option<crate::client::header::HeaderMap> {
3577    None
3578}
3579
3580/// Block streaming
3581pub(crate) fn block_streaming(res: &Response, only_html: bool) -> bool {
3582    let mut block_streaming = false;
3583
3584    if only_html {
3585        if let Some(content_type) = res.headers().get(crate::client::header::CONTENT_TYPE) {
3586            if let Ok(content_type_str) = content_type.to_str() {
3587                if IGNORE_CONTENT_TYPES.contains(content_type_str) {
3588                    block_streaming = true;
3589                }
3590            }
3591        }
3592    }
3593
3594    block_streaming
3595}
3596
3597/// Handle the response bytes
3598pub async fn handle_response_bytes(
3599    res: Response,
3600    target_url: &str,
3601    only_html: bool,
3602) -> PageResponse {
3603    let u = res.url().as_str();
3604
3605    let rd = if target_url != u {
3606        Some(u.into())
3607    } else {
3608        None
3609    };
3610
3611    let status_code: StatusCode = res.status();
3612    let headers = res.headers().clone();
3613    #[cfg(feature = "remote_addr")]
3614    let remote_addr = res.remote_addr();
3615    let cookies = get_cookies(&res);
3616
3617    let mut content: Option<Box<Vec<u8>>> = None;
3618    let mut anti_bot_tech = AntiBotTech::default();
3619
3620    let limit = *MAX_SIZE_BYTES;
3621
3622    if limit > 0 {
3623        let base = res
3624            .content_length()
3625            .and_then(|n| usize::try_from(n).ok())
3626            .unwrap_or(0);
3627
3628        let hdr = res
3629            .headers()
3630            .get(CONTENT_LENGTH)
3631            .and_then(|v| v.to_str().ok())
3632            .and_then(|s| s.parse::<usize>().ok())
3633            .unwrap_or(0);
3634
3635        let current_size = base + hdr.saturating_sub(base);
3636
3637        if current_size > limit {
3638            anti_bot_tech = detect_anti_bot_tech_response(
3639                target_url,
3640                &HeaderSource::HeaderMap(&headers),
3641                &Default::default(),
3642                None,
3643            );
3644            return PageResponse {
3645                headers: Some(headers),
3646                #[cfg(feature = "remote_addr")]
3647                remote_addr,
3648                #[cfg(feature = "cookies")]
3649                cookies,
3650                content: None,
3651                final_url: rd,
3652                status_code,
3653                anti_bot_tech,
3654                ..Default::default()
3655            };
3656        }
3657    }
3658
3659    if !block_streaming(&res, only_html) {
3660        let mut data = match res.content_length() {
3661            Some(cap) if cap >= MAX_PRE_ALLOCATED_HTML_PAGE_SIZE => {
3662                Vec::with_capacity(cap.max(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE) as usize)
3663            }
3664            _ => Vec::with_capacity(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE),
3665        };
3666        let mut stream = res.bytes_stream();
3667        let mut first_bytes = true;
3668
3669        while let Some(item) = stream.next().await {
3670            match item {
3671                Ok(text) => {
3672                    if only_html && first_bytes {
3673                        first_bytes = false;
3674                        if is_binary_file(&text) {
3675                            break;
3676                        }
3677                    }
3678
3679                    if limit > 0 && data.len() + text.len() > limit {
3680                        break;
3681                    }
3682
3683                    data.extend_from_slice(&text)
3684                }
3685                Err(e) => {
3686                    log::error!("{e} in {}", target_url);
3687                    break;
3688                }
3689            }
3690        }
3691
3692        anti_bot_tech = detect_anti_bot_tech_response(
3693            &target_url,
3694            &HeaderSource::HeaderMap(&headers),
3695            &data,
3696            None,
3697        );
3698        content.replace(Box::new(data.into()));
3699    }
3700
3701    PageResponse {
3702        headers: Some(headers),
3703        #[cfg(feature = "remote_addr")]
3704        remote_addr,
3705        #[cfg(feature = "cookies")]
3706        cookies,
3707        content,
3708        final_url: rd,
3709        status_code,
3710        anti_bot_tech,
3711        ..Default::default()
3712    }
3713}
3714
3715/// Handle the response bytes writing links while crawling
3716pub async fn handle_response_bytes_writer<'h, O>(
3717    res: Response,
3718    target_url: &str,
3719    only_html: bool,
3720    rewriter: &mut HtmlRewriter<'h, O>,
3721    collected_bytes: &mut Vec<u8>,
3722) -> (PageResponse, bool)
3723where
3724    O: OutputSink + Send + 'static,
3725{
3726    let u = res.url().as_str();
3727
3728    let final_url: Option<String> = if target_url != u {
3729        Some(u.into())
3730    } else {
3731        None
3732    };
3733
3734    let status_code: StatusCode = res.status();
3735    let headers = res.headers().clone();
3736    #[cfg(feature = "remote_addr")]
3737    let remote_addr = res.remote_addr();
3738    let cookies = get_cookies(&res);
3739    let mut anti_bot_tech = AntiBotTech::default();
3740
3741    let mut rewrite_error = false;
3742
3743    if !block_streaming(&res, only_html) {
3744        let mut stream = res.bytes_stream();
3745        let mut first_bytes = true;
3746        let mut data_len = 0;
3747
3748        while let Some(item) = stream.next().await {
3749            match item {
3750                Ok(res_bytes) => {
3751                    if only_html && first_bytes {
3752                        first_bytes = false;
3753                        if is_binary_file(&res_bytes) {
3754                            break;
3755                        }
3756                    }
3757                    let limit = *MAX_SIZE_BYTES;
3758                    let bytes_len = res_bytes.len();
3759
3760                    if limit > 0 && data_len + bytes_len > limit {
3761                        break;
3762                    }
3763
3764                    data_len += bytes_len;
3765
3766                    if !rewrite_error {
3767                        if rewriter.write(&res_bytes).is_err() {
3768                            rewrite_error = true;
3769                        }
3770                    }
3771
3772                    collected_bytes.extend_from_slice(&res_bytes);
3773                }
3774                Err(e) => {
3775                    log::error!("{e} in {}", target_url);
3776                    break;
3777                }
3778            }
3779        }
3780
3781        anti_bot_tech = detect_anti_bot_tech_response(
3782            &target_url,
3783            &HeaderSource::HeaderMap(&headers),
3784            &collected_bytes,
3785            None,
3786        );
3787    }
3788
3789    (
3790        PageResponse {
3791            #[cfg(feature = "headers")]
3792            headers: Some(headers),
3793            #[cfg(feature = "remote_addr")]
3794            remote_addr,
3795            #[cfg(feature = "cookies")]
3796            cookies,
3797            final_url,
3798            status_code,
3799            anti_bot_tech,
3800            ..Default::default()
3801        },
3802        rewrite_error,
3803    )
3804}
3805
3806/// Continue to parse a valid web page.
3807pub(crate) fn valid_parsing_status(res: &Response) -> bool {
3808    res.status().is_success() || res.status() == 404
3809}
3810
3811/// Build the error page response.
3812fn build_error_page_response(target_url: &str, err: RequestError) -> PageResponse {
3813    log::info!("error fetching {}", target_url);
3814
3815    let mut page_response = PageResponse::default();
3816    if let Some(status_code) = err.status() {
3817        page_response.status_code = status_code;
3818    } else {
3819        page_response.status_code = crate::page::get_error_http_status_code(&err);
3820    }
3821    page_response.error_for_status = Some(Err(err));
3822    page_response
3823}
3824
3825/// Error chain handshake failure.
3826fn error_chain_contains_handshake_failure(err: &RequestError) -> bool {
3827    if err.to_string().to_lowercase().contains("handshake failure") {
3828        return true;
3829    }
3830    let mut cur: Option<&(dyn std::error::Error + 'static)> = err.source();
3831
3832    while let Some(e) = cur {
3833        let s = e.to_string().to_lowercase();
3834        if s.contains("handshake failure") {
3835            return true;
3836        }
3837        cur = e.source();
3838    }
3839
3840    false
3841}
3842
3843/// Perform a network request to a resource extracting all content streaming.
3844async fn fetch_page_html_raw_base(
3845    target_url: &str,
3846    client: &Client,
3847    only_html: bool,
3848) -> PageResponse {
3849    async fn attempt_once(
3850        url: &str,
3851        client: &Client,
3852        only_html: bool,
3853    ) -> Result<PageResponse, RequestError> {
3854        let res = client.get(url).send().await?;
3855        Ok(handle_response_bytes(res, url, only_html).await)
3856    }
3857
3858    let duration = if cfg!(feature = "time") {
3859        Some(tokio::time::Instant::now())
3860    } else {
3861        None
3862    };
3863
3864    let mut page_response = match attempt_once(target_url, client, only_html).await {
3865        Ok(pr) => pr,
3866        Err(err) => {
3867            let should_retry = error_chain_contains_handshake_failure(&err);
3868            if should_retry {
3869                if let Some(flipped) = flip_http_https(target_url) {
3870                    log::info!(
3871                        "TLS handshake failure for {}; retrying with flipped scheme: {}",
3872                        target_url,
3873                        flipped
3874                    );
3875                    match attempt_once(&flipped, client, only_html).await {
3876                        Ok(pr2) => pr2,
3877                        Err(err2) => build_error_page_response(&flipped, err2),
3878                    }
3879                } else {
3880                    build_error_page_response(target_url, err)
3881                }
3882            } else {
3883                build_error_page_response(target_url, err)
3884            }
3885        }
3886    };
3887
3888    set_page_response_duration(&mut page_response, duration);
3889    page_response
3890}
3891
3892/// Perform a network request to a resource extracting all content streaming.
3893pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageResponse {
3894    fetch_page_html_raw_base(target_url, client, false).await
3895}
3896
3897/// Perform a network request to a resource extracting all content streaming.
3898pub async fn fetch_page_html_raw_only_html(target_url: &str, client: &Client) -> PageResponse {
3899    fetch_page_html_raw_base(target_url, client, false).await
3900}
3901
/// Perform a network request to a resource and return the raw body bytes.
///
/// Returns `None` when the request fails, when the status is not accepted by
/// `valid_parsing_status`, or when the body cannot be read. Failures are logged.
#[cfg(feature = "decentralized")]
pub async fn fetch_page(target_url: &str, client: &Client) -> Option<Vec<u8>> {
    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => match res.bytes().await {
            Ok(text) => Some(text.into()),
            Err(_) => {
                // The request succeeded but the body could not be read.
                log("- error parsing html bytes {}", &target_url);
                None
            }
        },
        Ok(_) => None,
        Err(_) => {
            // The request itself failed.
            log("- error fetching {}", &target_url);
            None
        }
    }
}
3920
/// Outcome of fetching a page together with its response headers.
#[cfg(all(feature = "decentralized", feature = "headers"))]
pub enum FetchPageResult {
    /// The page was fetched: the response headers and the body bytes, if any.
    Success(reqwest::header::HeaderMap, Option<Vec<u8>>),
    /// No content was extracted: only the response headers.
    NoSuccess(reqwest::header::HeaderMap),
    /// A network error occurred.
    FetchError,
}
3931
/// Perform a network request to a resource and return the body together with
/// the response headers.
///
/// * `Success` - the status is accepted by `valid_parsing_status`; the body is
///   `None` if it could not be read.
/// * `NoSuccess` - any other status; only the headers are returned.
/// * `FetchError` - the request itself failed.
#[cfg(all(feature = "decentralized", feature = "headers"))]
pub async fn fetch_page_and_headers(target_url: &str, client: &Client) -> FetchPageResult {
    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            let headers = res.headers().clone();
            // `FetchPageResult::Success` carries a `Vec<u8>`, so the received bytes
            // are converted here (same conversion as `fetch_page`).
            let b: Option<Vec<u8>> = match res.bytes().await {
                Ok(text) => Some(text.into()),
                Err(_) => {
                    // The request succeeded but the body could not be read.
                    log("- error parsing html bytes {}", &target_url);
                    None
                }
            };
            FetchPageResult::Success(headers, b)
        }
        Ok(res) => FetchPageResult::NoSuccess(res.headers().clone()),
        Err(_) => {
            // The request itself failed.
            log("- error fetching {}", &target_url);
            FetchPageResult::FetchError
        }
    }
}
3954
3955#[cfg(all(not(feature = "fs"), not(feature = "chrome")))]
3956/// Perform a network request to a resource extracting all content as text streaming.
3957pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
3958    fetch_page_html_raw(target_url, client).await
3959}
3960
/// Perform a network request to a resource, streaming the body and spilling it
/// to a temporary file once the in-memory buffer has grown.
///
/// * Status accepted by `valid_parsing_status`: the body is streamed. Chunks are
///   kept in memory while the buffer capacity is below 8192 bytes; after that the
///   buffered data and all following chunks are written to a file under `TMP_DIR`
///   named after the percent-encoded url. The file is read back as the content and
///   removed afterwards.
/// * Any other status: a response without content is returned.
/// * Request error: the error and a derived status code are stored on the response.
#[cfg(all(feature = "fs", not(feature = "chrome")))]
pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

    let duration = if cfg!(feature = "time") {
        Some(tokio::time::Instant::now())
    } else {
        None
    };

    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            let u = res.url().as_str();

            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            let status_code = res.status();
            let cookies = get_cookies(&res);
            #[cfg(feature = "headers")]
            let headers = res.headers().clone();
            #[cfg(feature = "remote_addr")]
            let remote_addr = res.remote_addr();
            let mut stream = res.bytes_stream();
            let mut data: Vec<u8> = Vec::new();
            let mut file: Option<tokio::fs::File> = None;
            let mut file_path = String::new();

            while let Some(item) = stream.next().await {
                match item {
                    Ok(text) => {
                        let wrote_disk = file.is_some();

                        if !wrote_disk && data.capacity() < 8192 {
                            // Small so far: build the resource entirely in memory.
                            data.extend_from_slice(&text);
                        } else if !wrote_disk {
                            // First spill: create the temp file and move the buffer into it.
                            file_path = string_concat!(
                                TMP_DIR,
                                &utf8_percent_encode(target_url, NON_ALPHANUMERIC).to_string()
                            );
                            match tokio::fs::File::create(&file_path).await {
                                Ok(f) => {
                                    let file = file.insert(f);

                                    data.extend_from_slice(&text);

                                    if file.write_all(&data).await.is_ok() {
                                        data.clear();
                                    }
                                }
                                // Could not create the file: keep buffering in memory.
                                _ => data.extend_from_slice(&text),
                            };
                        } else if let Some(f) = file.as_mut() {
                            if f.write_all(&text).await.is_err() {
                                data.extend_from_slice(&text)
                            }
                        }
                    }
                    Err(e) => {
                        log::error!("{e} in {}", target_url);
                        break;
                    }
                }
            }

            let content: Box<Vec<u8>> = if let Some(mut spilled) = file.take() {
                // Writes to a tokio `File` may still be in flight after `write_all`
                // returns; flush and close the handle so the read below sees every
                // byte that was written.
                let _ = spilled.flush().await;
                drop(spilled);

                let mut buffer = vec![];

                if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
                    let _ = b.read_to_end(&mut buffer).await;
                }

                // The temp file is no longer needed, whether or not the read succeeded.
                let _ = tokio::fs::remove_file(&file_path).await;

                Box::new(buffer)
            } else {
                Box::new(data)
            };

            PageResponse {
                #[cfg(feature = "time")]
                duration,
                #[cfg(feature = "headers")]
                headers: Some(headers),
                #[cfg(feature = "remote_addr")]
                remote_addr,
                #[cfg(feature = "cookies")]
                cookies,
                content: Some(content),
                status_code,
                final_url: rd,
                ..Default::default()
            }
        }
        Ok(res) => {
            let u = res.url().as_str();

            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            PageResponse {
                #[cfg(feature = "time")]
                duration,
                #[cfg(feature = "headers")]
                headers: Some(res.headers().clone()),
                #[cfg(feature = "remote_addr")]
                remote_addr: res.remote_addr(),
                #[cfg(feature = "cookies")]
                cookies: get_cookies(&res),
                status_code: res.status(),
                final_url: rd,
                ..Default::default()
            }
        }
        Err(err) => {
            log::info!("error fetching {}", target_url);
            let mut page_response = PageResponse::default();

            if let Some(status_code) = err.status() {
                page_response.status_code = status_code;
            } else {
                page_response.status_code = crate::page::get_error_http_status_code(&err);
            }

            page_response.error_for_status = Some(Err(err));
            page_response
        }
    }
}
4101
4102/// Perform a network request to a resource extracting all content as text streaming.
4103#[cfg(all(feature = "fs", feature = "chrome"))]
4104/// Perform a network request to a resource extracting all content as text streaming via chrome.
4105pub async fn fetch_page_html(
4106    target_url: &str,
4107    client: &Client,
4108    page: &chromiumoxide::Page,
4109    wait_for: &Option<crate::configuration::WaitFor>,
4110    screenshot: &Option<crate::configuration::ScreenShotConfig>,
4111    page_set: bool,
4112    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
4113    execution_scripts: &Option<ExecutionScripts>,
4114    automation_scripts: &Option<AutomationScripts>,
4115    viewport: &Option<crate::configuration::Viewport>,
4116    request_timeout: &Option<Box<std::time::Duration>>,
4117    track_events: &Option<crate::configuration::ChromeEventTracker>,
4118    referrer: Option<String>,
4119    max_page_bytes: Option<f64>,
4120    cache_options: Option<CacheOptions>,
4121    cache_policy: &Option<BasicCachePolicy>,
4122    #[cfg(feature = "cookies")] jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
4123) -> PageResponse {
4124    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
4125    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
4126
4127    #[cfg(feature = "time")]
4128    let duration = Some(tokio::time::Instant::now());
4129
4130    let skip_browser = cache_skip_browser(&cache_options);
4131    let cached_html = get_cached_url(&target_url, cache_options.as_ref(), cache_policy).await;
4132    let cached = cached_html.is_some();
4133
4134    // Skip browser entirely if cached and skip_browser mode is enabled
4135    if skip_browser {
4136        if let Some(html) = cached_html {
4137            return PageResponse {
4138                content: Some(Box::new(html.into_bytes())),
4139                status_code: StatusCode::OK,
4140                final_url: Some(target_url.to_string()),
4141                #[cfg(feature = "time")]
4142                duration,
4143                ..Default::default()
4144            };
4145        }
4146    }
4147
4148    let mut page_response = match &page {
4149        page => {
4150            match fetch_page_html_chrome_base(
4151                if let Some(cached) = &cached_html {
4152                    cached
4153                } else {
4154                    target_url
4155                },
4156                &page,
4157                cached,
4158                true,
4159                wait_for,
4160                screenshot,
4161                page_set,
4162                openai_config,
4163                if cached { Some(target_url) } else { None },
4164                execution_scripts,
4165                automation_scripts,
4166                &viewport,
4167                &request_timeout,
4168                &track_events,
4169                referrer,
4170                max_page_bytes,
4171                cache_options,
4172                cache_policy,
4173                &None,
4174                jar,
4175            )
4176            .await
4177            {
4178                Ok(page) => page,
4179                _ => {
4180                    log::info!(
4181                        "- error fetching chrome page defaulting to raw http request {}",
4182                        &target_url,
4183                    );
4184
4185                    match client.get(target_url).send().await {
4186                        Ok(res) if valid_parsing_status(&res) => {
4187                            let headers = res.headers().clone();
4188                            let cookies = get_cookies(&res);
4189                            let status_code = res.status();
4190                            let mut stream = res.bytes_stream();
4191                            let mut data = Vec::new();
4192
4193                            let mut file: Option<tokio::fs::File> = None;
4194                            let mut file_path = String::new();
4195
4196                            while let Some(item) = stream.next().await {
4197                                match item {
4198                                    Ok(text) => {
4199                                        let wrote_disk = file.is_some();
4200
4201                                        // perform operations entire in memory to build resource
4202                                        if !wrote_disk && data.capacity() < 8192 {
4203                                            data.extend_from_slice(&text);
4204                                        } else {
4205                                            if !wrote_disk {
4206                                                file_path = string_concat!(
4207                                                    TMP_DIR,
4208                                                    &utf8_percent_encode(
4209                                                        target_url,
4210                                                        NON_ALPHANUMERIC
4211                                                    )
4212                                                    .to_string()
4213                                                );
4214                                                match tokio::fs::File::create(&file_path).await {
4215                                                    Ok(f) => {
4216                                                        let file = file.insert(f);
4217
4218                                                        data.extend_from_slice(&text);
4219
4220                                                        if let Ok(_) =
4221                                                            file.write_all(&data.as_ref()).await
4222                                                        {
4223                                                            data.clear();
4224                                                        }
4225                                                    }
4226                                                    _ => data.extend_from_slice(&text),
4227                                                };
4228                                            } else {
4229                                                if let Some(f) = file.as_mut() {
4230                                                    if let Ok(_) = f.write_all(&text).await {
4231                                                        data.extend_from_slice(&text)
4232                                                    }
4233                                                }
4234                                            }
4235                                        }
4236                                    }
4237                                    Err(e) => {
4238                                        log::error!("{e} in {}", target_url);
4239                                        break;
4240                                    }
4241                                }
4242                            }
4243
4244                            PageResponse {
4245                                #[cfg(feature = "headers")]
4246                                headers: Some(headers),
4247                                #[cfg(feature = "remote_addr")]
4248                                remote_addr: res.remote_addr(),
4249                                #[cfg(feature = "cookies")]
4250                                cookies,
4251                                content: Some(if file.is_some() {
4252                                    let mut buffer = vec![];
4253
4254                                    if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
4255                                        if let Ok(_) = b.read_to_end(&mut buffer).await {
4256                                            let _ = tokio::fs::remove_file(file_path).await;
4257                                        }
4258                                    }
4259
4260                                    Box::new(buffer.into())
4261                                } else {
4262                                    Box::new(data.into())
4263                                }),
4264                                status_code,
4265                                ..Default::default()
4266                            }
4267                        }
4268
4269                        Ok(res) => PageResponse {
4270                            #[cfg(feature = "headers")]
4271                            headers: Some(res.headers().clone()),
4272                            #[cfg(feature = "remote_addr")]
4273                            remote_addr: res.remote_addr(),
4274                            #[cfg(feature = "cookies")]
4275                            cookies: get_cookies(&res),
4276                            status_code: res.status(),
4277                            ..Default::default()
4278                        },
4279                        Err(err) => {
4280                            log::info!("error fetching {}", target_url);
4281                            let mut page_response = PageResponse::default();
4282
4283                            if let Some(status_code) = err.status() {
4284                                page_response.status_code = status_code;
4285                            } else {
4286                                page_response.status_code =
4287                                    crate::page::get_error_http_status_code(&err);
4288                            }
4289
4290                            page_response.error_for_status = Some(Err(err));
4291                            page_response
4292                        }
4293                    }
4294                }
4295            }
4296        }
4297    };
4298    set_page_response_duration(&mut page_response, duration);
4299
4300    page_response
4301}
4302
4303#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Build the cache lookup key for a raw URI string.
///
/// The key has the shape `METHOD:URI`, or `METHOD:URI:AUTH` when an
/// authentication token is supplied. The method segment is `GET` unless
/// `override_method` provides a replacement.
pub fn create_cache_key_raw(
    uri: &str,
    override_method: Option<&str>,
    auth: Option<&str>,
) -> String {
    let method = override_method.unwrap_or("GET");

    match auth {
        Some(authentication) => format!("{}:{}:{}", method, uri, authentication),
        None => format!("{}:{}", method, uri),
    }
}
4325
#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Build the cache lookup key from parsed HTTP request parts.
///
/// The request's own method is used unless `override_method` is given; the
/// key itself is produced by [`create_cache_key_raw`] from the stringified
/// request URI.
pub fn create_cache_key(
    parts: &http::request::Parts,
    override_method: Option<&str>,
    auth: Option<&str>,
) -> String {
    let uri = parts.uri.to_string();
    let method = match override_method {
        Some(method) => method,
        None => parts.method.as_str(),
    };

    create_cache_key_raw(&uri, Some(method), auth)
}
4339
/// Cache options to use for the request.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CacheOptions {
    /// Use the cache; no authentication token is attached.
    Yes,
    /// Use the cache together with the given authentication token.
    Authorized(String),
    /// Do not use the memory cache. This is the default.
    #[default]
    No,
    /// When a cached response exists, return its HTML directly and skip the browser entirely.
    SkipBrowser,
    /// Same as `SkipBrowser`, but with an authentication token.
    SkipBrowserAuthorized(String),
}
4356
4357#[inline]
4358/// Cache auth token.
4359pub fn cache_auth_token(cache_options: &std::option::Option<CacheOptions>) -> Option<&str> {
4360    cache_options.as_ref().and_then(|opt| match opt {
4361        CacheOptions::Authorized(token) | CacheOptions::SkipBrowserAuthorized(token) => {
4362            Some(token.as_str())
4363        }
4364        _ => None,
4365    })
4366}
4367
4368#[inline]
4369/// Check if cache options indicate browser should be skipped when cached.
4370pub fn cache_skip_browser(cache_options: &std::option::Option<CacheOptions>) -> bool {
4371    matches!(
4372        cache_options,
4373        Some(CacheOptions::SkipBrowser) | Some(CacheOptions::SkipBrowserAuthorized(_))
4374    )
4375}
4376
/// Basic cache policy controlling how staleness of a cached entry is judged.
#[derive(Clone, Debug, Default, Eq, Hash, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BasicCachePolicy {
    /// Allow stale caches: responses may be used even if they *should* be revalidated.
    AllowStale,
    /// Use this `SystemTime` as the reference "now" for staleness checks.
    Period(std::time::SystemTime),
    /// Use the default system time. This is the default policy.
    #[default]
    Normal,
}
4389
#[cfg(feature = "chrome_remote_cache")]
impl BasicCachePolicy {
    /// Map this policy onto the `chromiumoxide` cache policy variant of the same name.
    pub fn from_basic(&self) -> chromiumoxide::cache::BasicCachePolicy {
        use chromiumoxide::cache::BasicCachePolicy as ChromePolicy;

        match self {
            Self::AllowStale => ChromePolicy::AllowStale,
            Self::Normal => ChromePolicy::Normal,
            Self::Period(period) => ChromePolicy::Period(*period),
        }
    }
}
4401
#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Look up `target_url` in the shared cache manager and return the stored body decoded as text.
///
/// Returns `None` when caching is disabled (`CacheOptions::No` or no options),
/// when the lookup does not complete within 60 ms, when no entry is found,
/// when the entry is stale and the policy is not `AllowStale`, or when the
/// stored body is detected as binary.
pub async fn get_cached_url_base(
    target_url: &str,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>, // optional override/behavior
) -> Option<String> {
    use crate::http_cache_reqwest::CacheManager;

    // Only the authorized variants contribute a token to the cache key.
    let auth_token = match cache_options {
        None | Some(CacheOptions::No) => return None,
        Some(CacheOptions::Yes) | Some(CacheOptions::SkipBrowser) => None,
        Some(CacheOptions::Authorized(token))
        | Some(CacheOptions::SkipBrowserAuthorized(token)) => Some(token),
    };

    // Policy overrides:
    // - AllowStale: stale entries are accepted
    // - Period(t): `t` is the reference time for the staleness check
    // - Normal / None: the current system time is the reference
    let accept_stale = matches!(cache_policy, Some(BasicCachePolicy::AllowStale));
    let reference_time = match cache_policy {
        Some(BasicCachePolicy::Period(t)) => *t,
        _ => std::time::SystemTime::now(),
    };

    let cache_key = create_cache_key_raw(target_url, None, auth_token.as_deref());

    let lookup = tokio::time::timeout(Duration::from_millis(60), async {
        crate::website::CACACHE_MANAGER.get(&cache_key).await
    })
    .await;

    // A timeout, a cache error and a cache miss are all treated as "not cached".
    let (http_response, stored_policy) = match lookup {
        Ok(Ok(Some(entry))) => entry,
        _ => return None,
    };

    if !accept_stale && stored_policy.is_stale(reference_time) {
        return None;
    }

    let body = http_response.body;

    if auto_encoder::is_binary_file(&body) {
        return None;
    }

    // An empty header value is handled the same way as a missing header.
    let accept_lang = http_response
        .headers
        .get("accept-language")
        .filter(|h| !h.is_empty())
        .map_or("", |v| v);

    let text = if accept_lang.is_empty() {
        auto_encoder::auto_encode_bytes(&body)
    } else {
        auto_encoder::encode_bytes_from_language(&body, accept_lang)
    };

    Some(text)
}
4458
#[cfg(any(feature = "cache", feature = "cache_mem"))]
/// Look up cached text for `target_url`, retrying once with the trailing slash toggled.
///
/// The exact URL is tried first. On a miss, a URL ending in `/` is retried with
/// all trailing slashes removed (skipped when nothing would remain), and a URL
/// without a trailing `/` is retried with one appended.
pub async fn get_cached_url(
    target_url: &str,
    cache_options: Option<&CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
) -> Option<String> {
    let exact = get_cached_url_base(target_url, cache_options.cloned(), cache_policy).await;

    if exact.is_some() {
        return exact;
    }

    let alternate: Option<String> = if target_url.ends_with('/') {
        let stripped = target_url.trim_end_matches('/');

        (!stripped.is_empty() && stripped != target_url).then(|| stripped.to_string())
    } else {
        Some([target_url, "/"].concat())
    };

    match alternate {
        Some(alt) => get_cached_url_base(&alt, cache_options.cloned(), cache_policy).await,
        None => None,
    }
}
4493
4494#[cfg(all(not(feature = "cache"), not(feature = "cache_mem")))]
4495/// Perform a network request to a resource extracting all content as text streaming via chrome.
4496pub async fn get_cached_url(
4497    _target_url: &str,
4498    _cache_options: Option<&CacheOptions>,
4499    _cache_policy: &Option<BasicCachePolicy>,
4500) -> Option<String> {
4501    None
4502}
4503
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page through Chrome, reusing seeded or cached HTML when available.
///
/// `seeded_resource`, when present, takes the place of the cache lookup. If the
/// cache options request skipping the browser and HTML is available, that HTML
/// is returned directly with a `200 OK` status. Otherwise the page is loaded via
/// `fetch_page_html_chrome_base`; if that returns an error, the error is logged
/// and the result of `fetch_page_html_raw` is returned instead.
pub async fn fetch_page_html_base(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    seeded_resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let skip_browser = cache_skip_browser(&cache_options);

    let available_html = match seeded_resource {
        Some(seed) => Some(seed),
        None => get_cached_url(target_url, cache_options.as_ref(), cache_policy).await,
    };

    // Skip browser entirely if cached and skip_browser mode is enabled
    let cached_html = match available_html {
        Some(html) if skip_browser => {
            return PageResponse {
                content: Some(Box::new(html.into_bytes())),
                status_code: StatusCode::OK,
                final_url: Some(target_url.to_string()),
                ..Default::default()
            };
        }
        other => other,
    };

    let cached = cached_html.is_some();
    // With HTML on hand the HTML itself is passed as the source and the real
    // URL travels separately; otherwise the URL is the source.
    let source = cached_html.as_deref().unwrap_or(target_url);
    let url_target = if cached { Some(target_url) } else { None };

    let chrome_result = fetch_page_html_chrome_base(
        source,
        page,
        cached,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        url_target,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        &None,
        &None,
        jar,
        remote_multimodal,
    )
    .await;

    match chrome_result {
        Ok(response) => response,
        Err(err) => {
            log::error!("{:?}", err);
            fetch_page_html_raw(target_url, client).await
        }
    }
}
4584
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page through Chrome without seeded HTML and without a cookie jar.
///
/// Forwards every argument to [`fetch_page_html_base`], supplying `None` for
/// its `seeded_resource` and `jar` parameters.
pub async fn fetch_page_html(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let seeded_resource: Option<String> = None;
    let jar: Option<&std::sync::Arc<reqwest::cookie::Jar>> = None;

    fetch_page_html_base(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        seeded_resource,
        jar,
        remote_multimodal,
    )
    .await
}
4629
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page through Chrome with optional seeded HTML and an optional cookie jar.
///
/// Forwards every argument unchanged to [`fetch_page_html_base`].
pub async fn fetch_page_html_seeded(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    seeded_resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let fetch = fetch_page_html_base(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        seeded_resource,
        jar,
        remote_multimodal,
    );

    fetch.await
}
4676
#[cfg(feature = "chrome")]
/// Fetch a page through Chrome, falling back to a raw HTTP request when the Chrome fetch fails.
///
/// `resource`, when present, takes the place of the cache lookup. If
/// `fetch_page_html_chrome_base` returns an error, the error is logged and the
/// URL is requested with `client` instead; that response body is streamed into
/// memory and cut off once adding the next chunk would exceed `MAX_SIZE_BYTES`
/// (a limit of `0` disables the cut-off). The elapsed-time start is recorded
/// only when the `time` feature is enabled.
async fn _fetch_page_html_chrome(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let duration = cfg!(feature = "time").then(tokio::time::Instant::now);

    let cached_html = match resource {
        Some(seed) => Some(seed),
        None => get_cached_url(target_url, cache_options.as_ref(), cache_policy).await,
    };

    let cached = cached_html.is_some();
    // With HTML on hand the HTML itself is passed as the source and the real
    // URL travels separately; otherwise the URL is the source.
    let source = cached_html.as_deref().unwrap_or(target_url);
    let url_target = if cached { Some(target_url) } else { None };

    let chrome_result = fetch_page_html_chrome_base(
        source,
        page,
        cached,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        url_target,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        &None,
        &None,
        jar,
        remote_multimodal,
    )
    .await;

    let mut page_response = match chrome_result {
        Ok(response) => response,
        Err(err) => {
            log::error!(
                "{:?}. Error requesting: {} - defaulting to raw http request",
                err,
                target_url
            );

            match client.get(target_url).send().await {
                // The request itself failed: report a status and keep the error.
                Err(request_err) => {
                    log::info!("error fetching {}", target_url);

                    let status_code = match request_err.status() {
                        Some(code) => code,
                        None => crate::page::get_error_http_status_code(&request_err),
                    };

                    let mut failed = PageResponse::default();
                    failed.status_code = status_code;
                    failed.error_for_status = Some(Err(request_err));
                    failed
                }
                // A response arrived but its status is not one we parse: metadata only.
                Ok(res) if !valid_parsing_status(&res) => PageResponse {
                    #[cfg(feature = "headers")]
                    headers: Some(res.headers().clone()),
                    #[cfg(feature = "remote_addr")]
                    remote_addr: res.remote_addr(),
                    #[cfg(feature = "cookies")]
                    cookies: get_cookies(&res),
                    status_code: res.status(),
                    ..Default::default()
                },
                // Parsable response: stream the body into memory.
                Ok(res) => {
                    #[cfg(feature = "headers")]
                    let headers = res.headers().clone();
                    #[cfg(feature = "remote_addr")]
                    let remote_addr = res.remote_addr();
                    let cookies = get_cookies(&res);
                    let status_code = res.status();
                    let mut stream = res.bytes_stream();
                    let mut data = Vec::new();

                    while let Some(chunk) = stream.next().await {
                        let text = match chunk {
                            Ok(text) => text,
                            Err(e) => {
                                log::error!("{e} in {}", target_url);
                                break;
                            }
                        };

                        let limit = *MAX_SIZE_BYTES;

                        // Stop before the chunk that would push the body past the limit.
                        if limit > 0 && data.len() + text.len() > limit {
                            break;
                        }

                        data.extend_from_slice(&text);
                    }

                    PageResponse {
                        #[cfg(feature = "headers")]
                        headers: Some(headers),
                        #[cfg(feature = "remote_addr")]
                        remote_addr,
                        #[cfg(feature = "cookies")]
                        cookies,
                        content: Some(Box::new(data.into())),
                        status_code,
                        ..Default::default()
                    }
                }
            }
        }
    };

    set_page_response_duration(&mut page_response, duration);

    page_response
}
4829
#[cfg(feature = "chrome")]
/// Fetch a page through Chrome without a seeded resource.
///
/// Forwards every argument to `_fetch_page_html_chrome`, supplying `None` for
/// its `resource` parameter.
pub async fn fetch_page_html_chrome(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let resource: Option<String> = None;

    _fetch_page_html_chrome(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        resource,
        jar,
        remote_multimodal,
    )
    .await
}
4875
#[cfg(feature = "chrome")]
/// Fetch a page through Chrome, optionally seeded with an already available resource.
///
/// Forwards every argument unchanged to `_fetch_page_html_chrome`.
pub async fn fetch_page_html_chrome_seeded(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
    referrer: Option<String>,
    max_page_bytes: Option<f64>,
    cache_options: Option<CacheOptions>,
    cache_policy: &Option<BasicCachePolicy>,
    resource: Option<String>,
    jar: Option<&std::sync::Arc<reqwest::cookie::Jar>>,
    remote_multimodal: &Option<Box<RemoteMultimodalConfigs>>,
) -> PageResponse {
    let fetch = _fetch_page_html_chrome(
        target_url,
        client,
        page,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
        referrer,
        max_page_bytes,
        cache_options,
        cache_policy,
        resource,
        jar,
        remote_multimodal,
    );

    fetch.await
}
4922
4923#[cfg(not(feature = "openai"))]
4924/// Perform a request to OpenAI Chat. This does nothing without the 'openai' flag enabled.
4925pub async fn openai_request(
4926    _gpt_configs: &crate::configuration::GPTConfigs,
4927    _resource: String,
4928    _url: &str,
4929    _prompt: &str,
4930) -> crate::features::openai_common::OpenAIReturn {
4931    Default::default()
4932}
4933
#[cfg(feature = "openai")]
lazy_static! {
    /// The `cl100k_base` tokenizer, built on first use.
    static ref CORE_BPE_TOKEN_COUNT: tiktoken_rs::CoreBPE = tiktoken_rs::cl100k_base().unwrap();
    /// Semaphore whose permit count is derived from the logical and physical CPU counts.
    static ref SEM: tokio::sync::Semaphore = {
        let logical = num_cpus::get();
        let physical = num_cpus::get_physical();

        // (scaled permit count, lower bound) for each relation between the two counts.
        let (scaled, floor) = match logical.cmp(&physical) {
            std::cmp::Ordering::Greater => ((logical / physical) * 4, 10),
            std::cmp::Ordering::Equal => (logical * physical, 20),
            std::cmp::Ordering::Less => (logical * 4, 10),
        };

        // NOTE(review): the second value was originally named `sem_max`, but
        // because it is combined with `.max` it acts as a minimum permit
        // count. Confirm whether a cap (`.min`) was intended.
        tokio::sync::Semaphore::const_new((scaled / 3).max(floor))
    };
    /// Shared OpenAI client built with `async_openai::Client::new()`.
    static ref CLIENT: async_openai::Client<async_openai::config::OpenAIConfig> =
        async_openai::Client::new();
}
4958
#[cfg(feature = "openai")]
/// Perform a request to OpenAI Chat (uncached core implementation).
///
/// The page HTML in `resource` is cleaned, shrunk further if its token count exceeds the
/// computed token budget, and sent together with `url` and `prompt` as a chat completion
/// request configured from `gpt_configs` (model, user, temperature, top_p, response format).
///
/// Returns an `OpenAIReturn` holding the first choice's message content and the token
/// usage reported by the API. `error` is only populated when the semaphore permit cannot
/// be acquired or when the assistant message holding the page content fails to build.
/// A failed API call or a failed request build is logged (API call only) and yields a
/// default (empty) `response` with `error: None`.
pub async fn openai_request_base(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    // Wait for a slot on the shared semaphore before doing any work.
    match SEM.acquire().await {
        Ok(permit) => {
            // Base request builder: required settings first, optional ones only when set.
            let mut chat_completion_defaults =
                async_openai::types::CreateChatCompletionRequestArgs::default();
            let gpt_base = chat_completion_defaults
                .max_tokens(gpt_configs.max_tokens)
                .model(&gpt_configs.model);
            let gpt_base = match &gpt_configs.user {
                Some(user) => gpt_base.user(user),
                _ => gpt_base,
            };
            let gpt_base = match gpt_configs.temperature {
                Some(temp) => gpt_base.temperature(temp),
                _ => gpt_base,
            };
            let gpt_base = match gpt_configs.top_p {
                Some(tp) => gpt_base.top_p(tp),
                _ => gpt_base,
            };

            // Model-specific tokenizer; `None` falls back to `CORE_BPE_TOKEN_COUNT` below.
            let core_bpe = match tiktoken_rs::get_bpe_from_model(&gpt_configs.model) {
                Ok(bpe) => Some(bpe),
                _ => None,
            };

            let (tokens, prompt_tokens) = match &core_bpe {
                Some(core_bpe) => (
                    core_bpe.encode_with_special_tokens(&resource),
                    core_bpe.encode_with_special_tokens(&prompt),
                ),
                _ => (
                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&resource),
                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                ),
            };

            // Combined token count of the page content and the user prompt.
            // We can use this count later to perform concurrent actions.
            let output_tokens_count = tokens.len() + prompt_tokens.len();

            let mut max_tokens = crate::features::openai::calculate_max_tokens(
                &gpt_configs.model,
                gpt_configs.max_tokens,
                &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                &resource,
                &prompt,
            );

            // We need to slim down the content to fit the window. The cleaners are applied
            // in escalating order (`clean_html` -> `clean_html_slim` -> `clean_html_full`),
            // recomputing the budget and the token count after each pass and stopping as
            // soon as the content fits.
            let resource = if output_tokens_count > max_tokens {
                let r = clean_html(&resource);

                max_tokens = crate::features::openai::calculate_max_tokens(
                    &gpt_configs.model,
                    gpt_configs.max_tokens,
                    &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                    &r,
                    &prompt,
                );

                let (tokens, prompt_tokens) = match &core_bpe {
                    Some(core_bpe) => (
                        core_bpe.encode_with_special_tokens(&r),
                        core_bpe.encode_with_special_tokens(&prompt),
                    ),
                    _ => (
                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                    ),
                };

                let output_tokens_count = tokens.len() + prompt_tokens.len();

                if output_tokens_count > max_tokens {
                    let r = clean_html_slim(&r);

                    max_tokens = crate::features::openai::calculate_max_tokens(
                        &gpt_configs.model,
                        gpt_configs.max_tokens,
                        &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                        &r,
                        &prompt,
                    );

                    let (tokens, prompt_tokens) = match &core_bpe {
                        Some(core_bpe) => (
                            core_bpe.encode_with_special_tokens(&r),
                            core_bpe.encode_with_special_tokens(&prompt),
                        ),
                        _ => (
                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                        ),
                    };

                    let output_tokens_count = tokens.len() + prompt_tokens.len();

                    // Last resort: the result of `clean_html_full` is used without a
                    // further size check, and `max_tokens` is not recomputed for it.
                    if output_tokens_count > max_tokens {
                        clean_html_full(&r)
                    } else {
                        r
                    }
                } else {
                    r
                }
            } else {
                // Content already fits; it is still passed through the default cleaner.
                clean_html(&resource)
            };

            let mut tokens_used = crate::features::openai_common::OpenAIUsage::default();
            let json_mode = gpt_configs.extra_ai_data;

            // Response format: JSON object or plain text depending on `extra_ai_data`,
            // replaced by a JSON schema format when a parsable schema is configured.
            let response_format = {
                let mut mode = if json_mode {
                    async_openai::types::ResponseFormat::JsonObject
                } else {
                    async_openai::types::ResponseFormat::Text
                };

                if let Some(structure) = &gpt_configs.json_schema {
                    if let Some(schema) = &structure.schema {
                        // A schema string that fails to parse is ignored silently and the
                        // mode chosen above is kept.
                        if let Ok(mut schema) =
                            crate::features::serde_json::from_str::<serde_json::Value>(&schema)
                        {
                            if json_mode {
                                // Insert the "js" property into the schema's properties. Todo: capture if the js property exist and re-word prompt to match new js property with after removal.
                                if let Some(properties) = schema.get_mut("properties") {
                                    if let Some(properties_map) = properties.as_object_mut() {
                                        properties_map.insert(
                                            "js".to_string(),
                                            serde_json::json!({
                                                "type": "string"
                                            }),
                                        );
                                    }
                                }
                            }

                            mode = async_openai::types::ResponseFormat::JsonSchema {
                                json_schema: async_openai::types::ResponseFormatJsonSchema {
                                    description: structure.description.clone(),
                                    name: structure.name.clone(),
                                    schema: if schema.is_null() { None } else { Some(schema) },
                                    strict: structure.strict,
                                },
                            }
                        }
                    }
                }

                mode
            };

            // The page URL and cleaned HTML are sent as an assistant-role message.
            match async_openai::types::ChatCompletionRequestAssistantMessageArgs::default()
                .content(string_concat!("URL: ", url, "\n", "HTML: ", resource))
                .build()
            {
                Ok(resource_completion) => {
                    // Message order: system prompt, optional extra system prompt (JSON
                    // mode), page content, then the user prompt when it is not empty.
                    let mut messages: Vec<async_openai::types::ChatCompletionRequestMessage> =
                        vec![crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT.clone()];

                    if json_mode {
                        messages.push(
                            crate::features::openai::BROWSER_ACTIONS_SYSTEM_EXTRA_PROMPT.clone(),
                        );
                    }

                    messages.push(resource_completion.into());

                    if !prompt.is_empty() {
                        messages.push(
                            match async_openai::types::ChatCompletionRequestUserMessageArgs::default()
                            .content(prompt)
                            .build()
                        {
                            Ok(o) => o,
                            // A user message that fails to build is replaced by a default one.
                            _ => Default::default(),
                        }
                        .into()
                        )
                    }

                    // NOTE(review): `max_tokens as u32` truncates silently if the computed
                    // budget ever exceeds `u32::MAX`.
                    let v = match gpt_base
                        .max_tokens(max_tokens as u32)
                        .messages(messages)
                        .response_format(response_format)
                        .build()
                    {
                        Ok(request) => {
                            // A non-empty configured API key gets a one-off client derived
                            // from the shared client's config; otherwise the shared client
                            // is used as is.
                            let res = match &gpt_configs.api_key {
                                Some(key) => {
                                    if !key.is_empty() {
                                        let conf = CLIENT.config().to_owned();
                                        async_openai::Client::with_config(conf.with_api_key(key))
                                            .chat()
                                            .create(request)
                                            .await
                                    } else {
                                        CLIENT.chat().create(request).await
                                    }
                                }
                                _ => CLIENT.chat().create(request).await,
                            };

                            match res {
                                Ok(mut response) => {
                                    let mut choice = response.choices.first_mut();

                                    if let Some(usage) = response.usage.take() {
                                        tokens_used.prompt_tokens = usage.prompt_tokens;
                                        tokens_used.completion_tokens = usage.completion_tokens;
                                        tokens_used.total_tokens = usage.total_tokens;
                                    }

                                    // Only the first choice is read; missing content yields
                                    // the default value.
                                    match choice.as_mut() {
                                        Some(c) => match c.message.content.take() {
                                            Some(content) => content,
                                            _ => Default::default(),
                                        },
                                        _ => Default::default(),
                                    }
                                }
                                Err(err) => {
                                    // The API error is only logged; the returned `error`
                                    // field stays `None`.
                                    log::error!("{:?}", err);
                                    Default::default()
                                }
                            }
                        }
                        // The request failed to build: default response, nothing logged.
                        _ => Default::default(),
                    };

                    // Release the slot as soon as the request has completed.
                    drop(permit);

                    crate::features::openai_common::OpenAIReturn {
                        response: v,
                        usage: tokens_used,
                        error: None,
                    }
                }
                Err(e) => {
                    // The assistant message could not be built; report it via `error`.
                    let mut d = crate::features::openai_common::OpenAIReturn::default();

                    d.error = Some(e.to_string());

                    d
                }
            }
        }
        Err(e) => {
            // The semaphore permit could not be acquired; report it via `error`.
            let mut d = crate::features::openai_common::OpenAIReturn::default();

            d.error = Some(e.to_string());

            d
        }
    }
}
5223
#[cfg(all(feature = "openai", not(feature = "cache_openai")))]
/// Perform a request to OpenAI Chat.
///
/// Built when `openai` is enabled without `cache_openai`: every call is forwarded
/// directly to `openai_request_base`, with no caching in between.
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    let pending = openai_request_base(gpt_configs, resource, url, prompt);

    pending.await
}
5234
#[cfg(all(feature = "openai", feature = "cache_openai"))]
/// Perform a request to OpenAI Chat, consulting the configured cache first.
///
/// When `gpt_configs.cache` is set, a key is hashed from the url, prompt, model,
/// max tokens, the `extra_ai_data` flag and the page content. A cached entry is returned
/// with `usage.cached` set to `true`; otherwise the request is performed and its result
/// is stored under that key. Without a cache the request is forwarded directly.
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    use std::hash::{Hash, Hasher};

    if let Some(store) = &gpt_configs.cache {
        let key = {
            let mut hasher = ahash::AHasher::default();

            url.hash(&mut hasher);
            prompt.hash(&mut hasher);
            gpt_configs.model.hash(&mut hasher);
            gpt_configs.max_tokens.hash(&mut hasher);
            gpt_configs.extra_ai_data.hash(&mut hasher);
            // non-deterministic
            resource.hash(&mut hasher);

            hasher.finish()
        };

        if let Some(mut hit) = store.get(&key).await {
            hit.usage.cached = true;
            return hit;
        }

        let fresh = openai_request_base(gpt_configs, resource, url, prompt).await;
        let _ = store.insert(key, fresh.clone()).await;

        fresh
    } else {
        openai_request_base(gpt_configs, resource, url, prompt).await
    }
}
5274
#[cfg(any(feature = "gemini", feature = "real_browser"))]
lazy_static! {
    /// Semaphore for Gemini rate limiting: twice the CPU count, but never fewer than 8 permits.
    pub static ref GEMINI_SEM: tokio::sync::Semaphore =
        tokio::sync::Semaphore::const_new(std::cmp::max(num_cpus::get() * 2, 8));
}
5283
5284#[cfg(not(feature = "gemini"))]
5285/// Perform a request to Gemini. This does nothing without the 'gemini' flag enabled.
5286pub async fn gemini_request(
5287    _gemini_configs: &crate::configuration::GeminiConfigs,
5288    _resource: String,
5289    _url: &str,
5290    _prompt: &str,
5291) -> crate::features::gemini_common::GeminiReturn {
5292    Default::default()
5293}
5294
#[cfg(feature = "gemini")]
/// Perform a request to Gemini Chat (uncached core implementation).
///
/// Acquires a `GEMINI_SEM` permit, resolves the API key (config first, then the
/// `GEMINI_API_KEY` environment variable), cleans the HTML in `resource`, and sends a
/// single combined prompt (system prompt, url, html and user request) to the model.
///
/// Returns the response text and reported token usage. On any failure (no permit, missing
/// API key, client creation error, request error) a default `GeminiReturn` with `error`
/// set is returned instead.
pub async fn gemini_request_base(
    gemini_configs: &crate::configuration::GeminiConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::gemini_common::GeminiReturn {
    use crate::features::gemini_common::{GeminiReturn, GeminiUsage, DEFAULT_GEMINI_MODEL};

    // Hold a permit for the whole duration of the request.
    let permit = match GEMINI_SEM.acquire().await {
        Ok(permit) => permit,
        Err(e) => {
            return GeminiReturn {
                error: Some(e.to_string()),
                ..Default::default()
            };
        }
    };

    // A non-empty key from the config wins; otherwise fall back to the environment.
    let api_key = match gemini_configs.api_key.as_ref().filter(|key| !key.is_empty()) {
        Some(key) => key.clone(),
        None => match std::env::var("GEMINI_API_KEY") {
            Ok(key) => key,
            Err(_) => {
                return GeminiReturn {
                    error: Some("GEMINI_API_KEY not set".to_string()),
                    ..Default::default()
                };
            }
        },
    };

    // An empty model name selects the crate default.
    let model = if !gemini_configs.model.is_empty() {
        gemini_configs.model.clone()
    } else {
        DEFAULT_GEMINI_MODEL.to_string()
    };

    let client = match gemini_rust::Gemini::with_model(&api_key, model) {
        Ok(client) => client,
        Err(e) => {
            drop(permit);
            return GeminiReturn {
                error: Some(format!("Failed to create Gemini client: {}", e)),
                ..Default::default()
            };
        }
    };

    // Strip noisy markup to reduce token usage.
    let resource = clean_html(&resource);

    // The extra system prompt is appended only when extra AI data is requested.
    let system_prompt = if gemini_configs.extra_ai_data {
        format!(
            "{}\n\n{}",
            *crate::features::gemini::BROWSER_ACTIONS_SYSTEM_PROMPT,
            *crate::features::gemini::BROWSER_ACTIONS_SYSTEM_EXTRA_PROMPT
        )
    } else {
        crate::features::gemini::BROWSER_ACTIONS_SYSTEM_PROMPT.clone()
    };

    let full_prompt = format!(
        "{}\n\nURL: {}\nHTML: {}\n\nUser Request: {}",
        system_prompt, url, resource, prompt
    );

    // JSON output is requested whenever a schema config is present; the schema itself is
    // attached only when its string parses as JSON.
    let response_mime_type = gemini_configs
        .json_schema
        .is_some()
        .then(|| "application/json".to_string());
    let response_schema = gemini_configs
        .json_schema
        .as_ref()
        .and_then(|structure| structure.schema.as_ref())
        .and_then(|raw| serde_json::from_str::<serde_json::Value>(raw).ok());

    let gen_config = gemini_rust::GenerationConfig {
        max_output_tokens: Some(gemini_configs.max_tokens as i32),
        temperature: gemini_configs.temperature,
        top_p: gemini_configs.top_p,
        top_k: gemini_configs.top_k,
        response_mime_type,
        response_schema,
        ..Default::default()
    };

    let outcome = client
        .generate_content()
        .with_user_message(&full_prompt)
        .with_generation_config(gen_config)
        .execute()
        .await;

    // The slot is released as soon as the request has finished.
    drop(permit);

    match outcome {
        Err(e) => {
            log::error!("Gemini request failed: {:?}", e);
            GeminiReturn {
                error: Some(e.to_string()),
                ..Default::default()
            }
        }
        Ok(response) => {
            let text = response.text();

            // Missing usage metadata or missing individual counters fall back to defaults/0.
            let usage = response
                .usage_metadata
                .map(|meta| GeminiUsage {
                    prompt_tokens: meta.prompt_token_count.unwrap_or(0) as u32,
                    completion_tokens: meta.candidates_token_count.unwrap_or(0) as u32,
                    total_tokens: meta.total_token_count.unwrap_or(0) as u32,
                    cached: false,
                })
                .unwrap_or_default();

            GeminiReturn {
                response: text,
                usage,
                error: None,
            }
        }
    }
}
5427
#[cfg(all(feature = "gemini", not(feature = "cache_gemini")))]
/// Perform a request to Gemini Chat.
///
/// Built when `gemini` is enabled without `cache_gemini`: every call is forwarded
/// directly to `gemini_request_base`, with no caching in between.
pub async fn gemini_request(
    gemini_configs: &crate::configuration::GeminiConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::gemini_common::GeminiReturn {
    let pending = gemini_request_base(gemini_configs, resource, url, prompt);

    pending.await
}
5438
#[cfg(all(feature = "gemini", feature = "cache_gemini"))]
/// Perform a request to Gemini Chat, consulting the configured cache first.
///
/// When `gemini_configs.cache` is set, a key is hashed from the url, prompt, model,
/// max tokens, the `extra_ai_data` flag and the page content. A cached entry is returned
/// with `usage.cached` set to `true`; otherwise the request is performed and its result
/// is stored under that key. Without a cache the request is forwarded directly.
pub async fn gemini_request(
    gemini_configs: &crate::configuration::GeminiConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::gemini_common::GeminiReturn {
    use std::hash::{Hash, Hasher};

    if let Some(store) = &gemini_configs.cache {
        let key = {
            let mut hasher = ahash::AHasher::default();

            url.hash(&mut hasher);
            prompt.hash(&mut hasher);
            gemini_configs.model.hash(&mut hasher);
            gemini_configs.max_tokens.hash(&mut hasher);
            gemini_configs.extra_ai_data.hash(&mut hasher);
            resource.hash(&mut hasher);

            hasher.finish()
        };

        if let Some(mut hit) = store.get(&key).await {
            hit.usage.cached = true;
            return hit;
        }

        let fresh = gemini_request_base(gemini_configs, resource, url, prompt).await;
        let _ = store.insert(key, fresh.clone()).await;

        fresh
    } else {
        gemini_request_base(gemini_configs, resource, url, prompt).await
    }
}
5477
/// Raw passthrough "cleaner": returns an owned copy of the html without modifying it.
///
/// Unlike the other cleaners, nothing (css, js, comments) is stripped here.
#[inline]
pub fn clean_html_raw(html: &str) -> String {
    html.to_owned()
}
5483
5484/// Clean the html removing css and js (base).
5485///
5486/// Uses `lol_html` to strip noisy elements and reduce prompt size.
5487pub fn clean_html_base(html: &str) -> String {
5488    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
5489
5490    match rewrite_str(
5491        html,
5492        RewriteStrSettings {
5493            element_content_handlers: vec![
5494                element!("script", |el| {
5495                    el.remove();
5496                    Ok(())
5497                }),
5498                element!("style", |el| {
5499                    el.remove();
5500                    Ok(())
5501                }),
5502                element!("link", |el| {
5503                    el.remove();
5504                    Ok(())
5505                }),
5506                element!("iframe", |el| {
5507                    el.remove();
5508                    Ok(())
5509                }),
5510                element!("[style*='display:none']", |el| {
5511                    el.remove();
5512                    Ok(())
5513                }),
5514                element!("[id*='ad']", |el| {
5515                    el.remove();
5516                    Ok(())
5517                }),
5518                element!("[class*='ad']", |el| {
5519                    el.remove();
5520                    Ok(())
5521                }),
5522                element!("[id*='tracking']", |el| {
5523                    el.remove();
5524                    Ok(())
5525                }),
5526                element!("[class*='tracking']", |el| {
5527                    el.remove();
5528                    Ok(())
5529                }),
5530                element!("meta", |el| {
5531                    if let Some(attribute) = el.get_attribute("name") {
5532                        if attribute != "title" && attribute != "description" {
5533                            el.remove();
5534                        }
5535                    } else {
5536                        el.remove();
5537                    }
5538                    Ok(())
5539                }),
5540            ],
5541            document_content_handlers: vec![doc_comments!(|c| {
5542                c.remove();
5543                Ok(())
5544            })],
5545            ..RewriteStrSettings::default()
5546        },
5547    ) {
5548        Ok(r) => r,
5549        _ => html.into(),
5550    }
5551}
5552
5553/// Clean the HTML to slim-fit models. This removes base64 images and heavy nodes.
5554pub fn clean_html_slim(html: &str) -> String {
5555    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
5556
5557    match rewrite_str(
5558        html,
5559        RewriteStrSettings {
5560            element_content_handlers: vec![
5561                element!("script", |el| {
5562                    el.remove();
5563                    Ok(())
5564                }),
5565                element!("style", |el| {
5566                    el.remove();
5567                    Ok(())
5568                }),
5569                element!("svg", |el| {
5570                    el.remove();
5571                    Ok(())
5572                }),
5573                element!("noscript", |el| {
5574                    el.remove();
5575                    Ok(())
5576                }),
5577                element!("link", |el| {
5578                    el.remove();
5579                    Ok(())
5580                }),
5581                element!("iframe", |el| {
5582                    el.remove();
5583                    Ok(())
5584                }),
5585                element!("canvas", |el| {
5586                    el.remove();
5587                    Ok(())
5588                }),
5589                element!("video", |el| {
5590                    el.remove();
5591                    Ok(())
5592                }),
5593                element!("img", |el| {
5594                    if let Some(src) = el.get_attribute("src") {
5595                        if src.starts_with("data:image") {
5596                            el.remove();
5597                        }
5598                    }
5599                    Ok(())
5600                }),
5601                element!("picture", |el| {
5602                    // picture usually has nested sources; still remove if it’s inline-data heavy
5603                    // (this is conservative; keep if you want structure)
5604                    if let Some(src) = el.get_attribute("src") {
5605                        if src.starts_with("data:image") {
5606                            el.remove();
5607                        }
5608                    }
5609                    Ok(())
5610                }),
5611                element!("[style*='display:none']", |el| {
5612                    el.remove();
5613                    Ok(())
5614                }),
5615                element!("[id*='ad']", |el| {
5616                    el.remove();
5617                    Ok(())
5618                }),
5619                element!("[class*='ad']", |el| {
5620                    el.remove();
5621                    Ok(())
5622                }),
5623                element!("[id*='tracking']", |el| {
5624                    el.remove();
5625                    Ok(())
5626                }),
5627                element!("[class*='tracking']", |el| {
5628                    el.remove();
5629                    Ok(())
5630                }),
5631                element!("meta", |el| {
5632                    if let Some(attribute) = el.get_attribute("name") {
5633                        if attribute != "title" && attribute != "description" {
5634                            el.remove();
5635                        }
5636                    } else {
5637                        el.remove();
5638                    }
5639                    Ok(())
5640                }),
5641            ],
5642            document_content_handlers: vec![doc_comments!(|c| {
5643                c.remove();
5644                Ok(())
5645            })],
5646            ..RewriteStrSettings::default()
5647        },
5648    ) {
5649        Ok(r) => r,
5650        _ => html.into(),
5651    }
5652}
5653
5654/// Clean the most extra properties in the html to fit the context.
5655/// Removes nav/footer, trims meta, and prunes most attributes except id/class/data-*.
5656pub fn clean_html_full(html: &str) -> String {
5657    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
5658
5659    match rewrite_str(
5660        html,
5661        RewriteStrSettings {
5662            element_content_handlers: vec![
5663                element!("nav, footer", |el| {
5664                    el.remove();
5665                    Ok(())
5666                }),
5667                element!("meta", |el| {
5668                    let name = el.get_attribute("name").map(|n| n.to_lowercase());
5669                    if !matches!(name.as_deref(), Some("viewport") | Some("charset")) {
5670                        el.remove();
5671                    }
5672                    Ok(())
5673                }),
5674                element!("*", |el| {
5675                    // Keep only: id, class, data-*
5676                    let mut to_remove: Vec<String> = Vec::new();
5677                    for attr in el.attributes().iter() {
5678                        let n = attr.name();
5679                        let keep = n == "id" || n == "class" || n.starts_with("data-");
5680                        if !keep {
5681                            to_remove.push(n);
5682                        }
5683                    }
5684                    for attr in to_remove {
5685                        el.remove_attribute(&attr);
5686                    }
5687                    Ok(())
5688                }),
5689            ],
5690            document_content_handlers: vec![doc_comments!(|c| {
5691                c.remove();
5692                Ok(())
5693            })],
5694            ..RewriteStrSettings::default()
5695        },
5696    ) {
5697        Ok(r) => r,
5698        _ => html.into(),
5699    }
5700}
5701
/// Default cleaner used by the engine (slim build).
///
/// With the `openai_slim_fit` feature enabled this delegates to `clean_html_slim`,
/// returning the cleaned html as a new `String`.
#[cfg(feature = "openai_slim_fit")]
#[inline]
pub fn clean_html(html: &str) -> String {
    clean_html_slim(html)
}
5710
/// Default cleaner used by the engine (non-slim build).
///
/// Without the `openai_slim_fit` feature this delegates to `clean_html_base`,
/// returning the cleaned html as a new `String`.
#[cfg(not(feature = "openai_slim_fit"))]
#[inline]
pub fn clean_html(html: &str) -> String {
    clean_html_base(html)
}
5717
5718/// Log to console if configuration verbose.
5719pub fn log(message: &'static str, data: impl AsRef<str>) {
5720    if log_enabled!(Level::Info) {
5721        info!("{message} - {}", data.as_ref());
5722    }
5723}
5724
#[cfg(feature = "control")]
/// Control action for a crawl, sent over the `CONTROLLER` watch channel together with
/// the target identifier.
#[derive(PartialEq, Debug)]
pub enum Handler {
    /// Crawl start state (also the initial value held by the control channel).
    Start,
    /// Crawl pause state
    Pause,
    /// Crawl resume
    Resume,
    /// Crawl shutdown
    Shutdown,
}
5738
#[cfg(feature = "control")]
lazy_static! {
    /// Control handle for crawls: both ends of a watch channel carrying `(target, action)`
    /// pairs, shared behind an async `RwLock`. The channel starts out holding
    /// `("handles", Handler::Start)`.
    pub static ref CONTROLLER: std::sync::Arc<
        tokio::sync::RwLock<(
            tokio::sync::watch::Sender<(String, Handler)>,
            tokio::sync::watch::Receiver<(String, Handler)>,
        )>,
    > = {
        let channel = tokio::sync::watch::channel(("handles".to_string(), Handler::Start));

        std::sync::Arc::new(tokio::sync::RwLock::new(channel))
    };
}
5746
/// Pause the running crawl identified by `target`.
///
/// The crawl_id is prepended directly to the domain and required if set,
/// ex: `d22323edsd-https://mydomain.com`. A failed send is logged as an
/// error instead of being returned.
#[cfg(feature = "control")]
pub async fn pause(target: &str) {
    let controller = CONTROLLER.write().await;
    let sent = controller.0.send((target.into(), Handler::Pause));

    if let Err(e) = sent {
        log::error!("PAUSE: {:?}", e);
    }
}
5759
/// Resume the crawl identified by `target`.
///
/// The crawl_id is prepended directly to the domain and required if set,
/// ex: `d22323edsd-https://mydomain.com`. A failed send is logged as an
/// error instead of being returned.
#[cfg(feature = "control")]
pub async fn resume(target: &str) {
    let controller = CONTROLLER.write().await;
    let sent = controller.0.send((target.into(), Handler::Resume));

    if let Err(e) = sent {
        log::error!("RESUME: {:?}", e);
    }
}
5772
/// Shut down the crawl identified by `target`.
///
/// The crawl_id is prepended directly to the domain and required if set,
/// ex: `d22323edsd-https://mydomain.com`. A failed send is logged as an
/// error instead of being returned.
#[cfg(feature = "control")]
pub async fn shutdown(target: &str) {
    let controller = CONTROLLER.write().await;
    let sent = controller.0.send((target.into(), Handler::Shutdown));

    if let Err(e) = sent {
        log::error!("SHUTDOWN: {:?}", e);
    }
}
5785
/// Reset the crawl identified by `target` by sending `Handler::Start`.
///
/// The crawl_id is prepended directly to the domain and required if set,
/// ex: `d22323edsd-https://mydomain.com`. A failed send is logged as an
/// error instead of being returned.
#[cfg(feature = "control")]
pub async fn reset(target: &str) {
    let controller = CONTROLLER.write().await;
    let sent = controller.0.send((target.into(), Handler::Start));

    if let Err(e) = sent {
        log::error!("RESET: {:?}", e);
    }
}
5798
5799/// Setup selectors for handling link targets.
5800pub(crate) fn setup_website_selectors(url: &str, allowed: AllowedDomainTypes) -> RelativeSelectors {
5801    let subdomains = allowed.subdomains;
5802    let tld = allowed.tld;
5803
5804    crate::page::get_page_selectors_base(url, subdomains, tld)
5805}
5806
/// Flags stating whether subdomains and/or TLDs are allowed.
#[derive(Clone, Copy, Debug, Default)]
pub struct AllowedDomainTypes {
    /// Whether subdomains are allowed.
    pub subdomains: bool,
    /// Whether TLDs are allowed.
    pub tld: bool,
}

impl AllowedDomainTypes {
    /// Create the flag pair from its two parts.
    pub fn new(subdomains: bool, tld: bool) -> Self {
        AllowedDomainTypes { subdomains, tld }
    }
}
5822
5823/// Modify the selectors for targetting a website.
5824pub(crate) fn modify_selectors(
5825    prior_domain: &Option<Box<Url>>,
5826    domain: &str,
5827    domain_parsed: &mut Option<Box<Url>>,
5828    url: &mut Box<CaseInsensitiveString>,
5829    base: &mut RelativeSelectors,
5830    allowed: AllowedDomainTypes,
5831) {
5832    *domain_parsed = parse_absolute_url(domain);
5833    *url = Box::new(domain.into());
5834    let s = setup_website_selectors(url.inner(), allowed);
5835    base.0 = s.0;
5836    base.1 = s.1;
5837    if let Some(prior_domain) = prior_domain {
5838        if let Some(dname) = prior_domain.host_str() {
5839            base.2 = dname.into();
5840        }
5841    }
5842}
5843
/// Return the text after the final `/` in `path`.
///
/// A path without any `/` is returned unchanged, and a path that ends in
/// `/` yields an empty string.
pub fn get_last_segment(path: &str) -> &str {
    match path.rsplit_once('/') {
        Some((_, tail)) => tail,
        None => path,
    }
}
5857
/// Return everything from the first `/` that follows the first `//` in `url`.
///
/// Falls back to `"/"` when the url has no `//` or nothing but a host after
/// it.
pub(crate) fn get_path_from_url(url: &str) -> &str {
    url.split_once("//")
        .and_then(|(_, rest)| rest.find('/').map(|slash| &rest[slash..]))
        .unwrap_or("/")
}
5873
/// Return the part of `url` between the first `//` and the next `/`.
///
/// When the url has no `//`, the text before its first `/` is returned; with
/// no `/` to stop at, the remainder is returned whole.
pub(crate) fn get_domain_from_url(url: &str) -> &str {
    let after_scheme = url.split_once("//").map_or(url, |(_, rest)| rest);

    match after_scheme.find('/') {
        Some(slash) => &after_scheme[..slash],
        None => after_scheme,
    }
}
5892
/// Report whether `url` starts with one of the supported scheme prefixes:
/// `https://`, `http://`, `file://` or `ftp://`. The match is case-sensitive.
pub fn networking_capable(url: &str) -> bool {
    const SCHEMES: [&str; 4] = ["https://", "http://", "file://", "ftp://"];

    SCHEMES.iter().any(|scheme| url.starts_with(*scheme))
}
5900
/// Prepare the url for parsing if it fails. Use this method if the url does
/// not start with http or https.
///
/// Everything up to and including the first `://` is replaced with
/// `https://`; input without a `://` simply gets `https://` prepended.
pub fn prepare_url(u: &str) -> String {
    // `split_once` works on byte offsets, so the remainder is correct even
    // when non-ASCII characters precede the `://` separator.
    let rest = u.split_once("://").map_or(u, |(_, rest)| rest);

    format!("https://{rest}")
}
5915
5916/// normalize the html markup to prevent Maliciousness.
5917pub(crate) async fn normalize_html(html: &[u8]) -> Vec<u8> {
5918    use lol_html::{element, send::Settings};
5919
5920    let mut output = Vec::new();
5921
5922    let mut rewriter = HtmlRewriter::new(
5923        Settings {
5924            element_content_handlers: vec![
5925                element!("a[href]", |el| {
5926                    el.remove_attribute("href");
5927                    Ok(())
5928                }),
5929                element!("script, style, iframe, base, noscript", |el| {
5930                    el.remove();
5931                    Ok(())
5932                }),
5933                element!("*", |el| {
5934                    let mut remove_attr = vec![];
5935
5936                    for attr in el.attributes() {
5937                        let name = attr.name();
5938                        let remove =
5939                            !(name.starts_with("data-") || name == "id" || name == "class");
5940                        if remove {
5941                            remove_attr.push(name);
5942                        }
5943                    }
5944
5945                    for name in remove_attr {
5946                        el.remove_attribute(&name);
5947                    }
5948
5949                    Ok(())
5950                }),
5951            ],
5952            ..Settings::new_send()
5953        },
5954        |c: &[u8]| output.extend_from_slice(c),
5955    );
5956
5957    let chunks = html.chunks(*STREAMING_CHUNK_SIZE);
5958    let mut stream = tokio_stream::iter(chunks);
5959    let mut wrote_error = false;
5960
5961    while let Some(chunk) = stream.next().await {
5962        if rewriter.write(chunk).is_err() {
5963            wrote_error = true;
5964            break;
5965        }
5966    }
5967
5968    if !wrote_error {
5969        let _ = rewriter.end();
5970    }
5971
5972    output
5973}
5974
5975/// Hash html markup.
5976pub(crate) async fn hash_html(html: &[u8]) -> u64 {
5977    let normalized_html = normalize_html(html).await;
5978
5979    if !normalized_html.is_empty() {
5980        use std::hash::{Hash, Hasher};
5981        let mut s = ahash::AHasher::default();
5982        normalized_html.hash(&mut s);
5983        let key = s.finish();
5984        key
5985    } else {
5986        Default::default()
5987    }
5988}
5989
/// Spawn `future` as a new asynchronous task named `task_name`.
///
/// # Panics
///
/// Panics with "failed to spawn task" if the task builder reports an error.
#[cfg(feature = "tracing")]
pub(crate) fn spawn_task<F>(task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    let builder = tokio::task::Builder::new().name(task_name);

    builder.spawn(future).expect("failed to spawn task")
}
6002
6003#[cfg(not(feature = "tracing"))]
6004#[allow(unused)]
6005/// Spawns a new asynchronous task.
6006pub(crate) fn spawn_task<F>(_task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
6007where
6008    F: std::future::Future + Send + 'static,
6009    F::Output: Send + 'static,
6010{
6011    tokio::task::spawn(future)
6012}
6013
/// Spawn `future` into the join set `set` as a task named `task_name`.
///
/// # Panics
///
/// Panics with "set should spawn" if the task builder reports an error.
#[cfg(feature = "tracing")]
pub(crate) fn spawn_set<F, T>(
    task_name: &str,
    set: &mut tokio::task::JoinSet<T>,
    future: F,
) -> tokio::task::AbortHandle
where
    F: Future<Output = T> + Send + 'static,
    T: Send + 'static,
{
    let builder = set.build_task().name(task_name);

    builder.spawn(future).expect("set should spawn")
}
6031
6032#[cfg(not(feature = "tracing"))]
6033/// Spawn a joinset.
6034pub(crate) fn spawn_set<F, T>(
6035    _task_name: &str,
6036    set: &mut tokio::task::JoinSet<T>,
6037    future: F,
6038) -> tokio::task::AbortHandle
6039where
6040    F: Future<Output = T>,
6041    F: Send + 'static,
6042    T: Send + 'static,
6043{
6044    set.spawn(future)
6045}
6046
6047#[cfg(feature = "balance")]
/// Pause applied by `get_semaphore` when the detected CPU state is `2`,
/// giving the system time to rebalance before work continues.
const REBALANCE_TIME: Duration = Duration::from_millis(100);
6050
/// Choose the semaphore to use for the current CPU state.
///
/// With `detect` off, the caller's `semaphore` is returned without probing
/// the system. Otherwise the global CPU state decides: below `1` the
/// caller's `semaphore` is used, at `1` or above the shared
/// `crate::website::SEM_SHARED` is used, and at exactly `2` the call first
/// sleeps for `REBALANCE_TIME`.
#[cfg(feature = "balance")]
pub async fn get_semaphore(semaphore: &Arc<Semaphore>, detect: bool) -> &Arc<Semaphore> {
    if !detect {
        return semaphore;
    }

    let cpu_load = detect_system::get_global_cpu_state().await;

    if cpu_load == 2 {
        tokio::time::sleep(REBALANCE_TIME).await;
    }

    if cpu_load < 1 {
        semaphore
    } else {
        &*crate::website::SEM_SHARED
    }
}
6070
/// Report whether the time elapsed since `start` has reached `crawl_timeout`.
///
/// Returns `false` when either the timeout or the start instant is missing.
pub fn crawl_duration_expired(crawl_timeout: &Option<Duration>, start: &Option<Instant>) -> bool {
    match (crawl_timeout, start) {
        (Some(limit), Some(began)) => began.elapsed() >= *limit,
        _ => false,
    }
}
6077
6078/// is the content html and safe for formatting.
6079static HTML_TAGS: phf::Set<&'static [u8]> = phf_set! {
6080    b"<!doctype html",
6081    b"<html",
6082    b"<document",
6083};
6084
6085/// Check if the content is HTML.
6086pub fn is_html_content_check(bytes: &[u8]) -> bool {
6087    let check_bytes = if bytes.len() > 1024 {
6088        &bytes[..1024]
6089    } else {
6090        bytes
6091    };
6092
6093    for tag in HTML_TAGS.iter() {
6094        if check_bytes
6095            .windows(tag.len())
6096            .any(|window| window.eq_ignore_ascii_case(tag))
6097        {
6098            return true;
6099        }
6100    }
6101
6102    false
6103}
6104
6105/// Return the semaphore that should be used.
6106#[cfg(not(feature = "balance"))]
6107pub async fn get_semaphore(semaphore: &Arc<Semaphore>, _detect: bool) -> &Arc<Semaphore> {
6108    semaphore
6109}
6110
6111// #[derive(Debug)]
6112// /// Html output sink for the rewriter.
6113// #[cfg(feature = "smart")]
6114// pub(crate) struct HtmlOutputSink {
6115//     /// The bytes collected.
6116//     pub(crate) data: Vec<u8>,
6117//     /// The sender to send once finished.
6118//     pub(crate) sender: Option<tokio::sync::oneshot::Sender<Vec<u8>>>,
6119// }
6120
6121// #[cfg(feature = "smart")]
6122// impl HtmlOutputSink {
6123//     /// A new output sink.
6124//     pub(crate) fn new(sender: tokio::sync::oneshot::Sender<Vec<u8>>) -> Self {
6125//         HtmlOutputSink {
6126//             data: Vec::new(),
6127//             sender: Some(sender),
6128//         }
6129//     }
6130// }
6131
6132// #[cfg(feature = "smart")]
6133// impl OutputSink for HtmlOutputSink {
6134//     fn handle_chunk(&mut self, chunk: &[u8]) {
6135//         self.data.extend_from_slice(chunk);
6136//         if chunk.len() == 0 {
6137//             if let Some(sender) = self.sender.take() {
6138//                 let data_to_send = std::mem::take(&mut self.data);
6139//                 let _ = sender.send(data_to_send);
6140//             }
6141//         }
6142//     }
6143// }
6144
/// Consumes `set` and deals its items round-robin into `parts` buckets.
///
/// The returned vector has exactly `parts` sets whose sizes differ by at
/// most one; buckets stay empty when `set` has fewer items than `parts`.
/// When `parts` is `0` or `1` the original set is returned as the only
/// element. Which item lands in which bucket follows the set's iteration
/// order and is therefore unspecified.
pub fn split_hashset_round_robin<T>(set: HashSet<T>, parts: usize) -> Vec<HashSet<T>>
where
    T: Eq + std::hash::Hash,
{
    if parts <= 1 {
        return vec![set];
    }

    // One slot of headroom covers the buckets that receive the remainder.
    let per_bucket = set.len() / parts + 1;
    let mut buckets: Vec<HashSet<T>> = (0..parts)
        .map(|_| HashSet::with_capacity(per_bucket))
        .collect();

    for (i, item) in set.into_iter().enumerate() {
        buckets[i % parts].insert(item);
    }

    buckets
}
/// Emit a `fetch <link>` info event through `tracing`.
#[cfg(feature = "tracing")]
pub fn emit_log(link: &str) {
    tracing::info!("fetch {}", link);
}
6170/// Emit a log info event.
6171#[cfg(not(feature = "tracing"))]
6172pub fn emit_log(link: &str) {
6173    log::info!("fetch {}", &link);
6174}
6175
/// Emit a `shutdown <link>` info event through `tracing`.
#[cfg(feature = "tracing")]
pub fn emit_log_shutdown(link: &str) {
    tracing::info!("shutdown {}", link);
}
6181/// Emit a log info event.
6182#[cfg(not(feature = "tracing"))]
6183pub fn emit_log_shutdown(link: &str) {
6184    log::info!("shutdown {}", &link);
6185}