spider/utils/
mod.rs

1/// Absolute path domain handling.
2pub mod abs;
3/// Connect layer for reqwest.
4pub mod connect;
5/// Generic CSS selectors.
6pub mod css_selectors;
7/// Utils to modify the HTTP header.
8pub mod header_utils;
9/// String interner.
10pub mod interner;
11/// A trie struct.
12pub mod trie;
13
14#[cfg(feature = "balance")]
15/// CPU and Memory detection to balance limitations.
16pub mod detect_system;
17use crate::page::{AntiBotTech, Metadata};
18use crate::{page::STREAMING_CHUNK_SIZE, RelativeSelectors};
19use abs::parse_absolute_url;
20use aho_corasick::AhoCorasick;
21use auto_encoder::is_binary_file;
22use bytes::BufMut;
23use case_insensitive_string::CaseInsensitiveString;
24#[cfg(feature = "chrome")]
25use hashbrown::HashMap;
26use lol_html::{send::HtmlRewriter, OutputSink};
27use phf::phf_set;
28use std::str::FromStr;
29use std::sync::Arc;
30use std::{
31    future::Future,
32    time::{Duration, Instant},
33};
34use tokio::sync::Semaphore;
35use url::Url;
36
37#[cfg(feature = "chrome")]
38use crate::features::chrome_common::{AutomationScripts, ExecutionScripts};
39use crate::page::{MAX_PRE_ALLOCATED_HTML_PAGE_SIZE, MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE};
40use crate::tokio_stream::StreamExt;
41use crate::Client;
42
43#[cfg(feature = "cache_chrome_hybrid")]
44use http_cache_semantics::{RequestLike, ResponseLike};
45
46use log::{info, log_enabled, Level};
47
48#[cfg(not(feature = "rquest"))]
49use reqwest::{Response, StatusCode};
50#[cfg(feature = "rquest")]
51use rquest::{Response, StatusCode};
52
/// The request error when the default `reqwest` client is used, i.e. neither the
/// `cache_request` nor the `rquest` feature is enabled.
#[cfg(all(not(feature = "cache_request"), not(feature = "rquest")))]
pub(crate) type RequestError = reqwest::Error;

/// The request error (for `rquest`) when the `cache_request` feature is disabled.
#[cfg(all(not(feature = "cache_request"), feature = "rquest"))]
pub(crate) type RequestError = rquest::Error;

/// The request error (for `reqwest_middleware` with caching). Selected whenever the
/// `cache_request` feature is enabled, regardless of the `rquest` feature.
#[cfg(feature = "cache_request")]
pub(crate) type RequestError = reqwest_middleware::Error;

/// The request response. Resolves to the `Response` type of whichever HTTP client
/// (`reqwest` or `rquest`) is imported above.
pub(crate) type RequestResponse = Response;
67
/// Sleep intervals, in milliseconds, cycled through while polling for page events
/// and selectors.
#[cfg(feature = "chrome")]
const WAIT_TIMEOUTS: [u64; 6] = [0, 20, 50, 100, 100, 500];
/// Sleep intervals, in milliseconds, cycled through while polling for the DOM to
/// finish updating.
#[cfg(feature = "chrome")]
const DOM_WAIT_TIMEOUTS: [u64; 6] = [100, 200, 300, 300, 400, 500];
74
/// Content types to ignore: a compile-time set of MIME types covering documents,
/// archives, images, audio, video and other binary formats.
pub static IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf_set! {
    "application/pdf",
    "application/zip",
    "application/x-rar-compressed",
    "application/x-tar",
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/bmp",
    "image/svg+xml",
    "video/mp4",
    "video/x-msvideo",
    "video/x-matroska",
    "video/webm",
    "audio/mpeg",
    "audio/ogg",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/x-7z-compressed",
    "application/x-rpm",
    "application/x-shockwave-flash",
};
101
lazy_static! {
    /// Scan for error anti-bot pages.
    ///
    /// The pattern order is significant: `detect_anti_bot_from_body` maps each pattern
    /// index to an `AntiBotTech` variant, so entries must not be reordered without
    /// updating that function.
    static ref AC_BODY_SCAN: AhoCorasick = AhoCorasick::new([
        "cf-error-code",
        "Access to this page has been denied",
        "DataDome",
        "perimeterx",
        "funcaptcha",
        "Request unsuccessful. Incapsula incident ID",
    ]).unwrap();

    /// Scan URLs for known anti-bot vendor hosts and paths.
    ///
    /// The trailing index comments mirror the match arms in `detect_antibot_from_url`;
    /// keep both in sync when adding or reordering patterns.
    static ref AC_URL_SCAN: AhoCorasick = AhoCorasick::builder()
        .match_kind(aho_corasick::MatchKind::LeftmostFirst) // optional: at the leftmost match position, prefer the pattern listed first
        .build([
            "/cdn-cgi/challenge-platform",       // 0
            "datadome.co",                       // 1
            "dd-api.io",                         // 2
            "perimeterx.net",                    // 3
            "px-captcha",                        // 4
            "arkoselabs.com",                    // 5
            "funcaptcha",                        // 6
            "kasada.io",                         // 7
            "fingerprint.com",                   // 8
            "fpjs.io",                           // 9
            "incapsula",                         // 10
            "imperva",                           // 11
            "radwarebotmanager",                 // 12
            "reblaze.com",                       // 13
            "cheq.ai",                           // 14
        ])
        .unwrap();
}
134
#[cfg(feature = "fs")]
lazy_static! {
    /// Temporary path prefix, computed once on first access.
    ///
    /// Creates a `spider/` directory under the system temp directory and returns its
    /// path with the current UNIX timestamp (whole seconds) appended. If the clock is
    /// before the UNIX epoch the bare directory path is returned, and if the directory
    /// cannot be created the value falls back to `"/tmp/"`.
    static ref TMP_DIR: String = {
        use std::fs;
        let mut tmp = std::env::temp_dir();

        tmp.push("spider/");

        // make sure spider dir is created.
        match fs::create_dir_all(&tmp) {
            Ok(_) => {
                let dir_name = tmp.display().to_string();

                // Suffix the directory path with the seconds since the UNIX epoch.
                match std::time::SystemTime::now().duration_since(std::time::SystemTime::UNIX_EPOCH) {
                    Ok(dur) => {
                        string_concat!(dir_name, dur.as_secs().to_string())
                    }
                    _ => dir_name,
                }
            }
            // Directory creation failed: use a fixed fallback location.
            _ => "/tmp/".to_string()
        }
    };
}
159
#[cfg(feature = "chrome")]
lazy_static! {
    /// Mask the chrome connection interception bytes from responses. Rejected responses send 17.0 bytes for the response.
    ///
    /// Enabled only when the `MASK_BYTES_INTERCEPTION` environment variable is exactly `"true"`.
    pub(crate) static ref MASK_BYTES_INTERCEPTION: bool = {
        std::env::var("MASK_BYTES_INTERCEPTION").unwrap_or_default() == "true"
    };
    /// Cloudflare turnstile wait: a 1000ms hard delay combined with an idle-network
    /// wait that is capped at 8 seconds.
    pub(crate) static ref CF_WAIT_FOR: crate::features::chrome_common::WaitFor = {
        let mut wait_for = crate::features::chrome_common::WaitFor::default();
        wait_for.delay = crate::features::chrome_common::WaitForDelay::new(Some(core::time::Duration::from_millis(1000))).into();
        // wait_for.dom = crate::features::chrome_common::WaitForSelector::new(Some(core::time::Duration::from_millis(1000)), "body".into()).into();
        wait_for.idle_network = crate::features::chrome_common::WaitForIdleNetwork::new(core::time::Duration::from_secs(8).into()).into();
        wait_for
    };
}
175
176lazy_static! {
177    /// Prevent fetching resources beyond the bytes limit.
178    pub(crate) static ref MAX_SIZE_BYTES: usize = {
179        match std::env::var("SPIDER_MAX_SIZE_BYTES") {
180            Ok(b) => {
181                const DEFAULT_MAX_SIZE_BYTES: usize = 1_073_741_824; // 1GB in bytes
182
183                let b = b.parse::<usize>().unwrap_or(DEFAULT_MAX_SIZE_BYTES);
184
185                if b == 0 {
186                    0
187                } else {
188                    b.max(1_048_576) // min 1mb
189                }
190            },
191            _ => 0
192        }
193    };
194}
195
#[cfg(all(feature = "chrome", feature = "real_browser"))]
lazy_static! {
    /// Trailing bytes of a Cloudflare challenge page (footer link variant).
    static ref CF_END: &'static [u8; 62] =
        b"target=\"_blank\">Cloudflare</a></div></div></div></body></html>";
    /// Trailing bytes of a Cloudflare challenge page ("Performance & security" variant).
    static ref CF_END2: &'static [u8; 72] =
        b"Performance &amp; security by Cloudflare</div></div></div></body></html>";
    /// Leading bytes checked together with `CF_MOCK_FRAME` to spot a challenge page.
    static ref CF_HEAD: &'static [u8; 34] = b"<html><head>\n    <style global=\"\">";
    /// Trailing hidden-iframe bytes checked together with `CF_HEAD`.
    static ref CF_MOCK_FRAME: &'static [u8; 137] = b"<iframe height=\"1\" width=\"1\" style=\"position: absolute; top: 0px; left: 0px; border: none; visibility: hidden;\"></iframe>\n\n</body></html>";
    /// Leading bytes of the "Just a moment..." interstitial page.
    static ref CF_JUST_A_MOMENT:&'static [u8; 81] = b"<!DOCTYPE html><html lang=\"en-US\" dir=\"ltr\"><head><title>Just a moment...</title>";
}
206
lazy_static! {
    /// Apache server forbidden: the exact bytes of the Apache "403 Forbidden" page that
    /// is emitted when the ErrorDocument itself is also forbidden.
    pub static ref APACHE_FORBIDDEN: &'static [u8; 317] = br#"<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
<p>Additionally, a 403 Forbidden
error was encountered while trying to use an ErrorDocument to handle the request.</p>
</body></html>"#;

    /// Open Resty forbidden: the leading bytes of the OpenResty "403 Forbidden" page.
    pub static ref OPEN_RESTY_FORBIDDEN: &'static [u8; 125] = br#"<html><head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr><center>openresty</center>"#;
}
225
226/// Detect if a page is forbidden and should not retry.
227pub fn detect_hard_forbidden_content(b: &[u8]) -> bool {
228    b == *APACHE_FORBIDDEN || b.starts_with(*OPEN_RESTY_FORBIDDEN)
229}
230
/// Is cloudflare turnstile page? This does nothing without the real_browser feature enabled.
///
/// The body is treated as a challenge page when it ends with either Cloudflare footer,
/// when it both starts with the challenge head and ends with the hidden mock iframe,
/// or when it starts with the "Just a moment..." document prologue.
#[cfg(all(feature = "chrome", feature = "real_browser"))]
pub(crate) fn detect_cf_turnstyle(b: &Vec<u8>) -> bool {
    let body = b.as_slice();
    let footer: &[u8] = *CF_END;
    let footer_alt: &[u8] = *CF_END2;
    let head: &[u8] = *CF_HEAD;
    let mock_frame: &[u8] = *CF_MOCK_FRAME;
    let just_a_moment: &[u8] = *CF_JUST_A_MOMENT;

    if body.ends_with(footer) || body.ends_with(footer_alt) {
        return true;
    }

    if body.starts_with(head) && body.ends_with(mock_frame) {
        return true;
    }

    body.starts_with(just_a_moment)
}
244
/// Handle protected pages via chrome. This does nothing without the real_browser feature enabled.
///
/// Waits for the page, clicks every iframe, waits again (including navigations) and
/// re-reads the HTML into `b`. The whole sequence is bounded by 30 seconds.
///
/// Returns `Ok(true)` when the refreshed HTML no longer looks like a challenge page,
/// `Ok(false)` otherwise, and `CdpError::Timeout` when the 30 second budget elapses.
#[cfg(all(feature = "chrome", feature = "real_browser"))]
async fn cf_handle(
    b: &mut Vec<u8>,
    page: &chromiumoxide::Page,
) -> Result<bool, chromiumoxide::error::CdpError> {
    let mut validated = false;

    let challenge_flow = async {
        let mut wait_for = CF_WAIT_FOR.clone();

        page_wait(page, &Some(wait_for.clone())).await;

        // Best effort: a failed click is ignored.
        let _ = page
            .evaluate(r###"document.querySelectorAll("iframe").forEach(el=>el.click());"###)
            .await;

        wait_for.page_navigations = true;

        page_wait(page, &Some(wait_for.clone())).await;

        if let Ok(first_pass) = page.outer_html_bytes().await {
            *b = if detect_cf_turnstyle(&first_pass) {
                // Still on the challenge page: keep what we have.
                first_pass
            } else {
                validated = true;
                // we should use wait for dom instead.
                wait_for.delay = crate::features::chrome_common::WaitForDelay::new(Some(
                    core::time::Duration::from_secs(4),
                ))
                .into();
                page_wait(page, &Some(wait_for)).await;
                // Prefer the HTML captured after the extra delay; fall back to the first read.
                page.outer_html_bytes().await.unwrap_or(first_pass)
            };
        }
    };

    let outcome =
        tokio::time::timeout(tokio::time::Duration::from_secs(30), challenge_flow).await;

    outcome
        .map(|_| validated)
        .map_err(|_| chromiumoxide::error::CdpError::Timeout)
}
293
/// Handle cloudflare protected pages via chrome. This does nothing without the real_browser feature enabled.
///
/// No-op variant compiled when `chrome` is enabled but `real_browser` is not: both
/// arguments are ignored and `Ok(())` is always returned.
#[cfg(all(feature = "chrome", not(feature = "real_browser")))]
async fn cf_handle(
    _b: &mut Vec<u8>,
    _page: &chromiumoxide::Page,
) -> Result<(), chromiumoxide::error::CdpError> {
    Ok(())
}
302
/// The response of a web page.
///
/// Aggregates the body, headers, status and auxiliary data collected while fetching a
/// page. Several fields only exist when the matching cargo feature is enabled.
#[derive(Debug, Default)]
pub struct PageResponse {
    /// The page response resource (raw body bytes), if any.
    pub content: Option<Box<Vec<u8>>>,
    /// The headers of the response. (Always None if a webdriver protocol is used for fetching.).
    pub headers: Option<reqwest::header::HeaderMap>,
    #[cfg(feature = "remote_addr")]
    /// The remote address of the page.
    pub remote_addr: Option<core::net::SocketAddr>,
    #[cfg(feature = "cookies")]
    /// The cookies of the response.
    pub cookies: Option<reqwest::header::HeaderMap>,
    /// The status code of the request.
    pub status_code: StatusCode,
    /// The final url destination after any redirects.
    pub final_url: Option<String>,
    /// The message of the response error if any.
    pub error_for_status: Option<Result<Response, RequestError>>,
    #[cfg(feature = "chrome")]
    /// The screenshot bytes of the page. The ScreenShotConfig bytes boolean needs to be set to true.
    pub screenshot_bytes: Option<Vec<u8>>,
    #[cfg(feature = "openai")]
    /// The credits used from OpenAI in order.
    pub openai_credits_used: Option<Vec<crate::features::openai_common::OpenAIUsage>>,
    #[cfg(feature = "openai")]
    /// The extra data from the AI, example extracting data etc...
    pub extra_ai_data: Option<Vec<crate::page::AIResults>>,
    /// A WAF was found on the page.
    pub waf_check: bool,
    /// The total bytes transferred for the page. Mainly used for chrome events. Inspect the content for bytes when using http instead.
    pub bytes_transferred: Option<f64>,
    /// The signature of the page to use for handling de-duplication.
    pub signature: Option<u64>,
    #[cfg(feature = "chrome")]
    /// All of the response events mapped with the amount of bytes used.
    pub response_map: Option<HashMap<String, f64>>,
    #[cfg(feature = "chrome")]
    /// All of the request events mapped with the time period of the event sent.
    pub request_map: Option<HashMap<String, f64>>,
    /// The anti-bot tech used.
    pub anti_bot_tech: crate::page::AntiBotTech,
    /// The metadata of the page.
    pub metadata: Option<Box<Metadata>>,
}
348
/// Wait for the first event of type `T` on the page, with an optional timeout.
///
/// Registers an event listener for `T` and returns as soon as one of the following
/// happens:
/// * an event is received;
/// * the listener stream ends (no further event can arrive);
/// * `timeout` elapses, when one is provided.
///
/// If the listener cannot be registered the function returns immediately.
#[cfg(feature = "chrome")]
pub async fn wait_for_event<T>(page: &chromiumoxide::Page, timeout: Option<core::time::Duration>)
where
    T: chromiumoxide::cdp::IntoEventKind + Unpin + std::fmt::Debug,
{
    if let Ok(mut events) = page.event_listener::<T>().await {
        let wait_until = async {
            let mut index = 0;

            loop {
                let current_timeout = WAIT_TIMEOUTS[index];
                let sleep = tokio::time::sleep(tokio::time::Duration::from_millis(current_timeout));

                tokio::select! {
                    _ = sleep => (),
                    // Stop on the first event. Also stop when the stream is closed:
                    // a finished stream resolves to `None` immediately on every poll,
                    // so continuing would spin without ever observing an event.
                    _ = events.next() => {
                        break;
                    }
                }

                index = (index + 1) % WAIT_TIMEOUTS.len();
            }
        };

        match timeout {
            Some(timeout) => {
                // An elapsed timeout is an expected outcome, not an error.
                let _ = tokio::time::timeout(timeout, wait_until).await;
            }
            _ => wait_until.await,
        }
    }
}
381
/// Wait for a selector to be present on the page.
///
/// Repeatedly races `page.find_element(selector)` against a short sleep taken from
/// `WAIT_TIMEOUTS`, cycling through the intervals, until a lookup succeeds.
///
/// Returns `true` once the element is found and `false` if `timeout` elapses first.
/// Without a timeout the function keeps polling until the element is found.
#[cfg(feature = "chrome")]
pub async fn wait_for_selector(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) -> bool {
    let poll_for_element = async {
        let mut step = 0;

        loop {
            let pause = tokio::time::sleep(tokio::time::Duration::from_millis(WAIT_TIMEOUTS[step]));

            let found = tokio::select! {
                _ = pause => false,
                lookup = page.find_element(selector) => lookup.is_ok(),
            };

            if found {
                break true;
            }

            step = (step + 1) % WAIT_TIMEOUTS.len();
        }
    };

    match timeout {
        // An elapsed timeout means the element never showed up.
        Some(limit) => tokio::time::timeout(limit, poll_for_element)
            .await
            .unwrap_or(false),
        None => poll_for_element.await,
    }
}
422
/// Wait for the dom to finish updating the target selector.
///
/// Evaluates a generated script for `selector` in a loop. A `true` result ends the
/// wait; a `false` result sleeps for the next `DOM_WAIT_TIMEOUTS` interval and pushes
/// the inner deadline forward. Failed evaluations and non-boolean results are retried
/// without sleeping.
///
/// The whole wait is capped by `timeout` (500ms when `None`), enforced by the outer
/// `tokio::time::timeout`.
#[cfg(feature = "chrome")]
pub async fn wait_for_dom(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) {
    let max_duration = timeout.unwrap_or_else(|| core::time::Duration::from_millis(500));
    let mut deadline = tokio::time::Instant::now() + max_duration;

    // Saturate instead of silently wrapping when the duration exceeds `u32::MAX` ms.
    let max_duration_ms = u32::try_from(max_duration.as_millis()).unwrap_or(u32::MAX);

    let script = crate::features::chrome_common::generate_wait_for_dom_js_code_with_selector_base(
        max_duration_ms,
        selector,
    );

    let wait_until = async {
        let mut index = 0;

        loop {
            if tokio::time::Instant::now() >= deadline {
                break;
            }

            let current_timeout = DOM_WAIT_TIMEOUTS[index];

            let settled = page
                .evaluate(script.clone())
                .await
                .ok()
                .and_then(|result| result.value().and_then(|value| value.as_bool()));

            match settled {
                Some(true) => break,
                Some(false) => {
                    tokio::time::sleep(tokio::time::Duration::from_millis(current_timeout)).await;
                    deadline = tokio::time::Instant::now() + max_duration;
                }
                // Evaluation failed or did not yield a boolean: try again.
                None => (),
            }

            // Wrap using the table that is actually indexed above.
            index = (index + 1) % DOM_WAIT_TIMEOUTS.len();
        }
    };

    let _ = tokio::time::timeout(max_duration, wait_until).await;
}
470
/// Get the output path of a screenshot and create any parent folders if needed.
///
/// The file name is `target_url` percent-encoded (every non-alphanumeric byte is
/// escaped) followed by `format`, joined onto `base`. Failure to create the parent
/// directory is ignored. Returns the resulting path as a display string.
#[cfg(feature = "chrome")]
pub async fn create_output_path(
    base: &std::path::PathBuf,
    target_url: &str,
    format: &str,
) -> String {
    let encoded_url = percent_encoding::percent_encode(
        target_url.as_bytes(),
        percent_encoding::NON_ALPHANUMERIC,
    )
    .to_string();
    let file_name = string_concat!(&encoded_url, format);
    let output = base.join(&file_name);

    if let Some(parent_dir) = output.parent() {
        let _ = tokio::fs::create_dir_all(&parent_dir).await;
    }

    output.display().to_string()
}
495
#[cfg(feature = "chrome")]
/// Wait for page events, in this order, skipping any step that is not configured:
/// 1. Idle network (a loading-finished event).
/// 2. A selector to appear.
/// 3. The dom element to finish updating.
/// 4. A hard delay.
///
/// Does nothing when `wait_for` is `None`.
pub async fn page_wait(
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
) {
    let wait_for = match wait_for {
        Some(config) => config,
        None => return,
    };

    if let Some(idle) = wait_for.idle_network.as_ref() {
        wait_for_event::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
            page,
            idle.timeout,
        )
        .await;
    }

    if let Some(target) = wait_for.selector.as_ref() {
        wait_for_selector(page, target.timeout, &target.selector).await;
    }

    if let Some(target) = wait_for.dom.as_ref() {
        wait_for_dom(page, target.timeout, &target.selector).await;
    }

    if let Some(pause) = wait_for.delay.as_ref().and_then(|delay| delay.timeout) {
        tokio::time::sleep(pause).await
    }
}
530
#[derive(Debug, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg(feature = "openai")]
/// The json response from OpenAI.
pub struct JsonResponse {
    /// The content returned.
    content: Vec<String>,
    /// The js script for the browser.
    js: String,
    #[cfg_attr(feature = "serde", serde(default))]
    /// The AI failed to parse the data. Defaults to `None` when the key is missing
    /// during deserialization.
    error: Option<String>,
}
544
/// Handle the OpenAI credits used. This does nothing without 'openai' feature flag.
///
/// Appends `tokens_used` to the usage list on `page_response`, creating the list
/// first when none exists yet.
#[cfg(feature = "openai")]
pub fn handle_openai_credits(
    page_response: &mut PageResponse,
    tokens_used: crate::features::openai_common::OpenAIUsage,
) {
    page_response
        .openai_credits_used
        .get_or_insert_with(Vec::new)
        .push(tokens_used);
}
556
#[cfg(not(feature = "openai"))]
/// Handle the OpenAI credits used. This does nothing without 'openai' feature flag.
///
/// No-op variant compiled when the `openai` feature is disabled; both arguments are ignored.
pub fn handle_openai_credits(
    _page_response: &mut PageResponse,
    _tokens_used: crate::features::openai_common::OpenAIUsage,
) {
}
564
/// Handle extra OpenAI data used. This does nothing without 'openai' feature flag.
///
/// Builds an `AIResults` entry from the prompt, the parsed AI response `x` (content
/// entries have leading whitespace trimmed), the optional screenshot and the optional
/// error, then appends it to `page_response.extra_ai_data`, creating the list when
/// it does not exist yet.
#[cfg(feature = "openai")]
pub fn handle_extra_ai_data(
    page_response: &mut PageResponse,
    prompt: &str,
    x: JsonResponse,
    screenshot_output: Option<Vec<u8>>,
    error: Option<String>,
) {
    let content_output = x
        .content
        .iter()
        .map(|entry| entry.trim_start().into())
        .collect::<Vec<_>>();

    let ai_response = crate::page::AIResults {
        input: prompt.into(),
        js_output: x.js,
        content_output,
        screenshot_output,
        error,
    };

    page_response
        .extra_ai_data
        .get_or_insert_with(Vec::new)
        .push(ai_response);
}
591
/// Accepts different header types (for flexibility).
///
/// Lets header-based detection work on either a typed header map or a plain
/// string-keyed map without converting between the two.
pub enum HeaderSource<'a> {
    /// From reqwest or internal HeaderMap.
    HeaderMap(&'a crate::client::header::HeaderMap),
    /// From a string-based HashMap. Keys are looked up exactly as stored.
    Map(&'a std::collections::HashMap<String, String>),
}
599
600/// Detect from headers.
601pub fn detect_anti_bot_from_headers(headers: &HeaderSource) -> Option<AntiBotTech> {
602    macro_rules! has_key {
603        ($key:expr) => {
604            match headers {
605                HeaderSource::HeaderMap(hm) => hm.contains_key($key),
606                HeaderSource::Map(map) => map.contains_key($key),
607            }
608        };
609    }
610
611    if has_key!("cf-chl-bypass") || has_key!("cf-ray") {
612        return Some(AntiBotTech::Cloudflare);
613    }
614    if has_key!("x-captcha-endpoint") {
615        return Some(AntiBotTech::DataDome);
616    }
617    if has_key!("x-perimeterx") || has_key!("pxhd") {
618        return Some(AntiBotTech::PerimeterX);
619    }
620    if has_key!("x-akamaibot") {
621        return Some(AntiBotTech::AkamaiBotManager);
622    }
623    if has_key!("x-imperva-id") || has_key!("x-iinfo") {
624        return Some(AntiBotTech::Imperva);
625    }
626    if has_key!("x-reblaze-uuid") {
627        return Some(AntiBotTech::Reblaze);
628    }
629
630    None
631}
632
633/// Detect the anti-bot technology.
634pub fn detect_anti_bot_from_body(body: &Vec<u8>) -> Option<AntiBotTech> {
635    // Scan body for anti-bot fingerprints (only for small pages)
636    if body.len() < 30_000 {
637        if let Ok(finder) = AC_BODY_SCAN.try_find_iter(body) {
638            for mat in finder {
639                match mat.pattern().as_usize() {
640                    0 => return Some(AntiBotTech::Cloudflare),
641                    1 | 2 => return Some(AntiBotTech::DataDome),
642                    3 => return Some(AntiBotTech::PerimeterX),
643                    4 => return Some(AntiBotTech::ArkoseLabs),
644                    5 => return Some(AntiBotTech::Imperva),
645                    _ => (),
646                }
647            }
648        }
649    }
650
651    None
652}
653
654/// Detect antibot from url
655pub fn detect_antibot_from_url(url: &str) -> Option<AntiBotTech> {
656    if let Some(mat) = AC_URL_SCAN.find(url) {
657        let tech = match mat.pattern().as_usize() {
658            0 => AntiBotTech::Cloudflare,
659            1 | 2 => AntiBotTech::DataDome,
660            3 | 4 => AntiBotTech::PerimeterX,
661            5 | 6 => AntiBotTech::ArkoseLabs,
662            7 => AntiBotTech::Kasada,
663            8 | 9 => AntiBotTech::FingerprintJS,
664            10 | 11 => AntiBotTech::Imperva,
665            12 => AntiBotTech::RadwareBotManager,
666            13 => AntiBotTech::Reblaze,
667            14 => AntiBotTech::CHEQ,
668            _ => return None,
669        };
670        Some(tech)
671    } else {
672        None
673    }
674}
675/// Detect the anti-bot used from the request.
676pub fn detect_anti_bot_tech_response(
677    url: &str,
678    headers: &HeaderSource,
679    body: &Vec<u8>,
680    subject_name: Option<&str>,
681) -> AntiBotTech {
682    // Check by TLS subject (Chrome/CDP TLS details)
683    if let Some(subject) = subject_name {
684        if subject == "challenges.cloudflare.com" {
685            return AntiBotTech::Cloudflare;
686        }
687    }
688
689    if let Some(tech) = detect_anti_bot_from_headers(headers) {
690        return tech;
691    }
692
693    if let Some(tech) = detect_antibot_from_url(url) {
694        return tech;
695    }
696
697    if let Some(anti_bot) = detect_anti_bot_from_body(body) {
698        return anti_bot;
699    }
700
701    AntiBotTech::None
702}
703
/// Extract to JsonResponse struct. This does nothing without 'openai' feature flag.
///
/// Returns `Some` with the parsed value when `js` is valid JSON for `JsonResponse`,
/// and `None` on any parse error.
#[cfg(feature = "openai")]
pub fn handle_ai_data(js: &str) -> Option<JsonResponse> {
    serde_json::from_str::<JsonResponse>(js).ok()
}
712
#[cfg(feature = "chrome")]
#[derive(Default, Clone, Debug)]
/// The chrome HTTP response: a summary of the main navigation request and its response.
pub struct ChromeHTTPReqRes {
    /// Is the request blocked by a firewall?
    pub waf_check: bool,
    /// The HTTP status code.
    pub status_code: StatusCode,
    /// The HTTP method of the request.
    pub method: String,
    /// The HTTP response headers for the request.
    pub response_headers: std::collections::HashMap<String, String>,
    /// The HTTP request headers for the request.
    pub request_headers: std::collections::HashMap<String, String>,
    /// The HTTP protocol of the request.
    pub protocol: String,
    /// The anti-bot tech used.
    pub anti_bot_tech: crate::page::AntiBotTech,
}
732
#[cfg(feature = "chrome")]
/// Perform a http future with chrome.
///
/// Navigates `page` to `source` and summarizes the main request and response: method,
/// request/response headers, protocol, status code, detected anti-bot technology and
/// whether a firewall (WAF) appears to have intercepted the navigation.
///
/// Defaults when the navigation yields no request data: `GET`, `http/1.1`, `200 OK`,
/// empty headers and no WAF.
///
/// # Errors
///
/// Returns the `CdpError` raised while resolving the main frame, creating the
/// navigation future, or awaiting it.
pub async fn perform_chrome_http_request(
    page: &chromiumoxide::Page,
    source: &str,
) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
    /// Strip the surrounding JSON quoting from a stored header value.
    ///
    /// Header values are stored through the JSON value's `to_string()`, which wraps
    /// string values in double quotes, so they must be unwrapped before comparing.
    fn unquote_header_value(value: &str) -> &str {
        value.trim_matches(|c| c == '"' || c == '\\')
    }

    let mut waf_check = false;
    let mut status_code = StatusCode::OK;
    let mut method = String::from("GET");
    let mut response_headers: std::collections::HashMap<String, String> =
        std::collections::HashMap::default();
    let mut request_headers = std::collections::HashMap::default();
    let mut protocol = String::from("http/1.1");
    let mut anti_bot_tech = AntiBotTech::default();

    let frame_id = page.mainframe().await?;

    let page_base =
        page.http_future(chromiumoxide::cdp::browser_protocol::page::NavigateParams {
            url: source.to_string(),
            transition_type: Some(
                chromiumoxide::cdp::browser_protocol::page::TransitionType::Other,
            ),
            frame_id,
            referrer: None,
            referrer_policy: None,
        })?;

    if let Some(http_request) = page_base.await? {
        if let Some(http_method) = http_request.method.as_deref() {
            method = http_method.into();
        }

        request_headers.clone_from(&http_request.headers);

        if let Some(ref response) = http_request.response {
            if let Some(ref p) = response.protocol {
                protocol.clone_from(p);
            }

            if let Some(res_headers) = response.headers.inner().as_object() {
                for (k, v) in res_headers {
                    response_headers.insert(k.to_string(), v.to_string());
                }
            }

            let mut firewall = false;

            // Only inspect for a WAF when the response did not come from the requested url.
            if !response.url.starts_with(source) {
                match &response.security_details {
                    Some(security_details) => {
                        anti_bot_tech = detect_anti_bot_tech_response(
                            &response.url,
                            &HeaderSource::Map(&response_headers),
                            &Default::default(),
                            Some(&security_details.subject_name),
                        );
                        firewall = true;
                    }
                    _ => {
                        anti_bot_tech = detect_anti_bot_tech_response(
                            &response.url,
                            &HeaderSource::Map(&response_headers),
                            &Default::default(),
                            None,
                        );
                        if anti_bot_tech == AntiBotTech::Cloudflare {
                            if let Some(xframe_options) = response_headers.get("x-frame-options") {
                                if unquote_header_value(xframe_options) == "DENY" {
                                    firewall = true;
                                }
                            } else if let Some(encoding) = response_headers.get("Accept-Encoding")
                            {
                                if unquote_header_value(encoding) == "cf-ray" {
                                    firewall = true;
                                }
                            }
                        } else {
                            firewall = true;
                        }
                    }
                };

                waf_check = firewall && !matches!(anti_bot_tech, AntiBotTech::None);

                if !waf_check {
                    waf_check = match response.protocol {
                        Some(ref protocol) => protocol == "blob",
                        _ => false,
                    }
                }
            }

            status_code = StatusCode::from_u16(response.status as u16)
                .unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
        } else if let Some(failure_text) = &http_request.failure_text {
            // No response at all: a generic network failure is treated as a WAF block.
            if failure_text == "net::ERR_FAILED" {
                waf_check = true;
            }
        }
    }

    Ok(ChromeHTTPReqRes {
        waf_check,
        status_code,
        method,
        response_headers,
        request_headers,
        protocol,
        anti_bot_tech,
    })
}
855
/// Use OpenAI to extend the crawl. This does nothing without 'openai' feature flag.
///
/// No-op variant compiled when `chrome` is enabled but `openai` is not; every argument
/// is ignored.
#[cfg(all(feature = "chrome", not(feature = "openai")))]
pub async fn run_openai_request(
    _source: &str,
    _page: &chromiumoxide::Page,
    _wait_for: &Option<crate::configuration::WaitFor>,
    _openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    _page_response: &mut PageResponse,
    _ok: bool,
) {
}
867
/// Use OpenAI to extend the crawl. Requires the 'openai' feature flag.
///
/// The configuration is resolved first: when `prompt_url_map` is set, the entry for
/// `source` is used, falling back to the entry for the path of `source` when
/// `paths_map` is enabled. When the map is set and neither lookup matches, nothing
/// is executed. Without a map the base configuration is used as is.
///
/// For every prompt of the resolved configuration:
/// * the model is queried when a model name is set and `ok` is true, otherwise a
///   default (empty) result is used;
/// * the token usage is forwarded to `handle_openai_credits`;
/// * a non-empty script is evaluated on `page` and, when the evaluation yields html,
///   `page_response.content` is replaced;
/// * with `extra_ai_data` enabled an optional screenshot is taken and the result is
///   attached through `handle_extra_ai_data`.
#[cfg(all(feature = "chrome", feature = "openai"))]
pub async fn run_openai_request(
    source: &str,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    mut page_response: &mut PageResponse,
    ok: bool,
) {
    // nothing to run without a configuration.
    let base_configs = match openai_config {
        Some(configs) => configs,
        _ => return,
    };

    // a prompt map narrows the configuration down to the one matching the url (or its path).
    let resolved_configs = match base_configs.prompt_url_map {
        Some(ref url_map) => {
            let exact_match =
                url_map.get::<case_insensitive_string::CaseInsensitiveString>(&source.into());

            if exact_match.is_none() && base_configs.paths_map {
                url_map.get::<case_insensitive_string::CaseInsensitiveString>(
                    &get_path_from_url(&source).into(),
                )
            } else {
                exact_match
            }
        }
        _ => Some(base_configs),
    };

    let gpt_configs = match resolved_configs {
        Some(configs) => configs,
        _ => return,
    };

    let mut prompts = gpt_configs.prompt.clone();

    while let Some(prompt) = prompts.next() {
        let gpt_results = if !gpt_configs.model.is_empty() && ok {
            let page_html = match page_response.content.as_ref() {
                Some(html) => auto_encoder::auto_encode_bytes(html),
                _ => Default::default(),
            };

            openai_request(gpt_configs, page_html, &source, &prompt).await
        } else {
            Default::default()
        };

        let generated_js = gpt_results.response;
        let tokens_used = gpt_results.usage;
        let gpt_error = gpt_results.error;

        // set the credits used for the request
        handle_openai_credits(&mut page_response, tokens_used);

        let json_res = if gpt_configs.extra_ai_data {
            handle_ai_data(&generated_js).unwrap_or_else(|| {
                let mut fallback = JsonResponse::default();
                fallback.error = Some("An issue occured with serialization.".into());
                fallback
            })
        } else {
            // without extra data the raw model response is the script to run.
            let mut script_only = JsonResponse::default();
            script_only.js = generated_js;
            script_only
        };

        // perform the js script on the page.
        if !json_res.js.is_empty() {
            let evaluated = page
                .evaluate_function(string_concat!(
                    "async function() { ",
                    json_res.js,
                    "; return document.documentElement.outerHTML; }"
                ))
                .await;

            let html: Option<Box<Vec<u8>>> = match evaluated {
                Ok(result) => result.into_value().ok(),
                _ => None,
            };

            if html.is_some() {
                page_wait(&page, &wait_for).await;

                let short_location_script =
                    json_res.js.len() <= 400 && json_res.js.contains("window.location");

                if !short_location_script {
                    page_response.content = html;
                } else if let Ok(b) = page.outer_html_bytes().await {
                    // short scripts touching `window.location` re-read the html from the page.
                    page_response.content = Some(b.into());
                }
            }
        }

        // the remaining work only attaches data to the page.
        if !gpt_configs.extra_ai_data {
            continue;
        }

        let screenshot_bytes = if gpt_configs.screenshot && !json_res.js.is_empty() {
            let screenshot_params = chromiumoxide::page::ScreenshotParams::builder()
                .format(chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png)
                .full_page(true)
                .quality(45)
                .omit_background(false)
                .build();

            match page.screenshot(screenshot_params).await {
                Ok(b) => {
                    log::debug!("took screenshot: {:?}", source);
                    Some(b)
                }
                Err(e) => {
                    log::error!("failed to take screenshot: {:?} - {:?}", e, source);
                    None
                }
            }
        } else {
            None
        };

        handle_extra_ai_data(
            page_response,
            &prompt,
            json_res,
            screenshot_bytes,
            gpt_error,
        );
    }
}
1002
/// The HTTP protocol version of a response.
///
/// Marked `#[non_exhaustive]`, so code outside this crate has to keep a wildcard
/// arm when matching on it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum HttpVersion {
    /// HTTP/0.9
    Http09,
    /// HTTP/1.0
    Http10,
    /// HTTP/1.1
    Http11,
    /// HTTP/2
    H2,
    /// HTTP/3
    H3,
}
1018
/// A basic generic type that represents an HTTP response.
///
/// Holds the body, headers, status, url and protocol version of a response.
/// `put_hybrid_cache` takes this type as the response to store.
#[derive(Debug, Clone)]
pub struct HttpResponse {
    /// HTTP response body as raw bytes
    pub body: Vec<u8>,
    /// HTTP response headers as a name to value map
    pub headers: std::collections::HashMap<String, String>,
    /// HTTP response status code
    pub status: u16,
    /// HTTP response url
    pub url: url::Url,
    /// HTTP response version
    pub version: HttpVersion,
}
1033
/// A HTTP request type for caching.
///
/// Implements `RequestLike` from `http_cache_semantics` so it can be handed to a
/// cache policy as the request side. Only compiled with the `cache_chrome_hybrid` feature.
#[cfg(feature = "cache_chrome_hybrid")]
pub struct HttpRequestLike {
    /// The URI component of a request.
    pub uri: http::uri::Uri,
    /// The http method.
    pub method: reqwest::Method,
    /// The http headers.
    pub headers: http::HeaderMap,
}
1044
#[cfg(feature = "cache_chrome_hybrid")]
/// A HTTP response type for caching.
///
/// Implements `ResponseLike` from `http_cache_semantics` so it can be handed to a
/// cache policy as the response side. Only compiled with the `cache_chrome_hybrid` feature.
pub struct HttpResponseLike {
    /// The http status code.
    pub status: StatusCode,
    /// The http headers.
    pub headers: http::HeaderMap,
}
1053
// Exposes the stored request parts through the `RequestLike` trait.
#[cfg(feature = "cache_chrome_hybrid")]
impl RequestLike for HttpRequestLike {
    /// Returns a clone of the stored uri.
    fn uri(&self) -> http::uri::Uri {
        self.uri.clone()
    }
    /// Returns true when `other` is equal to the stored uri.
    fn is_same_uri(&self, other: &http::Uri) -> bool {
        &self.uri == other
    }
    /// Returns the stored http method.
    fn method(&self) -> &reqwest::Method {
        &self.method
    }
    /// Returns the stored request headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
}
1069
// Exposes the stored response parts through the `ResponseLike` trait.
#[cfg(feature = "cache_chrome_hybrid")]
impl ResponseLike for HttpResponseLike {
    /// Returns the stored status code.
    fn status(&self) -> StatusCode {
        self.status
    }
    /// Returns the stored response headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
}
1079
/// Convert a string keyed header collection into a `HeaderMap`.
///
/// Entries whose name or value cannot be parsed into a valid header are dropped.
/// Only the first 1002 entries of the iteration are looked at, which bounds the work
/// done for malformed or oversized header sets.
#[cfg(any(
    feature = "cache_chrome_hybrid",
    feature = "headers",
    feature = "cookies"
))]
pub fn convert_headers(
    headers: &std::collections::HashMap<String, String>,
) -> reqwest::header::HeaderMap {
    // number of entries inspected before giving up on a mal header set.
    const MAX_INSPECTED_HEADERS: usize = 1002;

    let mut header_map = reqwest::header::HeaderMap::new();

    for (name, value) in headers.iter().take(MAX_INSPECTED_HEADERS) {
        let parsed_value = match reqwest::header::HeaderValue::from_str(value) {
            Ok(parsed) => parsed,
            Err(_) => continue,
        };

        if let Ok(parsed_name) = name.parse::<reqwest::header::HeaderName>() {
            header_map.insert(parsed_name, parsed_value);
        }
    }

    header_map
}
1106
#[cfg(feature = "cache_chrome_hybrid")]
/// Store the page to cache to be re-used across HTTP request.
///
/// A cache policy is built from the request (`method`, `http_request_headers` and the
/// url of `http_response`) and from the response (status and headers of
/// `http_response`). The response is then written to the cache manager under
/// `cache_key` together with that policy.
///
/// * `cache_key` - the key the entry is stored under.
/// * `http_response` - the response to store.
/// * `method` - the request method, `GET` is used when it cannot be parsed.
/// * `http_request_headers` - the headers that were sent with the request.
///
/// Nothing is stored when the response url cannot be parsed as a `http::Uri`, and
/// errors reported by the cache manager are ignored.
pub async fn put_hybrid_cache(
    cache_key: &str,
    http_response: HttpResponse,
    method: &str,
    http_request_headers: std::collections::HashMap<String, String>,
) {
    use crate::http_cache_reqwest::CacheManager;
    use http_cache_semantics::CachePolicy;

    if let Ok(u) = http_response.url.as_str().parse::<http::uri::Uri>() {
        let req = HttpRequestLike {
            uri: u,
            method: reqwest::Method::from_bytes(method.as_bytes())
                .unwrap_or(reqwest::Method::GET),
            // the request side of the policy is built from the headers that were sent.
            headers: convert_headers(&http_request_headers),
        };

        let res = HttpResponseLike {
            status: StatusCode::from_u16(http_response.status)
                .unwrap_or(StatusCode::EXPECTATION_FAILED),
            // the response side carries the cache directives returned by the server.
            headers: convert_headers(&http_response.headers),
        };

        let policy = CachePolicy::new(&req, &res);

        // storing is best effort, a failed write only means the page is not cached.
        let _ = crate::website::CACACHE_MANAGER
            .put(
                cache_key.into(),
                http_cache_reqwest::HttpResponse {
                    url: http_response.url,
                    body: http_response.body,
                    headers: http_response.headers,
                    version: match http_response.version {
                        HttpVersion::H2 => http_cache::HttpVersion::H2,
                        HttpVersion::Http10 => http_cache::HttpVersion::Http10,
                        HttpVersion::H3 => http_cache::HttpVersion::H3,
                        HttpVersion::Http09 => http_cache::HttpVersion::Http09,
                        HttpVersion::Http11 => http_cache::HttpVersion::Http11,
                    },
                    status: http_response.status,
                },
                policy,
            )
            .await;
    }
}
1158
#[cfg(not(feature = "cache_chrome_hybrid"))]
/// Store the page to cache to be re-used across HTTP request.
///
/// This variant is compiled when the `cache_chrome_hybrid` feature is disabled.
/// Every argument is ignored and nothing is stored.
pub async fn put_hybrid_cache(
    _cache_key: &str,
    _http_response: HttpResponse,
    _method: &str,
    _http_request_headers: std::collections::HashMap<String, String>,
) {
}
1168
1169/// Subtract the duration with overflow handling.
1170#[cfg(feature = "chrome")]
fn sub_duration(
    base_timeout: std::time::Duration,
    elapsed: std::time::Duration,
) -> std::time::Duration {
    // Returns the time left of `base_timeout` once `elapsed` has passed. The
    // subtraction saturates, so a zero duration is returned (instead of a panic or
    // a wrapped value) when `elapsed` is greater than `base_timeout`.
    base_timeout.saturating_sub(elapsed)
}
1180
/// Navigate the page to `url` and record the outcome of the initial request.
///
/// On success `chrome_http_req_res` is overwritten with the result of
/// `perform_chrome_http_request`. On failure the error is returned and
/// `chrome_http_req_res` is left untouched.
#[cfg(feature = "chrome")]
async fn navigate(
    page: &chromiumoxide::Page,
    url: &str,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
) -> Result<(), chromiumoxide::error::CdpError> {
    let request_result = perform_chrome_http_request(page, url).await?;

    *chrome_http_req_res = request_result;

    Ok(())
}
1191
#[cfg(all(feature = "real_browser", feature = "chrome"))]
/// Move the mouse across the page along randomly generated coordinates.
///
/// The coordinates are generated for the size of `viewport`, or for 1280x720 when no
/// viewport is set. Errors of the individual mouse moves are ignored.
async fn perform_smart_mouse_movement(
    page: &chromiumoxide::Page,
    viewport: &Option<crate::configuration::Viewport>,
) {
    use crate::features::chrome_mouse_movements::GaussianMouse;
    use chromiumoxide::layout::Point;

    let (width, height) = viewport
        .as_ref()
        .map(|vp| (vp.width as f64, vp.height as f64))
        .unwrap_or((1280.0, 720.0));

    let coordinates = GaussianMouse::generate_random_coordinates(width, height);

    for (x, y) in coordinates {
        // a failed move is not fatal, continue with the next coordinate.
        let _ = page.move_mouse(Point::new(x, y)).await;
    }
}
1208
/// Mouse movement placeholder used when the `real_browser` feature is disabled.
/// Both arguments are ignored and nothing is performed on the page.
#[cfg(all(not(feature = "real_browser"), feature = "chrome"))]
async fn perform_smart_mouse_movement(
    _page: &chromiumoxide::Page,
    _viewport: &Option<crate::configuration::Viewport>,
) {
}
1215
/// Cache the chrome response.
///
/// Builds a `HttpResponse` from the page content and the captured chrome request and
/// response data, then stores it through `put_hybrid_cache` under the key
/// `GET:<target_url>`. Nothing is cached when `target_url` is not a valid url.
/// Protocols that are not recognized are stored as HTTP/1.1.
#[cfg(all(feature = "chrome", feature = "cache_chrome_hybrid"))]
pub async fn cache_chrome_response(
    target_url: &str,
    page_response: &PageResponse,
    chrome_http_req_res: ChromeHTTPReqRes,
) {
    let parsed_url = match url::Url::parse(target_url) {
        Ok(parsed) => parsed,
        _ => return,
    };

    let body: Vec<u8> = match page_response.content.as_ref() {
        Some(b) => b.into(),
        _ => Default::default(),
    };

    let version = match chrome_http_req_res.protocol.as_str() {
        "http/0.9" => HttpVersion::Http09,
        "http/1" | "http/1.0" => HttpVersion::Http10,
        "http/1.1" => HttpVersion::Http11,
        "http/2.0" | "http/2" => HttpVersion::H2,
        "http/3.0" | "http/3" => HttpVersion::H3,
        _ => HttpVersion::Http11,
    };

    let http_response = HttpResponse {
        url: parsed_url,
        body,
        status: chrome_http_req_res.status_code.into(),
        version,
        headers: chrome_http_req_res.response_headers,
    };

    let cache_key = string_concat!("GET", ":", target_url);

    put_hybrid_cache(
        &cache_key,
        http_response,
        "GET",
        chrome_http_req_res.request_headers,
    )
    .await;
}
1250
/// Cache the chrome response
///
/// This variant is compiled when the `chrome` feature is enabled and the
/// `cache_chrome_hybrid` feature is disabled. Every argument is ignored and nothing
/// is cached.
#[cfg(all(feature = "chrome", not(feature = "cache_chrome_hybrid")))]
pub async fn cache_chrome_response(
    _target_url: &str,
    _page_response: &PageResponse,
    _chrome_http_req_res: ChromeHTTPReqRes,
) {
}
1259
/// Five minutes expressed in milliseconds.
pub(crate) const FIVE_MINUTES: u32 = 5 * 60 * 1_000;
1262
/// Max page timeout for events.
///
/// Equal to `FIVE_MINUTES` milliseconds.
#[cfg(feature = "chrome")]
const MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64);
/// Half of the max timeout
///
/// The cast binds tighter than the division, so `FIVE_MINUTES` is widened to `u64`
/// before it is halved.
#[cfg(feature = "chrome")]
const HALF_MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64 / 2);
1271
#[cfg(all(feature = "chrome", feature = "headers"))]
/// Copy the headers of the page response into the chrome request/response record.
///
/// When `page_response` carries headers they are converted with
/// `header_map_to_hash_map` and replace `chrome_http_req_res.response_headers`.
/// Without headers the record is left unchanged.
fn store_headers(page_response: &PageResponse, chrome_http_req_res: &mut ChromeHTTPReqRes) {
    match page_response.headers.as_ref() {
        Some(response_headers) => {
            chrome_http_req_res.response_headers =
                crate::utils::header_utils::header_map_to_hash_map(response_headers);
        }
        None => (),
    }
}
1279
/// Header storing placeholder used when the `headers` feature is disabled.
/// Both arguments are ignored and nothing is modified.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn store_headers(_page_response: &PageResponse, _chrome_http_req_res: &mut ChromeHTTPReqRes) {}
1282
1283#[cfg(feature = "chrome")]
1284/// Perform a network request to a resource extracting all content as text streaming via chrome.
1285pub async fn fetch_page_html_chrome_base(
1286    source: &str,
1287    page: &chromiumoxide::Page,
1288    content: bool,
1289    wait_for_navigation: bool,
1290    wait_for: &Option<crate::configuration::WaitFor>,
1291    screenshot: &Option<crate::configuration::ScreenShotConfig>,
1292    page_set: bool,
1293    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
1294    url_target: Option<&str>,
1295    execution_scripts: &Option<ExecutionScripts>,
1296    automation_scripts: &Option<AutomationScripts>,
1297    viewport: &Option<crate::configuration::Viewport>,
1298    request_timeout: &Option<Box<std::time::Duration>>,
1299    track_events: &Option<crate::configuration::ChromeEventTracker>,
1300) -> Result<PageResponse, chromiumoxide::error::CdpError> {
1301    use crate::page::{is_asset_url, DOWNLOADABLE_MEDIA_TYPES, UNKNOWN_STATUS_ERROR};
1302    use chromiumoxide::{
1303        cdp::browser_protocol::network::{
1304            EventLoadingFailed, EventRequestWillBeSent, EventResponseReceived,
1305            GetResponseBodyParams, RequestId, ResourceType,
1306        },
1307        error::CdpError,
1308    };
1309    use hashbrown::HashMap;
1310    use std::time::Duration;
1311    use tokio::{
1312        sync::{oneshot, OnceCell},
1313        time::Instant,
1314    };
1315
1316    #[derive(Debug, Clone, Default)]
1317    /// Map of the response.
1318    struct ResponseMap {
1319        /// The url of the request
1320        url: String,
1321        /// The network request was skipped.
1322        skipped: bool,
1323        /// The bytes transferred
1324        bytes_transferred: f64,
1325    }
1326
1327    #[derive(Debug, Clone, Default)]
1328    struct ResponseBase {
1329        /// The map of the response.
1330        response_map: Option<HashMap<String, ResponseMap>>,
1331        /// The headers of request.
1332        headers: Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
1333        /// The status code.
1334        status_code: Option<i64>,
1335    }
1336
1337    let mut chrome_http_req_res = ChromeHTTPReqRes::default();
1338
1339    // the base networking timeout to prevent any hard hangs.
1340    let mut base_timeout = match request_timeout {
1341        Some(timeout) => **timeout.min(&Box::new(MAX_PAGE_TIMEOUT)),
1342        _ => MAX_PAGE_TIMEOUT,
1343    };
1344
1345    // track the initial base without modifying.
1346    let base_timeout_measurement = base_timeout;
1347    let target_url = url_target.unwrap_or(source);
1348    let asset = is_asset_url(target_url);
1349
1350    let (tx1, rx1) = if asset {
1351        let c = oneshot::channel::<Option<RequestId>>();
1352
1353        (Some(c.0), Some(c.1))
1354    } else {
1355        (None, None)
1356    };
1357
1358    let (track_requests, track_responses) = match track_events {
1359        Some(tracker) => (tracker.requests, tracker.responses),
1360        _ => (false, false),
1361    };
1362
1363    let (event_loading_listener, cancel_listener, received_listener, event_sent_listener) = tokio::join!(
1364        page.event_listener::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
1365        ),
1366        page.event_listener::<EventLoadingFailed>(),
1367        async {
1368            if asset || track_responses {
1369                page.event_listener::<EventResponseReceived>().await
1370            } else {
1371                Err(CdpError::NotFound)
1372            }
1373        },
1374        async {
1375            if track_requests {
1376                page.event_listener::<EventRequestWillBeSent>().await
1377            } else {
1378                Err(CdpError::NotFound)
1379            }
1380        },
1381    );
1382
1383    let (tx, rx) = oneshot::channel::<bool>();
1384
1385    // Listen for network events. todo: capture the last values endtime to track period.
1386    let bytes_collected_handle = tokio::spawn(async move {
1387        let finished_media: Option<OnceCell<RequestId>> =
1388            if asset { Some(OnceCell::new()) } else { None };
1389
1390        let f1 = async {
1391            let mut total = 0.0;
1392
1393            let mut response_map: Option<HashMap<String, f64>> = if track_responses {
1394                Some(HashMap::new())
1395            } else {
1396                None
1397            };
1398
1399            if let Ok(mut listener) = event_loading_listener {
1400                if asset {
1401                    while let Some(event) = listener.next().await {
1402                        total += event.encoded_data_length;
1403
1404                        if let Some(response_map) = response_map.as_mut() {
1405                            response_map
1406                                .entry(event.request_id.inner().clone())
1407                                .and_modify(|e| *e += event.encoded_data_length)
1408                                .or_insert(event.encoded_data_length);
1409                        }
1410
1411                        if let Some(once) = &finished_media {
1412                            if let Some(request_id) = once.get() {
1413                                if request_id == &event.request_id {
1414                                    if let Some(tx1) = tx1 {
1415                                        let _ = tx1.send(Some(request_id.clone()));
1416                                        break;
1417                                    }
1418                                }
1419                            }
1420                        }
1421                    }
1422                } else {
1423                    while let Some(event) = listener.next().await {
1424                        total += event.encoded_data_length;
1425
1426                        if let Some(response_map) = response_map.as_mut() {
1427                            response_map
1428                                .entry(event.request_id.inner().clone())
1429                                .and_modify(|e| *e += event.encoded_data_length)
1430                                .or_insert(event.encoded_data_length);
1431                        }
1432                    }
1433                }
1434            }
1435
1436            (total, response_map)
1437        };
1438
1439        let f2 = async {
1440            if let Ok(mut listener) = cancel_listener {
1441                let mut net_aborted = false;
1442
1443                while let Some(event) = listener.next().await {
1444                    if event.r#type == ResourceType::Document
1445                        && event.error_text == "net::ERR_ABORTED"
1446                        && event.canceled.unwrap_or_default()
1447                    {
1448                        net_aborted = true;
1449                        break;
1450                    }
1451                }
1452
1453                if net_aborted {
1454                    let _ = tx.send(true);
1455                }
1456            }
1457        };
1458
1459        let f3 = async {
1460            let mut response_map: Option<HashMap<String, ResponseMap>> = if track_responses {
1461                Some(HashMap::new())
1462            } else {
1463                None
1464            };
1465
1466            let mut status_code = None;
1467            let mut headers = None;
1468
1469            if asset || response_map.is_some() {
1470                if let Ok(mut listener) = received_listener {
1471                    let mut initial_asset = false;
1472                    let mut allow_download = false;
1473                    let mut intial_request = false;
1474
1475                    while let Some(event) = listener.next().await {
1476                        if !intial_request && event.r#type == ResourceType::Document {
1477                            let redirect =
1478                                event.response.status >= 300 && event.response.status <= 399;
1479
1480                            if !redirect {
1481                                intial_request = true;
1482                                status_code = Some(event.response.status);
1483                                headers = Some(event.response.headers.clone());
1484                            }
1485                        }
1486                        // check if media asset needs to be downloaded.
1487                        if asset {
1488                            if !initial_asset && event.r#type == ResourceType::Document {
1489                                allow_download =
1490                                    DOWNLOADABLE_MEDIA_TYPES.contains(&event.response.mime_type);
1491                            }
1492                            if event.r#type == ResourceType::Media && allow_download {
1493                                if let Some(once) = &finished_media {
1494                                    let _ = once.set(event.request_id.clone());
1495                                }
1496                            }
1497                            initial_asset = true;
1498                        }
1499
1500                        if let Some(response_map) = response_map.as_mut() {
1501                            response_map.insert(
1502                                event.request_id.inner().clone(),
1503                                ResponseMap {
1504                                    url: event.response.url.clone(),
1505                                    bytes_transferred: event.response.encoded_data_length,
1506                                    skipped: *MASK_BYTES_INTERCEPTION
1507                                        && event.response.connection_id == 0.0
1508                                        && event.response.encoded_data_length <= 17.0,
1509                                },
1510                            );
1511                        }
1512                    }
1513                }
1514            }
1515
1516            ResponseBase {
1517                response_map,
1518                status_code,
1519                headers,
1520            }
1521        };
1522
1523        let f4 = async {
1524            let mut request_map: Option<HashMap<String, f64>> = if track_requests {
1525                Some(HashMap::new())
1526            } else {
1527                None
1528            };
1529
1530            if request_map.is_some() {
1531                if let Some(response_map) = request_map.as_mut() {
1532                    if let Ok(mut listener) = event_sent_listener {
1533                        while let Some(event) = listener.next().await {
1534                            response_map
1535                                .insert(event.request.url.clone(), *event.timestamp.inner());
1536                        }
1537                    }
1538                }
1539            }
1540
1541            request_map
1542        };
1543
1544        let (t1, _, res_map, req_map) = tokio::join!(f1, f2, f3, f4);
1545
1546        (t1.0, t1.1, res_map, req_map)
1547    });
1548
1549    let mut block_bytes = false;
1550
1551    let page_navigation = async {
1552        if !page_set {
1553            // used for smart mode re-rendering direct assigning html
1554            if content {
1555                if let Ok(frame) = page.mainframe().await {
1556                    let html = rewrite_base_tag(&source, &url_target).await;
1557
1558                    if let Err(e) = page
1559                        .execute(
1560                            chromiumoxide::cdp::browser_protocol::page::SetDocumentContentParams {
1561                                frame_id: frame.unwrap_or_default(),
1562                                html,
1563                            },
1564                        )
1565                        .await
1566                    {
1567                        log::info!(
1568                            "Set Content Error({:?}) - {:?}",
1569                            e,
1570                            &url_target.unwrap_or(source)
1571                        );
1572                        if let chromiumoxide::error::CdpError::Timeout = e {
1573                            block_bytes = true;
1574                        }
1575                    }
1576                }
1577            } else {
1578                if let Err(e) = navigate(page, source, &mut chrome_http_req_res).await {
1579                    log::info!(
1580                        "Navigation Error({:?}) - {:?}",
1581                        e,
1582                        &url_target.unwrap_or(source)
1583                    );
1584                    if let chromiumoxide::error::CdpError::Timeout = e {
1585                        block_bytes = true;
1586                    }
1587                    return Err(e);
1588                };
1589            }
1590        }
1591
1592        Ok(())
1593    };
1594
1595    let start_time = Instant::now();
1596
1597    let mut request_cancelled = false;
1598
1599    let page_navigate = async {
1600        if cfg!(feature = "real_browser") {
1601            let notify = tokio::sync::Notify::new();
1602
1603            let mouse_loop = async {
1604                let mut index = 0;
1605
1606                loop {
1607                    tokio::select! {
1608                        _ = notify.notified() => {
1609                            break;
1610                        }
1611                        _ = perform_smart_mouse_movement(&page, &viewport) => {
1612                            tokio::time::sleep(std::time::Duration::from_millis(WAIT_TIMEOUTS[index])).await;
1613                        }
1614                    }
1615
1616                    index = (index + 1) % WAIT_TIMEOUTS.len();
1617                }
1618            };
1619
1620            let navigation_loop = async {
1621                let result = page_navigation.await;
1622                notify.notify_waiters();
1623                result
1624            };
1625
1626            let (result, _) = tokio::join!(navigation_loop, mouse_loop);
1627
1628            result
1629        } else {
1630            page_navigation.await
1631        }
1632    };
1633
1634    tokio::select! {
1635        v = tokio::time::timeout(base_timeout + Duration::from_millis(50), page_navigate) => {
1636            if v.is_err() {
1637                request_cancelled = true;
1638            }
1639        }
1640        v = rx => {
1641            if let Ok(v) = v {
1642                request_cancelled = !v;
1643            }
1644        }
1645    };
1646
1647    base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
1648
1649    // we do not need to wait for navigation if content is assigned. The method set_content already handles this.
1650    let final_url = if wait_for_navigation && !request_cancelled && !block_bytes {
1651        let last_redirect = tokio::time::timeout(base_timeout, async {
1652            match page.wait_for_navigation_response().await {
1653                Ok(u) => get_last_redirect(&source, &u, &page).await,
1654                _ => None,
1655            }
1656        })
1657        .await;
1658        base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
1659        match last_redirect {
1660            Ok(last) => last,
1661            _ => None,
1662        }
1663    } else {
1664        None
1665    };
1666
1667    let chrome_http_req_res1 = if asset {
1668        Some(chrome_http_req_res.clone())
1669    } else {
1670        None
1671    };
1672
1673    let run_events = !base_timeout.is_zero()
1674        && !block_bytes
1675        && !request_cancelled
1676        && (!chrome_http_req_res.status_code.is_server_error()
1677            && !chrome_http_req_res.status_code.is_client_error()
1678            || chrome_http_req_res.status_code == *UNKNOWN_STATUS_ERROR
1679            || chrome_http_req_res.status_code == 404
1680            || chrome_http_req_res.status_code == 403
1681            || chrome_http_req_res.status_code.is_redirection()
1682            || chrome_http_req_res.status_code.is_success());
1683
1684    block_bytes = chrome_http_req_res.status_code == StatusCode::REQUEST_TIMEOUT;
1685
1686    let waf_check = chrome_http_req_res.waf_check;
1687    let mut status_code = chrome_http_req_res.status_code;
1688    let mut anti_bot_tech = chrome_http_req_res.anti_bot_tech;
1689    let mut validate_cf = false;
1690
1691    let run_page_response = async move {
1692        let mut page_response = if run_events {
1693            if waf_check {
1694                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
1695                if let Err(elasped) = tokio::time::timeout(
1696                    base_timeout,
1697                    perform_smart_mouse_movement(&page, &viewport),
1698                )
1699                .await
1700                {
1701                    log::warn!("mouse movement timeout exceeded {elasped}");
1702                }
1703            }
1704
1705            if wait_for.is_some() {
1706                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
1707                if let Err(elasped) =
1708                    tokio::time::timeout(base_timeout, page_wait(&page, &wait_for)).await
1709                {
1710                    log::warn!("max wait for timeout {elasped}");
1711                }
1712            }
1713
1714            base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
1715
1716            if execution_scripts.is_some() || automation_scripts.is_some() {
1717                let target_url = if final_url.is_some() {
1718                    match final_url.as_ref() {
1719                        Some(ref u) => u.to_string(),
1720                        _ => Default::default(),
1721                    }
1722                } else if url_target.is_some() {
1723                    url_target.unwrap_or_default().to_string()
1724                } else {
1725                    source.to_string()
1726                };
1727
1728                if let Err(elasped) = tokio::time::timeout(base_timeout, async {
1729                    tokio::join!(
1730                        crate::features::chrome_common::eval_execution_scripts(
1731                            &page,
1732                            &target_url,
1733                            &execution_scripts
1734                        ),
1735                        crate::features::chrome_common::eval_automation_scripts(
1736                            &page,
1737                            &target_url,
1738                            &automation_scripts
1739                        )
1740                    );
1741                })
1742                .await
1743                {
1744                    log::warn!("eval scripts timeout exceeded {elasped}");
1745                }
1746            }
1747
1748            let xml_target = match &final_url {
1749                Some(f) => f.ends_with(".xml"),
1750                _ => target_url.ends_with(".xml"),
1751            };
1752
1753            let page_fn = async {
1754                if xml_target {
1755                    match page.content_bytes_xml().await {
1756                        Ok(page_bytes) => {
1757                            if page_bytes.is_empty() {
1758                                page.outer_html_bytes().await
1759                            } else {
1760                                Ok(page_bytes)
1761                            }
1762                        }
1763                        _ => page.outer_html_bytes().await,
1764                    }
1765                } else {
1766                    page.outer_html_bytes().await
1767                }
1768            };
1769
1770            let results = tokio::time::timeout(base_timeout.max(HALF_MAX_PAGE_TIMEOUT), page_fn);
1771
1772            let mut res: Box<Vec<u8>> = match results.await {
1773                Ok(v) => v.map(Box::new).unwrap_or_default(),
1774                _ => Default::default(),
1775            };
1776
1777            let forbidden = waf_check && res.starts_with(b"<html><head>\n    <style global=") && res.ends_with(b";</script><iframe height=\"1\" width=\"1\" style=\"position: absolute; top: 0px; left: 0px; border: none; visibility: hidden;\"></iframe>\n\n</body></html>");
1778
1779            if cfg!(feature = "real_browser") {
1780                // we can skip this check after a set bytes
1781                if res.len() <= crate::page::TURNSTILE_WALL_PAGE_SIZE
1782                    && anti_bot_tech == AntiBotTech::Cloudflare
1783                    || waf_check
1784                {
1785                    // detect the turnstile page.
1786                    if detect_cf_turnstyle(&res) {
1787                        if let Err(_e) = tokio::time::timeout(base_timeout, async {
1788                            if let Ok(success) = cf_handle(&mut res, &page).await {
1789                                if success {
1790                                    status_code = StatusCode::OK;
1791                                }
1792                            }
1793                        })
1794                        .await
1795                        {
1796                            validate_cf = true;
1797                        }
1798                    }
1799                }
1800            };
1801
1802            let ok = !res.is_empty();
1803
1804            if validate_cf && ok {
1805                if !detect_cf_turnstyle(&res) && status_code == StatusCode::FORBIDDEN {
1806                    status_code = StatusCode::OK;
1807                }
1808            }
1809
1810            let mut page_response = set_page_response(
1811                ok,
1812                res,
1813                if forbidden {
1814                    StatusCode::FORBIDDEN
1815                } else {
1816                    status_code
1817                },
1818                final_url,
1819            );
1820
1821            base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
1822
1823            let _ = tokio::time::timeout(
1824                base_timeout,
1825                set_page_response_cookies(&mut page_response, &page),
1826            )
1827            .await;
1828
1829            if openai_config.is_some() && !base_timeout.is_zero() {
1830                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
1831
1832                let openai_request = run_openai_request(
1833                    match url_target {
1834                        Some(ref ut) => ut,
1835                        _ => source,
1836                    },
1837                    page,
1838                    wait_for,
1839                    openai_config,
1840                    &mut page_response,
1841                    ok,
1842                );
1843
1844                let _ = tokio::time::timeout(base_timeout, openai_request).await;
1845            }
1846
1847            if cfg!(feature = "chrome_screenshot") || screenshot.is_some() {
1848                let _ = tokio::time::timeout(
1849                    base_timeout + tokio::time::Duration::from_secs(30),
1850                    perform_screenshot(source, page, screenshot, &mut page_response),
1851                )
1852                .await;
1853            }
1854
1855            page_response
1856        } else {
1857            let res = if !block_bytes {
1858                let results = tokio::time::timeout(
1859                    base_timeout.max(HALF_MAX_PAGE_TIMEOUT),
1860                    page.outer_html_bytes(),
1861                );
1862
1863                match results.await {
1864                    Ok(v) => v.map(Box::new).unwrap_or_default(),
1865                    _ => Default::default(),
1866                }
1867            } else {
1868                Default::default()
1869            };
1870
1871            let mut page_response = set_page_response(!res.is_empty(), res, status_code, final_url);
1872
1873            if !block_bytes {
1874                let _ = tokio::time::timeout(
1875                    base_timeout,
1876                    set_page_response_cookies(&mut page_response, &page),
1877                )
1878                .await;
1879            }
1880
1881            if base_timeout.is_zero() && page_response.content.is_none() {
1882                page_response.status_code = StatusCode::REQUEST_TIMEOUT;
1883            }
1884
1885            page_response
1886        };
1887
1888        if content {
1889            if let Some(final_url) = &page_response.final_url {
1890                if final_url == "about:blank" {
1891                    page_response.final_url = None;
1892                }
1893            }
1894        }
1895
1896        page_response
1897    };
1898
1899    let mut content: Option<Box<Vec<u8>>> = None;
1900
1901    let page_response = match rx1 {
1902        Some(rx1) => {
1903            tokio::select! {
1904                v = tokio::time::timeout(base_timeout, run_page_response) => {
1905                    v.map_err(|_| CdpError::Timeout)
1906                }
1907                c = rx1 => {
1908                    if let Ok(c) = c {
1909                        if let Some(c) = c {
1910                            let params =
1911                            GetResponseBodyParams::new(c.clone());
1912
1913                            if let Ok(command_response) = page.execute(params).await {
1914                              let body_response = command_response;
1915
1916                              let media_file = if body_response.base64_encoded {
1917                                  chromiumoxide::utils::base64::decode(
1918                                      &body_response.body,
1919                                  )
1920                                  .unwrap_or_default()
1921                              } else {
1922                                  body_response.body.as_bytes().to_vec()
1923                              };
1924                              content = Some(media_file.into());
1925                          }
1926                        }
1927                    }
1928
1929                    let mut page_response = PageResponse::default();
1930
1931                    let _ = tokio::time::timeout(
1932                        base_timeout,
1933                        set_page_response_cookies(&mut page_response, &page),
1934                    )
1935                    .await;
1936
1937                    if let Some(mut chrome_http_req_res1) = chrome_http_req_res1 {
1938                        set_page_response_headers(&mut chrome_http_req_res1, &mut page_response);
1939
1940                        page_response.status_code = chrome_http_req_res1.status_code;
1941                        page_response.waf_check = chrome_http_req_res1.waf_check;
1942
1943                        if !page_set {
1944                            let _ = tokio::time::timeout(
1945                                base_timeout,
1946                                cache_chrome_response(&source, &page_response, chrome_http_req_res1),
1947                            )
1948                            .await;
1949                        }
1950
1951                    }
1952
1953                    Ok(page_response)
1954                }
1955            }
1956        }
1957        _ => Ok(run_page_response.await),
1958    };
1959
1960    let mut page_response = page_response.unwrap_or_default();
1961
1962    set_page_response_headers(&mut chrome_http_req_res, &mut page_response);
1963    page_response.status_code = chrome_http_req_res.status_code;
1964    page_response.waf_check = chrome_http_req_res.waf_check;
1965
1966    if content.is_some() {
1967        page_response.content = content.map(|f| f.into());
1968    }
1969
1970    if page_response.status_code == *UNKNOWN_STATUS_ERROR && page_response.content.is_some() {
1971        page_response.status_code = StatusCode::OK;
1972    }
1973
1974    // run initial handling hidden anchors
1975    // if let Ok(new_links) = page.evaluate(crate::features::chrome::ANCHOR_EVENTS).await {
1976    //     if let Ok(results) = new_links.into_value::<hashbrown::HashSet<CaseInsensitiveString>>() {
1977    //         links.extend(page.extract_links_raw(&base, &results).await);
1978    //     }
1979    // }
1980
1981    if cfg!(not(feature = "chrome_store_page")) {
1982        let _ = tokio::time::timeout(
1983            base_timeout.max(HALF_MAX_PAGE_TIMEOUT),
1984            page.execute(chromiumoxide::cdp::browser_protocol::page::CloseParams::default()),
1985        )
1986        .await;
1987
1988        if let Ok((mut transferred, bytes_map, mut rs, request_map)) = bytes_collected_handle.await
1989        {
1990            let response_map = rs.response_map;
1991
1992            if response_map.is_some() {
1993                let mut _response_map = HashMap::new();
1994
1995                if let Some(response_map) = response_map {
1996                    if let Some(bytes_map) = bytes_map {
1997                        let detect_anti_bots =
1998                            response_map.len() <= 4 && anti_bot_tech == AntiBotTech::None;
1999
2000                        for item in response_map {
2001                            if detect_anti_bots && item.1.url.contains("_Incapsula_Resource?") {
2002                                anti_bot_tech = AntiBotTech::Imperva;
2003                            }
2004
2005                            let b = if item.1.skipped {
2006                                0.0
2007                            } else {
2008                                match bytes_map.get(&item.0) {
2009                                    Some(f) => *f,
2010                                    _ => 0.0,
2011                                }
2012                            };
2013
2014                            if item.1.skipped {
2015                                transferred -= item.1.bytes_transferred;
2016                            }
2017
2018                            _response_map.insert(item.1.url, b);
2019                        }
2020                    }
2021                }
2022
2023                page_response.response_map = Some(_response_map);
2024
2025                if let Some(status) = rs.status_code {
2026                    if let Ok(scode) = status.try_into() {
2027                        if let Ok(status) = StatusCode::from_u16(scode) {
2028                            page_response.status_code = status;
2029                        }
2030                    }
2031                }
2032
2033                set_page_response_headers_raw(&mut rs.headers, &mut page_response);
2034                store_headers(&page_response, &mut chrome_http_req_res);
2035
2036                if anti_bot_tech == AntiBotTech::None {
2037                    let final_url = match &page_response.final_url {
2038                        Some(final_url) => final_url,
2039                        _ => target_url,
2040                    };
2041                    if let Some(h) = &page_response.headers {
2042                        if let Some(content) = &page_response.content {
2043                            anti_bot_tech = detect_anti_bot_tech_response(
2044                                &final_url,
2045                                &HeaderSource::HeaderMap(h),
2046                                &content,
2047                                None,
2048                            );
2049                        }
2050                    }
2051                }
2052
2053                if let Some(content) = &page_response.content {
2054                    // validate if the turnstile page is still open.
2055                    if anti_bot_tech == AntiBotTech::Cloudflare
2056                        && page_response.status_code == StatusCode::FORBIDDEN
2057                    {
2058                        let cf_turnstile = detect_cf_turnstyle(&content);
2059
2060                        if !cf_turnstile {
2061                            page_response.status_code = StatusCode::OK;
2062                        }
2063                    }
2064                }
2065
2066                if !page_set {
2067                    let _ = tokio::time::timeout(
2068                        base_timeout,
2069                        cache_chrome_response(&source, &page_response, chrome_http_req_res),
2070                    )
2071                    .await;
2072                }
2073            }
2074            if request_map.is_some() {
2075                page_response.request_map = request_map;
2076            }
2077
2078            page_response.bytes_transferred = Some(transferred);
2079        }
2080    }
2081
2082    page_response.anti_bot_tech = anti_bot_tech;
2083
2084    Ok(page_response)
2085}
2086
/// Build a [`PageResponse`] from the page bytes, status code and final url.
///
/// `content` is only populated when `ok` is true; otherwise `res` is dropped
/// and the content stays `None`. Every other field keeps its default value.
#[cfg(feature = "chrome")]
fn set_page_response(
    ok: bool,
    res: Box<Vec<u8>>,
    status_code: StatusCode,
    final_url: Option<String>,
) -> PageResponse {
    let content = ok.then(|| res.into());

    PageResponse {
        content,
        status_code,
        final_url,
        ..Default::default()
    }
}
2102
/// Copy the response headers captured from chrome onto the page response.
///
/// The captured headers are converted with `convert_headers`; when the
/// converted map is empty `page_response.headers` is left as it was.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn set_page_response_headers(
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    page_response: &mut PageResponse,
) {
    let converted = convert_headers(&chrome_http_req_res.response_headers);

    if converted.is_empty() {
        return;
    }

    page_response.headers = Some(converted);
}
2115
/// No-op stand-in compiled when the `headers` feature is disabled: neither
/// argument is read or modified.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn set_page_response_headers(
    _chrome_http_req_res: &mut ChromeHTTPReqRes,
    _page_response: &mut PageResponse,
) {
    // Header support is compiled out, so there is nothing to copy.
}
2123
/// Set the page response headers from the raw headers reported by chrome.
///
/// Every entry of the header object is converted into a `reqwest` header and
/// collected into a fresh map, which replaces `page_response.headers` when at
/// least one entry converted successfully. Entries whose name or value is not
/// a valid HTTP header are skipped, and at most `MAX_RAW_HEADERS` entries are
/// read. When no headers are provided the page response is left untouched.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn set_page_response_headers_raw(
    chrome_http_req_res: &mut Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    page_response: &mut PageResponse,
) {
    use reqwest::header::{HeaderMap, HeaderName, HeaderValue};
    use std::str::FromStr;

    // Same cut-off as before: entries at index 0..=1001 are processed.
    const MAX_RAW_HEADERS: usize = 1002;

    let chrome_headers = match chrome_http_req_res {
        Some(chrome_headers) => chrome_headers,
        None => return,
    };

    let mut header_map = HeaderMap::new();

    if let Some(obj) = chrome_headers.inner().as_object() {
        for (key, value) in obj.iter().take(MAX_RAW_HEADERS) {
            let header_name = match HeaderName::from_str(key) {
                Ok(header_name) => header_name,
                Err(_) => continue,
            };

            match value.as_str() {
                // String values must be read with `as_str`: `to_string` on a JSON
                // value serializes it, which wraps the text in double quotes and
                // escapes its content, corrupting the stored header value.
                Some(text) => {
                    // A line feed can never be part of a single header value.
                    // NOTE(review): chrome appears to join repeated headers with
                    // "\n" - confirm; each line is stored as its own value.
                    let mut lines = text
                        .split('\n')
                        .filter_map(|line| HeaderValue::from_str(line).ok());

                    if let Some(first) = lines.next() {
                        // `insert` keeps the previous "last key wins" behavior.
                        header_map.insert(header_name.clone(), first);

                        for extra in lines {
                            header_map.append(header_name.clone(), extra);
                        }
                    }
                }
                // Non-string values keep their JSON text representation.
                None => {
                    if let Ok(header_value) = HeaderValue::from_str(&value.to_string()) {
                        header_map.insert(header_name, header_value);
                    }
                }
            }
        }
    }

    if !header_map.is_empty() {
        page_response.headers = Some(header_map);
    }
}
2152
/// No-op stand-in compiled when the `headers` feature is disabled: the raw
/// chrome headers are ignored and the page response is not modified.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn set_page_response_headers_raw(
    _chrome_http_req_res: &mut Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    _page_response: &mut PageResponse,
) {
    // Header support is compiled out, so there is nothing to convert.
}
2160
/// Store the cookies currently held by the chrome page on the page response.
///
/// The cookies are fetched with `page.get_cookies()`, gathered into a
/// name -> value map and converted with `convert_headers`. Nothing is written
/// when the lookup fails or when the converted map is empty.
#[cfg(all(feature = "chrome", feature = "cookies"))]
async fn set_page_response_cookies(page_response: &mut PageResponse, page: &chromiumoxide::Page) {
    let cookies = match page.get_cookies().await {
        Ok(cookies) => cookies,
        Err(_) => return,
    };

    // A later cookie replaces an earlier one that shares its name.
    let by_name: std::collections::HashMap<String, String> = cookies
        .into_iter()
        .map(|cookie| (cookie.name, cookie.value))
        .collect();

    let converted = convert_headers(&by_name);

    if !converted.is_empty() {
        page_response.cookies = Some(converted);
    }
}
2179
/// No-op stand-in compiled when the `cookies` feature is disabled: the page is
/// not queried and the page response is not modified.
#[cfg(all(feature = "chrome", not(feature = "cookies")))]
async fn set_page_response_cookies(_page_response: &mut PageResponse, _page: &chromiumoxide::Page) {
    // Cookie support is compiled out, so there is nothing to collect.
}
2184
/// Take a screenshot of the page.
///
/// With a screenshot configuration the capture is executed through CDP using
/// the configured parameters (speed optimisation is always requested, and a
/// full-page capture also captures beyond the viewport). The decoded image is
/// written to disk when `save` is set (under `output_dir`, or `./storage/`
/// when unset) and stored in `page_response.screenshot_bytes` when `bytes` is
/// set. A transparent background override is applied around the capture when
/// `omit_background` is requested.
///
/// Without a configuration a PNG is saved under `SCREENSHOT_DIRECTORY`
/// (default `./storage/`); `SCREENSHOT_FULL_PAGE` and
/// `SCREENSHOT_OMIT_BACKGROUND` are enabled unless set to something other than
/// `"true"`.
///
/// Capture failures are logged; a payload that fails to base64-decode is
/// ignored.
#[cfg(feature = "chrome")]
pub async fn perform_screenshot(
    target_url: &str,
    page: &chromiumoxide::Page,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_response: &mut PageResponse,
) {
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;
    use chromiumoxide::cdp::browser_protocol::dom::Rgba;
    use chromiumoxide::cdp::browser_protocol::emulation::SetDefaultBackgroundColorOverrideParams;

    if let Some(ss) = screenshot {
        // File extension derived from the configured format, PNG by default.
        let output_format = string_concat!(
            ".",
            ss.params
                .cdp_params
                .format
                .as_ref()
                .unwrap_or(&crate::configuration::CaptureScreenshotFormat::Png)
                .to_string()
        );

        let params = chromiumoxide::page::ScreenshotParams::from(ss.params.clone());
        let capture_full_page = params.full_page.unwrap_or_default();
        let transparent_background = params.omit_background.unwrap_or_default();

        let mut cdp_params = params.cdp_params;
        cdp_params.optimize_for_speed = Some(true);

        if capture_full_page {
            cdp_params.capture_beyond_viewport = Some(true);
        }

        if transparent_background {
            // Fully transparent default background for the duration of the capture.
            let _ = page
                .execute(SetDefaultBackgroundColorOverrideParams {
                    color: Some(Rgba {
                        r: 0,
                        g: 0,
                        b: 0,
                        a: Some(0.),
                    }),
                })
                .await;
        }

        match page.execute(cdp_params).await {
            Ok(capture) => {
                if let Ok(image) = STANDARD.decode(&capture.data) {
                    if ss.save {
                        let output_path = create_output_path(
                            &ss.output_dir.clone().unwrap_or_else(|| "./storage/".into()),
                            &target_url,
                            &output_format,
                        )
                        .await;
                        let _ = tokio::fs::write(output_path, &image).await;
                    }

                    if ss.bytes {
                        page_response.screenshot_bytes = Some(image);
                    }
                }
            }
            Err(e) => {
                log::error!("failed to take screenshot: {:?} - {:?}", e, target_url)
            }
        }

        if transparent_background {
            // Restore the default background.
            let _ = page
                .execute(SetDefaultBackgroundColorOverrideParams { color: None })
                .await;
        }
    } else {
        let output_path = create_output_path(
            &std::env::var("SCREENSHOT_DIRECTORY")
                .unwrap_or_else(|_| "./storage/".to_string())
                .into(),
            &target_url,
            &".png",
        )
        .await;

        // Both switches default to enabled when the variable is missing.
        let full_page = std::env::var("SCREENSHOT_FULL_PAGE")
            .map(|flag| flag == "true")
            .unwrap_or(true);
        let omit_background = std::env::var("SCREENSHOT_OMIT_BACKGROUND")
            .map(|flag| flag == "true")
            .unwrap_or(true);

        let params = chromiumoxide::page::ScreenshotParams::builder()
            .format(chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png)
            .full_page(full_page)
            .omit_background(omit_background)
            .build();

        match page.save_screenshot(params, &output_path).await {
            Ok(_) => log::debug!("saved screenshot: {:?}", output_path),
            Err(e) => log::error!("failed to save screenshot: {:?} - {:?}", e, output_path),
        };
    }
}
2294
/// Resolve the final url of a chrome CDP navigation.
///
/// When the request carries a redirect chain whose last entry has a url, that
/// url is returned if it differs from `target_url`, and `None` if it is the
/// same. Otherwise the current `page.url()` is returned; a failed lookup
/// yields `None`.
#[cfg(feature = "chrome")]
pub async fn get_last_redirect(
    target_url: &str,
    u: &Option<std::sync::Arc<chromiumoxide::handler::http::HttpRequest>>,
    page: &chromiumoxide::Page,
) -> Option<String> {
    let last_hop_url = u
        .as_ref()
        .and_then(|request| request.redirect_chain.last())
        .and_then(|hop| hop.url.as_ref());

    match last_hop_url {
        Some(url) => {
            if target_url != url {
                Some(url.clone())
            } else {
                None
            }
        }
        // No usable redirect information: ask the page for its current url.
        None => page.url().await.ok().flatten(),
    }
}
2315
/// Map the response cookies into a header map keyed by cookie name.
///
/// Cookies whose value or name cannot be represented as a header are skipped.
/// Returns `None` when no cookie was mapped. This does nothing without the
/// cookies feature flag enabled.
#[cfg(feature = "cookies")]
pub fn get_cookies(res: &Response) -> Option<crate::client::header::HeaderMap> {
    use crate::client::header::{HeaderMap, HeaderName, HeaderValue};

    let mut jar = HeaderMap::new();

    for cookie in res.cookies() {
        let value = match HeaderValue::from_str(cookie.value()) {
            Ok(value) => value,
            Err(_) => continue,
        };
        let name = match HeaderName::from_str(cookie.name()) {
            Ok(name) => name,
            Err(_) => continue,
        };

        jar.insert(name, value);
    }

    if jar.is_empty() {
        None
    } else {
        Some(jar)
    }
}
2337
2338#[cfg(not(feature = "cookies"))]
2339/// The response cookies mapped. This does nothing without the cookies feature flag enabled.
2340pub fn get_cookies(res: &Response) -> Option<crate::client::header::HeaderMap> {
2341    None
2342}
2343
2344/// Block streaming
2345pub(crate) fn block_streaming(res: &Response, only_html: bool) -> bool {
2346    let mut block_streaming = false;
2347
2348    if only_html {
2349        if let Some(content_type) = res.headers().get(crate::client::header::CONTENT_TYPE) {
2350            if let Ok(content_type_str) = content_type.to_str() {
2351                if IGNORE_CONTENT_TYPES.contains(content_type_str) {
2352                    block_streaming = true;
2353                }
2354            }
2355        }
2356    }
2357
2358    block_streaming
2359}
2360
2361/// Handle the response bytes
2362pub async fn handle_response_bytes(
2363    res: Response,
2364    target_url: &str,
2365    only_html: bool,
2366) -> PageResponse {
2367    let u = res.url().as_str();
2368
2369    let rd = if target_url != u {
2370        Some(u.into())
2371    } else {
2372        None
2373    };
2374
2375    let status_code: StatusCode = res.status();
2376    let headers = res.headers().clone();
2377    #[cfg(feature = "remote_addr")]
2378    let remote_addr = res.remote_addr();
2379    let cookies = get_cookies(&res);
2380
2381    let mut content: Option<Box<Vec<u8>>> = None;
2382    let mut anti_bot_tech = AntiBotTech::default();
2383
2384    if !block_streaming(&res, only_html) {
2385        let mut data = match res.content_length() {
2386            Some(cap) if cap >= MAX_PRE_ALLOCATED_HTML_PAGE_SIZE => {
2387                Vec::with_capacity(cap.max(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE) as usize)
2388            }
2389            _ => Vec::with_capacity(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE),
2390        };
2391        let mut stream = res.bytes_stream();
2392        let mut first_bytes = true;
2393
2394        while let Some(item) = stream.next().await {
2395            match item {
2396                Ok(text) => {
2397                    if only_html && first_bytes {
2398                        first_bytes = false;
2399                        if is_binary_file(&text) {
2400                            break;
2401                        }
2402                    }
2403                    let limit = *MAX_SIZE_BYTES;
2404
2405                    if limit > 0 && data.len() + text.len() > limit {
2406                        break;
2407                    }
2408
2409                    data.put(text)
2410                }
2411                Err(e) => {
2412                    log::error!("{e} in {}", target_url);
2413                    break;
2414                }
2415            }
2416        }
2417
2418        anti_bot_tech = detect_anti_bot_tech_response(
2419            &target_url,
2420            &HeaderSource::HeaderMap(&headers),
2421            &data,
2422            None,
2423        );
2424        content.replace(Box::new(data.into()));
2425    }
2426
2427    PageResponse {
2428        #[cfg(feature = "headers")]
2429        headers: Some(headers),
2430        #[cfg(feature = "remote_addr")]
2431        remote_addr,
2432        #[cfg(feature = "cookies")]
2433        cookies,
2434        content,
2435        final_url: rd,
2436        status_code,
2437        anti_bot_tech,
2438        ..Default::default()
2439    }
2440}
2441
2442/// Handle the response bytes writing links while crawling
2443pub async fn handle_response_bytes_writer<'h, O>(
2444    res: Response,
2445    target_url: &str,
2446    only_html: bool,
2447    rewriter: &mut HtmlRewriter<'h, O>,
2448    collected_bytes: &mut Vec<u8>,
2449) -> (PageResponse, bool)
2450where
2451    O: OutputSink + Send + 'static,
2452{
2453    let u = res.url().as_str();
2454
2455    let final_url: Option<String> = if target_url != u {
2456        Some(u.into())
2457    } else {
2458        None
2459    };
2460
2461    let status_code: StatusCode = res.status();
2462    let headers = res.headers().clone();
2463    #[cfg(feature = "remote_addr")]
2464    let remote_addr = res.remote_addr();
2465    let cookies = get_cookies(&res);
2466    let mut anti_bot_tech = AntiBotTech::default();
2467
2468    let mut rewrite_error = false;
2469
2470    if !block_streaming(&res, only_html) {
2471        let mut stream = res.bytes_stream();
2472        let mut first_bytes = true;
2473        let mut data_len = 0;
2474
2475        while let Some(item) = stream.next().await {
2476            match item {
2477                Ok(res_bytes) => {
2478                    if only_html && first_bytes {
2479                        first_bytes = false;
2480                        if is_binary_file(&res_bytes) {
2481                            break;
2482                        }
2483                    }
2484                    let limit = *MAX_SIZE_BYTES;
2485                    let bytes_len = res_bytes.len();
2486
2487                    if limit > 0 && data_len + bytes_len > limit {
2488                        break;
2489                    }
2490
2491                    data_len += bytes_len;
2492
2493                    if !rewrite_error {
2494                        if rewriter.write(&res_bytes).is_err() {
2495                            rewrite_error = true;
2496                        }
2497                    }
2498
2499                    collected_bytes.put(res_bytes);
2500                }
2501                Err(e) => {
2502                    log::error!("{e} in {}", target_url);
2503                    break;
2504                }
2505            }
2506        }
2507
2508        anti_bot_tech = detect_anti_bot_tech_response(
2509            &target_url,
2510            &HeaderSource::HeaderMap(&headers),
2511            &collected_bytes,
2512            None,
2513        );
2514    }
2515
2516    (
2517        PageResponse {
2518            #[cfg(feature = "headers")]
2519            headers: Some(headers),
2520            #[cfg(feature = "remote_addr")]
2521            remote_addr,
2522            #[cfg(feature = "cookies")]
2523            cookies,
2524            final_url,
2525            status_code,
2526            anti_bot_tech,
2527            ..Default::default()
2528        },
2529        rewrite_error,
2530    )
2531}
2532
2533/// Continue to parse a valid web page.
2534pub(crate) fn valid_parsing_status(res: &Response) -> bool {
2535    res.status().is_success() || res.status() == 404
2536}
2537
2538/// Perform a network request to a resource extracting all content streaming.
2539async fn fetch_page_html_raw_base(
2540    target_url: &str,
2541    client: &Client,
2542    only_html: bool,
2543) -> PageResponse {
2544    match client.get(target_url).send().await {
2545        Ok(res) if valid_parsing_status(&res) => {
2546            handle_response_bytes(res, target_url, only_html).await
2547        }
2548        Ok(res) => handle_response_bytes(res, target_url, only_html).await,
2549        Err(err) => {
2550            log::info!("error fetching {}", target_url);
2551            let mut page_response = PageResponse::default();
2552
2553            if let Some(status_code) = err.status() {
2554                page_response.status_code = status_code;
2555            } else {
2556                page_response.status_code = crate::page::get_error_http_status_code(&err);
2557            }
2558
2559            page_response.error_for_status = Some(Err(err));
2560            page_response
2561        }
2562    }
2563}
2564
2565/// Perform a network request to a resource extracting all content streaming.
2566pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageResponse {
2567    fetch_page_html_raw_base(target_url, client, false).await
2568}
2569
2570/// Perform a network request to a resource extracting all content streaming.
2571pub async fn fetch_page_html_raw_only_html(target_url: &str, client: &Client) -> PageResponse {
2572    fetch_page_html_raw_base(target_url, client, false).await
2573}
2574
/// Perform a network request to a resource and return the raw body bytes.
///
/// Returns `None` when the request fails, when the status is not accepted by
/// `valid_parsing_status`, or when the body cannot be read.
#[cfg(feature = "decentralized")]
pub async fn fetch_page(target_url: &str, client: &Client) -> Option<Vec<u8>> {
    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => match res.bytes().await {
            Ok(text) => Some(text.into()),
            Err(_) => {
                // The request succeeded; reading the body is what failed.
                log("- error parsing html bytes {}", &target_url);
                None
            }
        },
        Ok(_) => None,
        Err(_) => {
            // The request itself never produced a response.
            log("- error fetching {}", &target_url);
            None
        }
    }
}
2593
#[cfg(all(feature = "decentralized", feature = "headers"))]
/// Result of fetching a page together with the response headers.
pub enum FetchPageResult {
    /// The response was accepted for parsing: the response headers and the body bytes (`None` if the body could not be read).
    Success(reqwest::header::HeaderMap, Option<Vec<u8>>),
    /// A response was received but not accepted for parsing: only the response headers are kept.
    NoSuccess(reqwest::header::HeaderMap),
    /// A network error occurred.
    FetchError,
}
2604
#[cfg(all(feature = "decentralized", feature = "headers"))]
/// Perform a network request to a resource, returning the response headers with the body.
///
/// * `Success` - the status passed `valid_parsing_status`; the body is `None` if it could not be read.
/// * `NoSuccess` - a response arrived with any other status; only the headers are returned.
/// * `FetchError` - the request itself failed.
pub async fn fetch_page_and_headers(target_url: &str, client: &Client) -> FetchPageResult {
    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            let headers = res.headers().clone();
            // Convert the received bytes into the `Vec<u8>` that `FetchPageResult::Success` stores.
            let b: Option<Vec<u8>> = match res.bytes().await {
                Ok(text) => Some(text.into()),
                Err(_) => {
                    // The request succeeded; reading the body is what failed.
                    log("- error parsing html bytes {}", &target_url);
                    None
                }
            };
            FetchPageResult::Success(headers, b)
        }
        Ok(res) => FetchPageResult::NoSuccess(res.headers().clone()),
        Err(_) => {
            // The request itself never produced a response.
            log("- error fetching {}", &target_url);
            FetchPageResult::FetchError
        }
    }
}
2627
2628#[cfg(all(not(feature = "fs"), not(feature = "chrome")))]
2629/// Perform a network request to a resource extracting all content as text streaming.
2630pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
2631    fetch_page_html_raw(target_url, client).await
2632}
2633
/// Perform a network request to a resource, streaming the body and spooling it to disk once it grows.
///
/// Chunks are collected in memory until the buffer's capacity reaches 8192 bytes.
/// From then on the buffered bytes and every later chunk are written to a temporary
/// file under `TMP_DIR`, named after the percent-encoded `target_url`. When the
/// stream ends the file is read back as the page content and then removed.
///
/// Responses whose status is not accepted by `valid_parsing_status` are returned
/// without content. A failed request yields a response carrying the error and a
/// status code derived from it.
#[cfg(all(feature = "fs", not(feature = "chrome")))]
pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
    use crate::bytes::BufMut;
    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            let u = res.url().as_str();

            // Only record the final url when it differs from the requested one.
            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            // Everything needed from `res` must be captured before `bytes_stream` consumes it.
            let status_code = res.status();
            let cookies = get_cookies(&res);
            #[cfg(feature = "headers")]
            let headers = res.headers().clone();
            #[cfg(feature = "remote_addr")]
            let remote_addr = res.remote_addr();
            let mut stream = res.bytes_stream();
            let mut data = Vec::new();
            let mut file: Option<tokio::fs::File> = None;
            let mut file_path = String::new();

            while let Some(item) = stream.next().await {
                match item {
                    Ok(text) => {
                        if let Some(f) = file.as_mut() {
                            // Already spooling to disk; keep the chunk in memory only if the write fails.
                            // NOTE(review): bytes left in `data` after a failed write are not merged
                            // into the final content, which is read from the file alone.
                            if f.write_all(&text).await.is_err() {
                                data.put(text);
                            }
                        } else if data.capacity() < 8192 {
                            // Small enough so far: build the resource entirely in memory.
                            data.put(text);
                        } else {
                            file_path = string_concat!(
                                TMP_DIR,
                                &utf8_percent_encode(target_url, NON_ALPHANUMERIC).to_string()
                            );
                            data.put(text);

                            // If the file cannot be created the bytes stay in memory and
                            // creation is attempted again on the next chunk.
                            if let Ok(f) = tokio::fs::File::create(&file_path).await {
                                let f = file.insert(f);

                                if f.write_all(&data).await.is_ok() {
                                    data.clear();
                                }
                            }
                        }
                    }
                    Err(e) => {
                        log::error!("{e} in {}", target_url);
                        break;
                    }
                }
            }

            let content: Vec<u8> = match file.take() {
                Some(mut f) => {
                    // tokio buffers file writes; flush so the read through a second handle sees all bytes.
                    let _ = f.flush().await;
                    drop(f);

                    let mut buffer = vec![];

                    if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
                        let _ = b.read_to_end(&mut buffer).await;
                    }

                    // Always clean up the spool file, even when reading it back failed.
                    let _ = tokio::fs::remove_file(&file_path).await;

                    buffer
                }
                None => data,
            };

            PageResponse {
                #[cfg(feature = "headers")]
                headers: Some(headers),
                #[cfg(feature = "remote_addr")]
                remote_addr,
                #[cfg(feature = "cookies")]
                cookies,
                content: Some(Box::new(content.into())),
                status_code,
                final_url: rd,
                ..Default::default()
            }
        }
        Ok(res) => {
            let u = res.url().as_str();

            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            PageResponse {
                #[cfg(feature = "headers")]
                headers: Some(res.headers().clone()),
                #[cfg(feature = "remote_addr")]
                remote_addr: res.remote_addr(),
                #[cfg(feature = "cookies")]
                cookies: get_cookies(&res),
                status_code: res.status(),
                final_url: rd,
                ..Default::default()
            }
        }
        Err(err) => {
            log::info!("error fetching {}", target_url);
            let mut page_response = PageResponse::default();

            if let Some(status_code) = err.status() {
                page_response.status_code = status_code;
            } else {
                page_response.status_code = crate::page::get_error_http_status_code(&err);
            }

            page_response.error_for_status = Some(Err(err));
            page_response
        }
    }
}
2765
/// Perform a network request to a resource extracting all content as text streaming via chrome.
///
/// The page is first fetched through `fetch_page_html_chrome_base`. If that fails,
/// a raw HTTP request is made with `client` instead. Its body is collected in memory
/// until the buffer's capacity reaches 8192 bytes, then spooled to a temporary file
/// under `TMP_DIR` that is read back and removed once the stream ends.
#[cfg(all(feature = "fs", feature = "chrome"))]
pub async fn fetch_page_html(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
) -> PageResponse {
    use crate::bytes::BufMut;
    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

    match fetch_page_html_chrome_base(
        target_url,
        page,
        false,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        None,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
    )
    .await
    {
        Ok(page_response) => page_response,
        Err(_) => {
            log::info!(
                "- error fetching chrome page defaulting to raw http request {}",
                &target_url,
            );

            match client.get(target_url).send().await {
                Ok(res) if valid_parsing_status(&res) => {
                    // Everything needed from `res` must be captured before `bytes_stream` consumes it.
                    #[cfg(feature = "headers")]
                    let headers = res.headers().clone();
                    #[cfg(feature = "remote_addr")]
                    let remote_addr = res.remote_addr();
                    let cookies = get_cookies(&res);
                    let status_code = res.status();
                    let mut stream = res.bytes_stream();
                    let mut data = Vec::new();

                    let mut file: Option<tokio::fs::File> = None;
                    let mut file_path = String::new();

                    while let Some(item) = stream.next().await {
                        match item {
                            Ok(text) => {
                                if let Some(f) = file.as_mut() {
                                    // Already spooling to disk; keep the chunk in memory only if the write fails.
                                    // NOTE(review): bytes left in `data` after a failed write are not merged
                                    // into the final content, which is read from the file alone.
                                    if f.write_all(&text).await.is_err() {
                                        data.put(text);
                                    }
                                } else if data.capacity() < 8192 {
                                    // Small enough so far: build the resource entirely in memory.
                                    data.put(text);
                                } else {
                                    file_path = string_concat!(
                                        TMP_DIR,
                                        &utf8_percent_encode(target_url, NON_ALPHANUMERIC)
                                            .to_string()
                                    );
                                    data.put(text);

                                    // If the file cannot be created the bytes stay in memory and
                                    // creation is attempted again on the next chunk.
                                    if let Ok(f) = tokio::fs::File::create(&file_path).await {
                                        let f = file.insert(f);

                                        if f.write_all(&data).await.is_ok() {
                                            data.clear();
                                        }
                                    }
                                }
                            }
                            Err(e) => {
                                log::error!("{e} in {}", target_url);
                                break;
                            }
                        }
                    }

                    let content: Vec<u8> = match file.take() {
                        Some(mut f) => {
                            // tokio buffers file writes; flush so the read through a second handle sees all bytes.
                            let _ = f.flush().await;
                            drop(f);

                            let mut buffer = vec![];

                            if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
                                let _ = b.read_to_end(&mut buffer).await;
                            }

                            // Always clean up the spool file, even when reading it back failed.
                            let _ = tokio::fs::remove_file(&file_path).await;

                            buffer
                        }
                        None => data,
                    };

                    PageResponse {
                        #[cfg(feature = "headers")]
                        headers: Some(headers),
                        #[cfg(feature = "remote_addr")]
                        remote_addr,
                        #[cfg(feature = "cookies")]
                        cookies,
                        content: Some(Box::new(content.into())),
                        status_code,
                        ..Default::default()
                    }
                }

                Ok(res) => PageResponse {
                    #[cfg(feature = "headers")]
                    headers: Some(res.headers().clone()),
                    #[cfg(feature = "remote_addr")]
                    remote_addr: res.remote_addr(),
                    #[cfg(feature = "cookies")]
                    cookies: get_cookies(&res),
                    status_code: res.status(),
                    ..Default::default()
                },
                Err(err) => {
                    log::info!("error fetching {}", target_url);
                    let mut page_response = PageResponse::default();

                    if let Some(status_code) = err.status() {
                        page_response.status_code = status_code;
                    } else {
                        page_response.status_code =
                            crate::page::get_error_http_status_code(&err);
                    }

                    page_response.error_for_status = Some(Err(err));
                    page_response
                }
            }
        }
    }
}
2930
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page through chrome, falling back to a raw HTTP request on failure.
///
/// The chrome attempt goes through `fetch_page_html_chrome_base`. If it returns an
/// error, the error is logged and [`fetch_page_html_raw`] is used with `client`.
pub async fn fetch_page_html(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
) -> PageResponse {
    let rendered = fetch_page_html_chrome_base(
        target_url,
        page,
        false,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        None,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
    )
    .await;

    match rendered {
        Ok(page_response) => page_response,
        Err(err) => {
            log::error!("{:?}", err);
            fetch_page_html_raw(target_url, client).await
        }
    }
}
2972
#[cfg(feature = "chrome")]
/// Fetch a page through chrome, falling back to an in-memory raw HTTP request on failure.
///
/// The chrome attempt goes through `fetch_page_html_chrome_base`. If it returns an
/// error, the error is logged and the page is requested with `client`. The fallback
/// body is accumulated in memory and reading stops once another chunk would exceed
/// `MAX_SIZE_BYTES` (when that limit is greater than zero).
pub async fn fetch_page_html_chrome(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
) -> PageResponse {
    use crate::bytes::BufMut;

    let rendered = fetch_page_html_chrome_base(
        target_url,
        page,
        false,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        None,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
    )
    .await;

    match rendered {
        Ok(page_response) => return page_response,
        Err(err) => log::error!(
            "{:?}. Error requesting: {} - defaulting to raw http request",
            err,
            target_url
        ),
    }

    // Chrome failed: retry the same url as a plain HTTP request.
    let res = match client.get(target_url).send().await {
        Ok(res) => res,
        Err(err) => {
            log::info!("error fetching {}", target_url);

            let mut failed = PageResponse::default();

            failed.status_code = match err.status() {
                Some(status_code) => status_code,
                None => crate::page::get_error_http_status_code(&err),
            };
            failed.error_for_status = Some(Err(err));

            return failed;
        }
    };

    if !valid_parsing_status(&res) {
        // Status not accepted for parsing: return the metadata without a body.
        return PageResponse {
            #[cfg(feature = "headers")]
            headers: Some(res.headers().clone()),
            #[cfg(feature = "remote_addr")]
            remote_addr: res.remote_addr(),
            #[cfg(feature = "cookies")]
            cookies: get_cookies(&res),
            status_code: res.status(),
            ..Default::default()
        };
    }

    // Capture response metadata before `bytes_stream` consumes the response.
    #[cfg(feature = "headers")]
    let headers = res.headers().clone();
    #[cfg(feature = "remote_addr")]
    let remote_addr = res.remote_addr();
    let cookies = get_cookies(&res);
    let status_code = res.status();
    let mut body_stream = res.bytes_stream();
    let mut body = Vec::new();

    while let Some(chunk) = body_stream.next().await {
        match chunk {
            Ok(bytes) => {
                let limit = *MAX_SIZE_BYTES;

                if limit > 0 && body.len() + bytes.len() > limit {
                    break;
                }

                body.put(bytes)
            }
            Err(e) => {
                log::error!("{e} in {}", target_url);
                break;
            }
        }
    }

    PageResponse {
        #[cfg(feature = "headers")]
        headers: Some(headers),
        #[cfg(feature = "remote_addr")]
        remote_addr,
        #[cfg(feature = "cookies")]
        cookies,
        content: Some(Box::new(body.into())),
        status_code,
        ..Default::default()
    }
}
3089
3090#[cfg(not(feature = "openai"))]
3091/// Perform a request to OpenAI Chat. This does nothing without the 'openai' flag enabled.
3092pub async fn openai_request(
3093    _gpt_configs: &crate::configuration::GPTConfigs,
3094    _resource: String,
3095    _url: &str,
3096    _prompt: &str,
3097) -> crate::features::openai_common::OpenAIReturn {
3098    Default::default()
3099}
3100
#[cfg(feature = "openai")]
lazy_static! {
    /// The `cl100k_base` tokenizer, used for token counting when no model-specific tokenizer is found.
    static ref CORE_BPE_TOKEN_COUNT: tiktoken_rs::CoreBPE = tiktoken_rs::cl100k_base().unwrap();
    /// Semaphore whose permit is acquired before building an OpenAI request.
    ///
    /// The permit count is derived from the logical and physical CPU counts, with a
    /// floor of 20 when the two counts are equal and 10 otherwise.
    static ref SEM: tokio::sync::Semaphore = {
        let logical = num_cpus::get();
        let physical = num_cpus::get_physical();

        // Logical cpus per physical core when they differ, otherwise the logical count itself.
        let per_core = if logical > physical {
            logical / physical
        } else {
            logical
        };

        let (scaled, floor) = if logical == physical {
            (per_core * physical, 20)
        } else {
            (per_core * 4, 10)
        };

        tokio::sync::Semaphore::const_new((scaled / 3).max(floor))
    };
    /// Shared OpenAI client created with `async_openai::Client::new()`.
    static ref CLIENT: async_openai::Client<async_openai::config::OpenAIConfig> =
        async_openai::Client::new();
}
3125
3126#[cfg(feature = "openai")]
3127/// Perform a request to OpenAI Chat. This does nothing without the 'openai' flag enabled.
3128pub async fn openai_request_base(
3129    gpt_configs: &crate::configuration::GPTConfigs,
3130    resource: String,
3131    url: &str,
3132    prompt: &str,
3133) -> crate::features::openai_common::OpenAIReturn {
3134    match SEM.acquire().await {
3135        Ok(permit) => {
3136            let mut chat_completion_defaults =
3137                async_openai::types::CreateChatCompletionRequestArgs::default();
3138            let gpt_base = chat_completion_defaults
3139                .max_tokens(gpt_configs.max_tokens)
3140                .model(&gpt_configs.model);
3141            let gpt_base = match gpt_configs.user {
3142                Some(ref user) => gpt_base.user(user),
3143                _ => gpt_base,
3144            };
3145            let gpt_base = match gpt_configs.temperature {
3146                Some(temp) => gpt_base.temperature(temp),
3147                _ => gpt_base,
3148            };
3149            let gpt_base = match gpt_configs.top_p {
3150                Some(tp) => gpt_base.top_p(tp),
3151                _ => gpt_base,
3152            };
3153
3154            let core_bpe = match tiktoken_rs::get_bpe_from_model(&gpt_configs.model) {
3155                Ok(bpe) => Some(bpe),
3156                _ => None,
3157            };
3158
3159            let (tokens, prompt_tokens) = match core_bpe {
3160                Some(ref core_bpe) => (
3161                    core_bpe.encode_with_special_tokens(&resource),
3162                    core_bpe.encode_with_special_tokens(&prompt),
3163                ),
3164                _ => (
3165                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&resource),
3166                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
3167                ),
3168            };
3169
3170            // // we can use the output count later to perform concurrent actions.
3171            let output_tokens_count = tokens.len() + prompt_tokens.len();
3172
3173            let mut max_tokens = crate::features::openai::calculate_max_tokens(
3174                &gpt_configs.model,
3175                gpt_configs.max_tokens,
3176                &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
3177                &resource,
3178                &prompt,
3179            );
3180
3181            // we need to slim down the content to fit the window.
3182            let resource = if output_tokens_count > max_tokens {
3183                let r = clean_html(&resource);
3184
3185                max_tokens = crate::features::openai::calculate_max_tokens(
3186                    &gpt_configs.model,
3187                    gpt_configs.max_tokens,
3188                    &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
3189                    &r,
3190                    &prompt,
3191                );
3192
3193                let (tokens, prompt_tokens) = match core_bpe {
3194                    Some(ref core_bpe) => (
3195                        core_bpe.encode_with_special_tokens(&r),
3196                        core_bpe.encode_with_special_tokens(&prompt),
3197                    ),
3198                    _ => (
3199                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
3200                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
3201                    ),
3202                };
3203
3204                let output_tokens_count = tokens.len() + prompt_tokens.len();
3205
3206                if output_tokens_count > max_tokens {
3207                    let r = clean_html_slim(&r);
3208
3209                    max_tokens = crate::features::openai::calculate_max_tokens(
3210                        &gpt_configs.model,
3211                        gpt_configs.max_tokens,
3212                        &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
3213                        &r,
3214                        &prompt,
3215                    );
3216
3217                    let (tokens, prompt_tokens) = match core_bpe {
3218                        Some(ref core_bpe) => (
3219                            core_bpe.encode_with_special_tokens(&r),
3220                            core_bpe.encode_with_special_tokens(&prompt),
3221                        ),
3222                        _ => (
3223                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
3224                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
3225                        ),
3226                    };
3227
3228                    let output_tokens_count = tokens.len() + prompt_tokens.len();
3229
3230                    if output_tokens_count > max_tokens {
3231                        clean_html_full(&r)
3232                    } else {
3233                        r
3234                    }
3235                } else {
3236                    r
3237                }
3238            } else {
3239                clean_html(&resource)
3240            };
3241
3242            let mut tokens_used = crate::features::openai_common::OpenAIUsage::default();
3243            let json_mode = gpt_configs.extra_ai_data;
3244
3245            let response_format = {
3246                let mut mode = if json_mode {
3247                    async_openai::types::ResponseFormat::JsonObject
3248                } else {
3249                    async_openai::types::ResponseFormat::Text
3250                };
3251
3252                if let Some(ref structure) = gpt_configs.json_schema {
3253                    if let Some(ref schema) = structure.schema {
3254                        if let Ok(mut schema) =
3255                            crate::features::serde_json::from_str::<serde_json::Value>(&schema)
3256                        {
3257                            if json_mode {
3258                                // Insert the "js" property into the schema's properties. Todo: capture if the js property exist and re-word prompt to match new js property with after removal.
3259                                if let Some(properties) = schema.get_mut("properties") {
3260                                    if let Some(properties_map) = properties.as_object_mut() {
3261                                        properties_map.insert(
3262                                            "js".to_string(),
3263                                            serde_json::json!({
3264                                                "type": "string"
3265                                            }),
3266                                        );
3267                                    }
3268                                }
3269                            }
3270
3271                            mode = async_openai::types::ResponseFormat::JsonSchema {
3272                                json_schema: async_openai::types::ResponseFormatJsonSchema {
3273                                    description: structure.description.clone(),
3274                                    name: structure.name.clone(),
3275                                    schema: if schema.is_null() { None } else { Some(schema) },
3276                                    strict: structure.strict,
3277                                },
3278                            }
3279                        }
3280                    }
3281                }
3282
3283                mode
3284            };
3285
3286            match async_openai::types::ChatCompletionRequestAssistantMessageArgs::default()
3287                .content(string_concat!("URL: ", url, "\n", "HTML: ", resource))
3288                .build()
3289            {
3290                Ok(resource_completion) => {
3291                    let mut messages: Vec<async_openai::types::ChatCompletionRequestMessage> =
3292                        vec![crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT.clone()];
3293
3294                    if json_mode {
3295                        messages.push(
3296                            crate::features::openai::BROWSER_ACTIONS_SYSTEM_EXTRA_PROMPT.clone(),
3297                        );
3298                    }
3299
3300                    messages.push(resource_completion.into());
3301
3302                    if !prompt.is_empty() {
3303                        messages.push(
3304                            match async_openai::types::ChatCompletionRequestUserMessageArgs::default()
3305                            .content(prompt)
3306                            .build()
3307                        {
3308                            Ok(o) => o,
3309                            _ => Default::default(),
3310                        }
3311                        .into()
3312                        )
3313                    }
3314
3315                    let v = match gpt_base
3316                        .max_tokens(max_tokens as u32)
3317                        .messages(messages)
3318                        .response_format(response_format)
3319                        .build()
3320                    {
3321                        Ok(request) => {
3322                            let res = match gpt_configs.api_key {
3323                                Some(ref key) => {
3324                                    if !key.is_empty() {
3325                                        let conf = CLIENT.config().to_owned();
3326                                        async_openai::Client::with_config(conf.with_api_key(key))
3327                                            .chat()
3328                                            .create(request)
3329                                            .await
3330                                    } else {
3331                                        CLIENT.chat().create(request).await
3332                                    }
3333                                }
3334                                _ => CLIENT.chat().create(request).await,
3335                            };
3336
3337                            match res {
3338                                Ok(mut response) => {
3339                                    let mut choice = response.choices.first_mut();
3340
3341                                    if let Some(usage) = response.usage.take() {
3342                                        tokens_used.prompt_tokens = usage.prompt_tokens;
3343                                        tokens_used.completion_tokens = usage.completion_tokens;
3344                                        tokens_used.total_tokens = usage.total_tokens;
3345                                    }
3346
3347                                    match choice.as_mut() {
3348                                        Some(c) => match c.message.content.take() {
3349                                            Some(content) => content,
3350                                            _ => Default::default(),
3351                                        },
3352                                        _ => Default::default(),
3353                                    }
3354                                }
3355                                Err(err) => {
3356                                    log::error!("{:?}", err);
3357                                    Default::default()
3358                                }
3359                            }
3360                        }
3361                        _ => Default::default(),
3362                    };
3363
3364                    drop(permit);
3365
3366                    crate::features::openai_common::OpenAIReturn {
3367                        response: v,
3368                        usage: tokens_used,
3369                        error: None,
3370                    }
3371                }
3372                Err(e) => {
3373                    let mut d = crate::features::openai_common::OpenAIReturn::default();
3374
3375                    d.error = Some(e.to_string());
3376
3377                    d
3378                }
3379            }
3380        }
3381        Err(e) => {
3382            let mut d = crate::features::openai_common::OpenAIReturn::default();
3383
3384            d.error = Some(e.to_string());
3385
3386            d
3387        }
3388    }
3389}
3390
/// Perform a request to OpenAI Chat for the given page `resource`, `url` and `prompt`.
///
/// This variant is compiled when the `openai` feature is enabled and `cache_openai` is not,
/// so every call is forwarded straight to `openai_request_base`.
#[cfg(all(feature = "openai", not(feature = "cache_openai")))]
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    let request = openai_request_base(gpt_configs, resource, url, prompt);

    request.await
}
3401
/// Perform a request to OpenAI Chat, consulting the configured response cache first.
///
/// This variant is compiled when both the `openai` and `cache_openai` features are enabled.
/// When `gpt_configs.cache` is unset the request is sent without any caching. Otherwise the
/// cache key is a hash of the url, prompt, model, max tokens, `extra_ai_data` flag and the
/// page resource; a hit is returned with `usage.cached` set to `true`, and a miss is
/// requested, stored and returned.
#[cfg(all(feature = "openai", feature = "cache_openai"))]
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    use std::hash::{Hash, Hasher};

    let Some(cache) = &gpt_configs.cache else {
        return openai_request_base(gpt_configs, resource, url, prompt).await;
    };

    let key = {
        let mut hasher = ahash::AHasher::default();

        url.hash(&mut hasher);
        prompt.hash(&mut hasher);
        gpt_configs.model.hash(&mut hasher);
        gpt_configs.max_tokens.hash(&mut hasher);
        gpt_configs.extra_ai_data.hash(&mut hasher);
        // The page content is part of the key, so a hit requires byte-identical HTML.
        resource.hash(&mut hasher);

        hasher.finish()
    };

    if let Some(mut hit) = cache.get(&key).await {
        hit.usage.cached = true;
        return hit;
    }

    let fresh = openai_request_base(gpt_configs, resource, url, prompt).await;
    let _ = cache.insert(key, fresh.clone()).await;

    fresh
}
3441
/// Return the HTML unchanged as an owned `String`.
///
/// No markup is stripped here: the input is copied as-is.
pub fn clean_html_raw(html: &str) -> String {
    html.to_owned()
}
3446
/// Clean the HTML by removing scripts, styles, links, iframes, hidden / ad / tracking
/// elements, comments, and every `<meta>` whose `name` is not `title` or `description`.
///
/// If the rewrite fails the input is returned unchanged.
#[cfg(feature = "openai")]
pub fn clean_html_base(html: &str) -> String {
    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};

    let rewritten = rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![
                // Everything matched by this selector list is dropped unconditionally.
                // NOTE(review): `[id*='ad']` / `[class*='ad']` are substring matches, so
                // values such as "header" or "loading" match as well - confirm intended.
                element!(
                    "script, style, link, iframe, [style*='display:none'], \
                     [id*='ad'], [class*='ad'], [id*='tracking'], [class*='tracking']",
                    |el| {
                        el.remove();
                        Ok(())
                    }
                ),
                // Only `<meta name="title">` and `<meta name="description">` survive.
                element!("meta", |el| {
                    let keep = matches!(
                        el.get_attribute("name").as_deref(),
                        Some("title" | "description")
                    );

                    if !keep {
                        el.remove();
                    }

                    Ok(())
                }),
            ],
            document_content_handlers: vec![doc_comments!(|comment| {
                comment.remove();
                Ok(())
            })],
            ..RewriteStrSettings::default()
        },
    );

    rewritten.unwrap_or_else(|_| html.into())
}
3514
/// Make sure a usable `<base>` tag exists on the page.
///
/// * Empty input yields an empty string.
/// * If a `<base>` whose `href` starts with `http://` or `https://` is found before any
///   insertion happened, the original `html` is returned unchanged.
/// * Otherwise `<base>` tags seen before an insertion that lack such an `href` are removed
///   and a `<base href="{base_url}">` is inserted exactly once: before `</head>` when a head
///   end tag is handled, else wrapped in `<head>…</head>` before `</html>`.
///
/// When `base_url` is `None` the inserted `href` is empty. The rewritten bytes are converted
/// to a `String` with `auto_encoder::auto_encode_bytes`.
#[cfg(feature = "chrome")]
pub async fn rewrite_base_tag(html: &str, base_url: &Option<&str>) -> String {
    use lol_html::{element, html_content::ContentType};
    use std::sync::atomic::{AtomicBool, Ordering};

    if html.is_empty() {
        return Default::default();
    }

    // The end-tag handlers must be `'static`, so they cannot borrow a local flag. The flag
    // is therefore reference counted: every handler observes the SAME state. (Cloning a
    // `OnceLock` instead would hand each handler an independent snapshot, which let both
    // the `</head>` and the `</html>` handlers insert a `<base>` tag.)
    let base_tag_inserted = Arc::new(AtomicBool::new(false));
    let already_present = AtomicBool::new(false);

    let base_url_len = base_url.map(|s| s.len());

    let rewriter_settings: lol_html::Settings<'_, '_, lol_html::send::SendHandlerTypes> =
        lol_html::send::Settings {
            element_content_handlers: vec![
                // Handler for <base> to mark if it is present with href
                element!("base", {
                    |el| {
                        // Only inspect <base> tags while no base has been accepted/inserted.
                        if !base_tag_inserted.load(Ordering::SeqCst) {
                            // Check if a <base> with an absolute http(s) href already exists
                            let valid_http = el.get_attribute("href").is_some_and(|attr| {
                                attr.starts_with("http://") || attr.starts_with("https://")
                            });

                            // we can validate if the domain is the same if not to remove it.
                            if valid_http {
                                base_tag_inserted.store(true, Ordering::SeqCst);
                                already_present.store(true, Ordering::SeqCst);
                            } else {
                                el.remove();
                            }
                        }

                        Ok(())
                    }
                }),
                // Handler for <head> to insert <base> tag if not present
                element!("head", {
                    |el: &mut lol_html::send::Element| {
                        if let Some(handlers) = el.end_tag_handlers() {
                            let inserted = Arc::clone(&base_tag_inserted);
                            let base_markup =
                                format!(r#"<base href="{}">"#, base_url.unwrap_or_default());

                            handlers.push(Box::new(move |end| {
                                // `swap` returns the previous value: only the first
                                // handler to get here performs the insertion.
                                if !inserted.swap(true, Ordering::SeqCst) {
                                    end.before(&base_markup, ContentType::Html);
                                }
                                Ok(())
                            }))
                        }
                        Ok(())
                    }
                }),
                // Handler for html if <head> not present to insert <head><base></head> tag if not present
                element!("html", {
                    |el: &mut lol_html::send::Element| {
                        if let Some(handlers) = el.end_tag_handlers() {
                            let inserted = Arc::clone(&base_tag_inserted);
                            let base_markup = format!(
                                r#"<head><base href="{}"></head>"#,
                                base_url.unwrap_or_default()
                            );

                            handlers.push(Box::new(move |end| {
                                if !inserted.swap(true, Ordering::SeqCst) {
                                    end.before(&base_markup, ContentType::Html);
                                }
                                Ok(())
                            }))
                        }
                        Ok(())
                    }
                }),
            ],
            ..lol_html::send::Settings::new_for_handler_types()
        };

    // Reserve room for the input plus the inserted markup around the base url.
    let mut buffer = Vec::with_capacity(
        html.len()
            + match base_url_len {
                Some(l) => l + 29,
                _ => 0,
            },
    );

    let mut rewriter = lol_html::send::HtmlRewriter::new(rewriter_settings, |c: &[u8]| {
        buffer.extend_from_slice(c);
    });

    let mut stream = tokio_stream::iter(html.as_bytes().chunks(*STREAMING_CHUNK_SIZE));

    let mut wrote_error = false;

    while let Some(chunk) = stream.next().await {
        // early exit: a valid <base> exists, the original html is returned untouched.
        if already_present.load(Ordering::SeqCst) {
            break;
        }
        if rewriter.write(chunk).is_err() {
            wrote_error = true;
            break;
        }
    }

    if !wrote_error {
        let _ = rewriter.end();
    }

    if already_present.load(Ordering::SeqCst) {
        html.to_string()
    } else {
        auto_encoder::auto_encode_bytes(&buffer)
    }
}
3639
/// Clean the HTML to slim fit GPT models.
///
/// Removes scripts, styles, svg, noscript, link, iframe, canvas and video elements, hidden /
/// ad / tracking elements, comments, `<img>` / `<picture>` elements whose `src` starts with
/// `data:image`, and every `<meta>` whose `name` is not `title` or `description`.
/// If the rewrite fails the input is returned unchanged.
#[cfg(feature = "openai")]
pub fn clean_html_slim(html: &str) -> String {
    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};

    let rewritten = rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![
                // Everything matched by this selector list is dropped unconditionally.
                // NOTE(review): `[id*='ad']` / `[class*='ad']` are substring matches, so
                // values such as "header" or "loading" match as well - confirm intended.
                element!(
                    "script, style, svg, noscript, link, iframe, canvas, video, \
                     [style*='display:none'], [id*='ad'], [class*='ad'], \
                     [id*='tracking'], [class*='tracking']",
                    |el| {
                        el.remove();
                        Ok(())
                    }
                ),
                // Inline (base64) images only bloat the prompt.
                element!("img, picture", |el| {
                    let inline_image = el
                        .get_attribute("src")
                        .is_some_and(|src| src.starts_with("data:image"));

                    if inline_image {
                        el.remove();
                    }

                    Ok(())
                }),
                // Only `<meta name="title">` and `<meta name="description">` survive.
                element!("meta", |el| {
                    let keep = matches!(
                        el.get_attribute("name").as_deref(),
                        Some("title" | "description")
                    );

                    if !keep {
                        el.remove();
                    }

                    Ok(())
                }),
            ],
            document_content_handlers: vec![doc_comments!(|comment| {
                comment.remove();
                Ok(())
            })],
            ..RewriteStrSettings::default()
        },
    );

    rewritten.unwrap_or_else(|_| html.into())
}
3738
/// Strip most of the extra markup from the HTML so it fits the model context.
///
/// * `<nav>` and `<footer>` elements and all comments are removed.
/// * `<meta>` elements are removed unless their `name` (compared lowercased) is `viewport`
///   or `charset`.
/// * On every remaining element all attributes are removed except `id`, `class` and any
///   `data-*` attribute.
///
/// If the rewrite fails the input is returned unchanged.
#[cfg(feature = "openai")]
pub fn clean_html_full(html: &str) -> String {
    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};

    match rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![
                element!("nav, footer", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("meta", |el| {
                    let name = el.get_attribute("name").map(|n| n.to_lowercase());

                    if !matches!(name.as_deref(), Some("viewport") | Some("charset")) {
                        el.remove();
                    }

                    Ok(())
                }),
                // NOTE(review): this handler also matches the `<meta>` elements kept above
                // and strips their `name` / `content` attributes - confirm intended.
                element!("*", |el| {
                    // `data-` is a prefix, not an attribute name: an exact-name comparison
                    // would never keep `data-*` attributes, so test it with `starts_with`.
                    let remove_list: Vec<String> = el
                        .attributes()
                        .iter()
                        .map(|attr| attr.name())
                        .filter(|name| {
                            !(matches!(name.as_str(), "id" | "class")
                                || name.starts_with("data-"))
                        })
                        .collect();

                    for attr in remove_list {
                        el.remove_attribute(&attr);
                    }

                    Ok(())
                }),
            ],
            document_content_handlers: vec![doc_comments!(|c| {
                c.remove();
                Ok(())
            })],
            ..RewriteStrSettings::default()
        },
    ) {
        Ok(r) => r,
        _ => html.into(),
    }
}
3790
3791/// Clean the html removing css and js
3792#[cfg(not(feature = "openai"))]
3793pub fn clean_html(html: &str) -> String {
3794    clean_html_raw(html)
3795}
3796
/// Clean the html removing css and js.
///
/// With the `openai` feature enabled and `openai_slim_fit` disabled this delegates to
/// [`clean_html_base`].
#[cfg(all(feature = "openai", not(feature = "openai_slim_fit")))]
pub fn clean_html(html: &str) -> String {
    let cleaned = clean_html_base(html);

    cleaned
}
3802
/// Clean the html removing css and js.
///
/// With both the `openai` and `openai_slim_fit` features enabled this delegates to
/// [`clean_html_slim`].
#[cfg(all(feature = "openai", feature = "openai_slim_fit"))]
pub fn clean_html(html: &str) -> String {
    let cleaned = clean_html_slim(html);

    cleaned
}
3808
/// Slim-clean stand-in used when the `openai` feature is disabled: the HTML is returned
/// unchanged as an owned `String`.
#[cfg(not(feature = "openai"))]
pub fn clean_html_slim(html: &str) -> String {
    html.to_owned()
}
3814
3815/// Log to console if configuration verbose.
3816pub fn log(message: &'static str, data: impl AsRef<str>) {
3817    if log_enabled!(Level::Info) {
3818        info!("{message} - {}", data.as_ref());
3819    }
3820}
3821
/// Control action that can be sent for a crawl target.
#[cfg(feature = "control")]
#[derive(Debug, PartialEq)]
pub enum Handler {
    /// Start state of a crawl.
    Start,
    /// Pause the crawl.
    Pause,
    /// Resume the crawl.
    Resume,
    /// Shut the crawl down.
    Shutdown,
}
3835
#[cfg(feature = "control")]
lazy_static! {
    /// Control handle for crawls: a watch channel of `(target, Handler)` pairs behind an
    /// async `RwLock`, initialised with `("handles", Handler::Start)`.
    pub static ref CONTROLLER: std::sync::Arc<
        tokio::sync::RwLock<(
            tokio::sync::watch::Sender<(String, Handler)>,
            tokio::sync::watch::Receiver<(String, Handler)>,
        )>,
    > = {
        let channel = tokio::sync::watch::channel((String::from("handles"), Handler::Start));

        std::sync::Arc::new(tokio::sync::RwLock::new(channel))
    };
}
3843
/// Pause a target website running crawl. The crawl_id is prepended directly to the domain and required if set. ex: d22323edsd-https://mydomain.com
///
/// Sends `(target, Handler::Pause)` through [`CONTROLLER`]; the send result is ignored.
#[cfg(feature = "control")]
pub async fn pause(target: &str) {
    let controller = CONTROLLER.write().await;

    let _ = controller.0.send((target.into(), Handler::Pause));
}
3856
/// Resume a target website crawl. The crawl_id is prepended directly to the domain and required if set. ex: d22323edsd-https://mydomain.com
///
/// Sends `(target, Handler::Resume)` through [`CONTROLLER`]; the send result is ignored.
#[cfg(feature = "control")]
pub async fn resume(target: &str) {
    let controller = CONTROLLER.write().await;

    let _ = controller.0.send((target.into(), Handler::Resume));
}
3869
/// Shutdown a target website crawl. The crawl_id is prepended directly to the domain and required if set. ex: d22323edsd-https://mydomain.com
///
/// Sends `(target, Handler::Shutdown)` through [`CONTROLLER`]; the send result is ignored.
#[cfg(feature = "control")]
pub async fn shutdown(target: &str) {
    let controller = CONTROLLER.write().await;

    let _ = controller.0.send((target.into(), Handler::Shutdown));
}
3882
/// Reset a target website crawl. The crawl_id is prepended directly to the domain and required if set. ex: d22323edsd-https://mydomain.com
///
/// Sends `(target, Handler::Start)` through [`CONTROLLER`]; the send result is ignored.
#[cfg(feature = "control")]
pub async fn reset(target: &str) {
    let controller = CONTROLLER.write().await;

    let _ = controller.0.send((target.into(), Handler::Start));
}
3895
3896/// Setup selectors for handling link targets.
3897pub(crate) fn setup_website_selectors(url: &str, allowed: AllowedDomainTypes) -> RelativeSelectors {
3898    let subdomains = allowed.subdomains;
3899    let tld = allowed.tld;
3900
3901    crate::page::get_page_selectors_base(url, subdomains, tld)
3902}
3903
/// Flags describing which related domains are allowed: subdomains and/or tlds.
#[derive(Clone, Copy, Debug, Default)]
pub struct AllowedDomainTypes {
    /// Whether subdomains are allowed.
    pub subdomains: bool,
    /// Whether tlds are allowed.
    pub tld: bool,
}

impl AllowedDomainTypes {
    /// Create the flags from the `subdomains` and `tld` allowances.
    pub fn new(subdomains: bool, tld: bool) -> Self {
        AllowedDomainTypes { subdomains, tld }
    }
}
3919
3920/// Modify the selectors for targetting a website.
3921pub(crate) fn modify_selectors(
3922    prior_domain: &Option<Box<Url>>,
3923    domain: &str,
3924    domain_parsed: &mut Option<Box<Url>>,
3925    url: &mut Box<CaseInsensitiveString>,
3926    base: &mut RelativeSelectors,
3927    allowed: AllowedDomainTypes,
3928) {
3929    *domain_parsed = parse_absolute_url(domain);
3930    *url = Box::new(domain.into());
3931    let s = setup_website_selectors(url.inner(), allowed);
3932    base.0 = s.0;
3933    base.1 = s.1;
3934    if let Some(prior_domain) = prior_domain {
3935        if let Some(dname) = prior_domain.host_str() {
3936            base.2 = dname.into();
3937        }
3938    }
3939}
3940
/// Get the last path segment: everything after the final `/`.
///
/// Returns the whole input when it contains no `/`, and an empty string when the input ends
/// with `/`.
pub fn get_last_segment(path: &str) -> &str {
    path.rsplit_once('/').map_or(path, |(_, segment)| segment)
}
3954
/// Get the path portion of a url.
///
/// The path starts at the first `/` found after the first `//`. When the url has no `//`,
/// or nothing but the authority follows it, `"/"` is returned.
pub(crate) fn get_path_from_url(url: &str) -> &str {
    url.split_once("//")
        .and_then(|(_, after_scheme)| {
            after_scheme
                .find('/')
                .map(|path_start| &after_scheme[path_start..])
        })
        .unwrap_or("/")
}
3970
/// Get the domain portion of a url.
///
/// Anything up to and including the first `//` is skipped when present; the result is what
/// follows, cut at the first `/` (or the whole remainder when there is none).
pub(crate) fn get_domain_from_url(url: &str) -> &str {
    let after_scheme = match url.split_once("//") {
        Some((_, rest)) => rest,
        None => url,
    };

    after_scheme
        .find('/')
        .map_or(after_scheme, |domain_end| &after_scheme[..domain_end])
}
3989
/// Determine if networking is capable for a URL, i.e. whether it starts with one of the
/// `https://`, `http://`, `file://` or `ftp://` prefixes (case-sensitive).
pub fn networking_capable(url: &str) -> bool {
    const PREFIXES: [&str; 4] = ["https://", "http://", "file://", "ftp://"];

    PREFIXES.iter().any(|&prefix| url.starts_with(prefix))
}
3997
/// Prepare the url for parsing if it fails. Use this method if the url does not start with http or https.
///
/// Everything up to and including the first `://` is replaced with `https://`; a url
/// without `://` simply gets `https://` prepended.
pub fn prepare_url(u: &str) -> String {
    // `split_once` cuts at the byte offset of the separator. Treating that offset as a
    // character count would skip too far whenever a multi-byte character precedes the
    // `://`, dropping the first characters of the host.
    match u.split_once("://") {
        Some((_, rest)) => format!("https://{rest}"),
        None => format!("https://{u}"),
    }
}
4012
4013/// normalize the html markup to prevent Maliciousness.
4014pub(crate) async fn normalize_html(html: &[u8]) -> Vec<u8> {
4015    use lol_html::{element, send::Settings};
4016
4017    let mut output = Vec::new();
4018
4019    let mut rewriter = HtmlRewriter::new(
4020        Settings {
4021            element_content_handlers: vec![
4022                element!("a[href]", |el| {
4023                    el.remove_attribute("href");
4024                    Ok(())
4025                }),
4026                element!("script, style, iframe, base, noscript", |el| {
4027                    el.remove();
4028                    Ok(())
4029                }),
4030                element!("*", |el| {
4031                    let mut remove_attr = vec![];
4032
4033                    for attr in el.attributes() {
4034                        let name = attr.name();
4035                        let remove =
4036                            !(name.starts_with("data-") || name == "id" || name == "class");
4037                        if remove {
4038                            remove_attr.push(name);
4039                        }
4040                    }
4041
4042                    for name in remove_attr {
4043                        el.remove_attribute(&name);
4044                    }
4045
4046                    Ok(())
4047                }),
4048            ],
4049            ..Settings::new_send()
4050        },
4051        |c: &[u8]| output.extend_from_slice(c),
4052    );
4053
4054    let chunks = html.chunks(*STREAMING_CHUNK_SIZE);
4055    let mut stream = tokio_stream::iter(chunks);
4056    let mut wrote_error = false;
4057
4058    while let Some(chunk) = stream.next().await {
4059        if rewriter.write(chunk).is_err() {
4060            wrote_error = true;
4061            break;
4062        }
4063    }
4064
4065    if !wrote_error {
4066        let _ = rewriter.end();
4067    }
4068
4069    output
4070}
4071
4072/// Hash html markup.
4073pub(crate) async fn hash_html(html: &[u8]) -> u64 {
4074    let normalized_html = normalize_html(html).await;
4075
4076    if !normalized_html.is_empty() {
4077        use std::hash::{Hash, Hasher};
4078        let mut s = ahash::AHasher::default();
4079        normalized_html.hash(&mut s);
4080        let key = s.finish();
4081        key
4082    } else {
4083        Default::default()
4084    }
4085}
4086
/// Spawns a new asynchronous task registered under `task_name`.
///
/// # Panics
///
/// Panics with `failed to spawn task` when the task builder reports an error.
#[cfg(feature = "tracing")]
pub(crate) fn spawn_task<F>(task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
where
    F: std::future::Future + Send + 'static,
    F::Output: Send + 'static,
{
    let builder = tokio::task::Builder::new().name(task_name);

    builder.spawn(future).expect("failed to spawn task")
}
4099
4100#[cfg(not(feature = "tracing"))]
4101#[allow(unused)]
4102/// Spawns a new asynchronous task.
4103pub(crate) fn spawn_task<F>(_task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
4104where
4105    F: std::future::Future + Send + 'static,
4106    F::Output: Send + 'static,
4107{
4108    tokio::task::spawn(future)
4109}
4110
#[cfg(feature = "tracing")]
/// Spawn a future onto a join set under the name `task_name`.
///
/// Returns the abort handle of the newly added task.
///
/// # Panics
///
/// Panics with "set should spawn" if the join set's task builder reports an error.
pub(crate) fn spawn_set<F, T>(
    task_name: &str,
    set: &mut tokio::task::JoinSet<T>,
    future: F,
) -> tokio::task::AbortHandle
where
    F: Future<Output = T> + Send + 'static,
    T: Send + 'static,
{
    // Attach the name to the set's builder before handing over the future.
    let named_builder = set.build_task().name(task_name);

    named_builder.spawn(future).expect("set should spawn")
}
4128
4129#[cfg(not(feature = "tracing"))]
4130/// Spawn a joinset.
4131pub(crate) fn spawn_set<F, T>(
4132    _task_name: &str,
4133    set: &mut tokio::task::JoinSet<T>,
4134    future: F,
4135) -> tokio::task::AbortHandle
4136where
4137    F: Future<Output = T>,
4138    F: Send + 'static,
4139    T: Send + 'static,
4140{
4141    set.spawn(future)
4142}
4143
#[cfg(feature = "balance")]
/// How long to wait (100 ms) to let the CPU rebalance, on the premise that IO is the main impact.
const REBALANCE_TIME: Duration = Duration::from_millis(100);
4147
/// Return the semaphore that should be used.
///
/// When `detect` is `true` the global CPU state is queried; otherwise the
/// state is treated as `0`. A state below `1` yields the caller's `semaphore`.
/// Any state of `1` or more yields the shared `SEM_SHARED` semaphore, and a
/// state of exactly `2` first sleeps for `REBALANCE_TIME`.
#[cfg(feature = "balance")]
pub async fn get_semaphore(semaphore: &Arc<Semaphore>, detect: bool) -> &Arc<Semaphore> {
    let cpu_load = match detect {
        true => crate::utils::detect_system::get_global_cpu_state().await,
        false => 0,
    };

    // No reported load: keep using the semaphore the caller handed in.
    if cpu_load < 1 {
        return semaphore;
    }

    // Back off briefly before switching over to the shared semaphore.
    if cpu_load == 2 {
        tokio::time::sleep(REBALANCE_TIME).await;
    }

    &*crate::website::SEM_SHARED
}
4167
/// Check if the crawl duration is expired.
///
/// Returns `true` only when both a timeout and a start instant are present
/// and the time elapsed since `start` is at least `crawl_timeout`. If either
/// value is `None` the crawl is never considered expired.
pub fn crawl_duration_expired(crawl_timeout: &Option<Duration>, start: &Option<Instant>) -> bool {
    match (crawl_timeout, start) {
        (Some(limit), Some(began)) => began.elapsed() >= *limit,
        _ => false,
    }
}
4174
4175/// is the content html and safe for formatting.
4176static HTML_TAGS: phf::Set<&'static [u8]> = phf_set! {
4177    b"<!doctype html",
4178    b"<html",
4179    b"<document",
4180};
4181
4182/// Check if the content is HTML.
4183pub fn is_html_content_check(bytes: &[u8]) -> bool {
4184    let check_bytes = if bytes.len() > 1024 {
4185        &bytes[..1024]
4186    } else {
4187        bytes
4188    };
4189
4190    for tag in HTML_TAGS.iter() {
4191        if check_bytes
4192            .windows(tag.len())
4193            .any(|window| window.eq_ignore_ascii_case(tag))
4194        {
4195            return true;
4196        }
4197    }
4198
4199    false
4200}
4201
/// Return the semaphore that should be used.
///
/// This is the variant compiled without the `balance` feature: the caller's
/// `semaphore` is returned unchanged and `_detect` is ignored.
#[cfg(not(feature = "balance"))]
pub async fn get_semaphore(semaphore: &Arc<Semaphore>, _detect: bool) -> &Arc<Semaphore> {
    semaphore
}
4207
#[cfg(feature = "smart")]
#[derive(Debug)]
/// Html output sink for the rewriter.
///
/// Buffers the rewriter output in memory and hands the buffer over through a
/// one-shot channel.
pub(crate) struct HtmlOutputSink {
    /// The output bytes accumulated so far.
    pub(crate) data: Vec<u8>,
    /// The one-shot sender used to deliver `data` once finished.
    pub(crate) sender: Option<tokio::sync::oneshot::Sender<Vec<u8>>>,
}
4217
#[cfg(feature = "smart")]
impl HtmlOutputSink {
    /// Create a sink with an empty buffer that will deliver its bytes through `sender`.
    pub(crate) fn new(sender: tokio::sync::oneshot::Sender<Vec<u8>>) -> Self {
        let data = Vec::new();
        let sender = Some(sender);

        Self { data, sender }
    }
}
4228
#[cfg(feature = "smart")]
impl OutputSink for HtmlOutputSink {
    /// Collect an output chunk; an empty chunk flushes the buffer to the sender.
    ///
    /// Non-empty chunks are appended to `data`. On an empty chunk the buffered
    /// bytes are moved out and sent through the one-shot channel, at most once:
    /// the sender is taken, so later empty chunks do nothing.
    fn handle_chunk(&mut self, chunk: &[u8]) {
        if !chunk.is_empty() {
            self.data.extend_from_slice(chunk);
            return;
        }

        // Empty chunk: treated as the end-of-output marker.
        if let Some(tx) = self.sender.take() {
            // A dropped receiver is not an error for the sink; ignore the result.
            let _ = tx.send(std::mem::take(&mut self.data));
        }
    }
}
4241
/// Emit an info-level `fetch` event for `link` through `tracing`.
#[cfg(feature = "tracing")]
pub fn emit_log(link: &str) {
    tracing::info!("fetch {}", link);
}
4247/// Emit a log info event.
4248#[cfg(not(feature = "tracing"))]
4249pub fn emit_log(link: &str) {
4250    log::info!("fetch {}", &link);
4251}
4252
/// Emit an info-level `shutdown` event for `link` through `tracing`.
#[cfg(feature = "tracing")]
pub fn emit_log_shutdown(link: &str) {
    tracing::info!("shutdown {}", link);
}
4258/// Emit a log info event.
4259#[cfg(not(feature = "tracing"))]
4260pub fn emit_log_shutdown(link: &str) {
4261    log::info!("shutdown {}", &link);
4262}