1pub mod abs;
3pub mod connect;
5pub mod css_selectors;
7pub mod header_utils;
9pub mod interner;
11pub mod trie;
13
14#[cfg(feature = "balance")]
15pub mod detect_system;
17use crate::page::{AntiBotTech, Metadata};
18use crate::{page::STREAMING_CHUNK_SIZE, RelativeSelectors};
19use abs::parse_absolute_url;
20use aho_corasick::AhoCorasick;
21use auto_encoder::is_binary_file;
22use bytes::BufMut;
23use case_insensitive_string::CaseInsensitiveString;
24#[cfg(feature = "chrome")]
25use hashbrown::HashMap;
26use lol_html::{send::HtmlRewriter, OutputSink};
27use phf::phf_set;
28use std::str::FromStr;
29use std::sync::Arc;
30use std::{
31 future::Future,
32 time::{Duration, Instant},
33};
34use tokio::sync::Semaphore;
35use url::Url;
36
37#[cfg(feature = "chrome")]
38use crate::features::chrome_common::{AutomationScripts, ExecutionScripts};
39use crate::page::{MAX_PRE_ALLOCATED_HTML_PAGE_SIZE, MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE};
40use crate::tokio_stream::StreamExt;
41use crate::Client;
42
43#[cfg(feature = "cache_chrome_hybrid")]
44use http_cache_semantics::{RequestLike, ResponseLike};
45
46use log::{info, log_enabled, Level};
47
48#[cfg(not(feature = "rquest"))]
49use reqwest::{Response, StatusCode};
50#[cfg(feature = "rquest")]
51use rquest::{Response, StatusCode};
52
/// Error type produced by HTTP requests: `reqwest::Error` when neither the
/// `cache_request` nor the `rquest` feature is enabled.
#[cfg(all(not(feature = "cache_request"), not(feature = "rquest")))]
pub(crate) type RequestError = reqwest::Error;

/// Error type produced by HTTP requests: `rquest::Error` when `rquest` is
/// enabled without `cache_request`.
#[cfg(all(not(feature = "cache_request"), feature = "rquest"))]
pub(crate) type RequestError = rquest::Error;

/// Error type produced by HTTP requests: `reqwest_middleware::Error` whenever
/// the `cache_request` feature is enabled.
#[cfg(feature = "cache_request")]
pub(crate) type RequestError = reqwest_middleware::Error;

/// Alias for the `Response` type of the active HTTP client (`reqwest` or `rquest`).
pub(crate) type RequestResponse = Response;
67
/// Sleep intervals, in milliseconds, cycled through by the polling loops of
/// `wait_for_event` and `wait_for_selector`.
#[cfg(feature = "chrome")]
const WAIT_TIMEOUTS: [u64; 6] = [0, 20, 50, 100, 100, 500];
/// Sleep intervals, in milliseconds, used by `wait_for_dom` between DOM checks.
#[cfg(feature = "chrome")]
const DOM_WAIT_TIMEOUTS: [u64; 6] = [100, 200, 300, 300, 400, 500];
74
/// Compile-time set of MIME content types covering documents, archives,
/// images, video, audio and other binary payloads.
///
/// NOTE(review): the name suggests responses with these types are skipped;
/// the consumers are outside this section, so confirm against the callers.
pub static IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf_set! {
    "application/pdf",
    "application/zip",
    "application/x-rar-compressed",
    "application/x-tar",
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/bmp",
    "image/svg+xml",
    "video/mp4",
    "video/x-msvideo",
    "video/x-matroska",
    "video/webm",
    "audio/mpeg",
    "audio/ogg",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/x-7z-compressed",
    "application/x-rpm",
    "application/x-shockwave-flash",
};
101
lazy_static! {
    /// Aho-Corasick matcher over response-body markers.
    ///
    /// The pattern ORDER is significant: `detect_anti_bot_from_body` maps the
    /// matched pattern index to an `AntiBotTech` variant, so entries must not
    /// be reordered without updating that mapping.
    static ref AC_BODY_SCAN: AhoCorasick = AhoCorasick::new([
        "cf-error-code",
        "Access to this page has been denied",
        "DataDome",
        "perimeterx",
        "funcaptcha",
        "Request unsuccessful. Incapsula incident ID",
    ]).unwrap();

    /// Aho-Corasick matcher (leftmost-first) over URL fragments.
    ///
    /// The pattern ORDER is significant: `detect_antibot_from_url` maps the
    /// matched pattern index to an `AntiBotTech` variant.
    static ref AC_URL_SCAN: AhoCorasick = AhoCorasick::builder()
        .match_kind(aho_corasick::MatchKind::LeftmostFirst)
        .build([
            "/cdn-cgi/challenge-platform",
            "datadome.co",
            "dd-api.io",
            "perimeterx.net",
            "px-captcha",
            "arkoselabs.com",
            "funcaptcha",
            "kasada.io",
            "fingerprint.com",
            "fpjs.io",
            "incapsula",
            "imperva",
            "radwarebotmanager",
            "reblaze.com",
            "cheq.ai",
        ])
        .unwrap();
}
134
#[cfg(feature = "fs")]
lazy_static! {
    /// Temporary path prefix computed once on first use.
    ///
    /// Creates `<system temp dir>/spider/` and returns that directory's
    /// display string with the current UNIX time in seconds appended. Falls
    /// back to the bare directory string if the clock is before the epoch, and
    /// to `"/tmp/"` if the directory cannot be created.
    static ref TMP_DIR: String = {
        use std::fs;
        let mut tmp = std::env::temp_dir();

        tmp.push("spider/");

        match fs::create_dir_all(&tmp) {
            Ok(_) => {
                let dir_name = tmp.display().to_string();

                // Suffix with the seconds since the epoch; note the suffix is
                // concatenated to the string, it is not a created sub-directory.
                match std::time::SystemTime::now().duration_since(std::time::SystemTime::UNIX_EPOCH) {
                    Ok(dur) => {
                        string_concat!(dir_name, dur.as_secs().to_string())
                    }
                    _ => dir_name,
                }
            }
            _ => "/tmp/".to_string()
        }
    };
}
159
#[cfg(feature = "chrome")]
lazy_static! {
    /// `true` only when the `MASK_BYTES_INTERCEPTION` environment variable is
    /// exactly `"true"`; unset or unreadable values yield `false`.
    pub(crate) static ref MASK_BYTES_INTERCEPTION: bool = {
        std::env::var("MASK_BYTES_INTERCEPTION").unwrap_or_default() == "true"
    };
    /// Wait configuration used by `cf_handle`: a 1000 ms delay plus an
    /// idle-network wait built from an 8 second duration.
    pub(crate) static ref CF_WAIT_FOR: crate::features::chrome_common::WaitFor = {
        let mut wait_for = crate::features::chrome_common::WaitFor::default();
        wait_for.delay = crate::features::chrome_common::WaitForDelay::new(Some(core::time::Duration::from_millis(1000))).into();
        wait_for.idle_network = crate::features::chrome_common::WaitForIdleNetwork::new(core::time::Duration::from_secs(8).into()).into();
        wait_for
    };
}
175
lazy_static! {
    /// Size limit in bytes read once from `SPIDER_MAX_SIZE_BYTES`.
    ///
    /// * variable unset or unreadable: `0`
    /// * variable set but not a valid `usize`: 1 GiB (`1_073_741_824`)
    /// * variable set to `0`: `0`
    /// * any other value: the value, raised to at least 1 MiB (`1_048_576`)
    ///
    /// NOTE(review): `0` presumably means "no limit" — confirm with the readers
    /// of this value, which are outside this section.
    pub(crate) static ref MAX_SIZE_BYTES: usize = {
        match std::env::var("SPIDER_MAX_SIZE_BYTES") {
            Ok(b) => {
                const DEFAULT_MAX_SIZE_BYTES: usize = 1_073_741_824;
                let b = b.parse::<usize>().unwrap_or(DEFAULT_MAX_SIZE_BYTES);

                if b == 0 {
                    0
                } else {
                    // Enforce a 1 MiB floor for any non-zero setting.
                    b.max(1_048_576)
                }
            },
            _ => 0
        }
    };
}
195
#[cfg(all(feature = "chrome", feature = "real_browser"))]
lazy_static! {
    /// Trailing bytes of a Cloudflare page ending with the "Cloudflare" link.
    static ref CF_END: &'static [u8; 62] =
        b"target=\"_blank\">Cloudflare</a></div></div></div></body></html>";
    /// Trailing bytes of a Cloudflare page ending with the "Performance &
    /// security" footer. The ampersand is written as the 5-byte HTML entity
    /// (ampersand, `amp`, semicolon) because the bytes are compared against
    /// serialized HTML; this is also what makes the literal 72 bytes long.
    static ref CF_END2: &'static [u8; 72] =
        b"Performance &amp; security by Cloudflare</div></div></div></body></html>";
    /// Leading bytes of the mock-frame page: a newline followed by FOUR spaces
    /// before `<style`, giving the declared 34 bytes.
    static ref CF_HEAD: &'static [u8; 34] = b"<html><head>\n    <style global=\"\">";
    /// Trailing bytes of the mock-frame page (hidden 1x1 iframe).
    static ref CF_MOCK_FRAME: &'static [u8; 137] = b"<iframe height=\"1\" width=\"1\" style=\"position: absolute; top: 0px; left: 0px; border: none; visibility: hidden;\"></iframe>\n\n</body></html>";
    /// Leading bytes of the "Just a moment..." interstitial.
    static ref CF_JUST_A_MOMENT: &'static [u8; 81] = b"<!DOCTYPE html><html lang=\"en-US\" dir=\"ltr\"><head><title>Just a moment...</title>";
}
206
lazy_static! {
    /// Exact 317-byte body of the Apache "403 Forbidden" page that also
    /// reports a failing ErrorDocument. Compared for full equality in
    /// `detect_hard_forbidden_content`.
    pub static ref APACHE_FORBIDDEN: &'static [u8; 317] = br#"<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
<p>Additionally, a 403 Forbidden
error was encountered while trying to use an ErrorDocument to handle the request.</p>
</body></html>"#;

    /// First 125 bytes of the OpenResty "403 Forbidden" page. Used as a prefix
    /// match in `detect_hard_forbidden_content`.
    pub static ref OPEN_RESTY_FORBIDDEN: &'static [u8; 125] = br#"<html><head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr><center>openresty</center>"#;
}
225
226pub fn detect_hard_forbidden_content(b: &[u8]) -> bool {
228 b == *APACHE_FORBIDDEN || b.starts_with(*OPEN_RESTY_FORBIDDEN)
229}
230
/// Returns `true` when the HTML bytes look like a Cloudflare challenge page.
///
/// A page matches when it ends with either Cloudflare footer marker, when it
/// both starts with the mock-frame head and ends with the mock iframe, or when
/// it starts with the "Just a moment..." document prefix.
#[cfg(all(feature = "chrome", feature = "real_browser"))]
pub(crate) fn detect_cf_turnstyle(b: &Vec<u8>) -> bool {
    // Footer markers.
    if b.ends_with(CF_END.as_ref()) || b.ends_with(CF_END2.as_ref()) {
        return true;
    }

    // Mock-frame page: head and tail must both match.
    if b.starts_with(CF_HEAD.as_ref()) && b.ends_with(CF_MOCK_FRAME.as_ref()) {
        return true;
    }

    b.starts_with(CF_JUST_A_MOMENT.as_ref())
}
244
/// Attempts to get past a Cloudflare challenge on `page`, replacing `b` with
/// the page's latest HTML.
///
/// The whole sequence is bounded by a 30 second timeout.
///
/// # Returns
/// * `Ok(true)` when the HTML fetched after the interaction no longer matches
///   `detect_cf_turnstyle`.
/// * `Ok(false)` when it still matches, or the HTML could not be read.
/// * `Err(CdpError::Timeout)` when the 30 second budget elapses.
#[cfg(all(feature = "chrome", feature = "real_browser"))]
async fn cf_handle(
    b: &mut Vec<u8>,
    page: &chromiumoxide::Page,
) -> Result<bool, chromiumoxide::error::CdpError> {
    let mut validated = false;

    let page_result = tokio::time::timeout(tokio::time::Duration::from_secs(30), async {
        let mut wait_for = CF_WAIT_FOR.clone();

        // Let the challenge page settle before interacting with it.
        page_wait(&page, &Some(wait_for.clone())).await;

        // Click every iframe on the page; the result is deliberately ignored.
        let _ = page
            .evaluate(r###"document.querySelectorAll("iframe").forEach(el=>el.click());"###)
            .await;

        wait_for.page_navigations = true;

        page_wait(&page, &Some(wait_for.clone())).await;

        if let Ok(next_content) = page.outer_html_bytes().await {
            let next_content = if !detect_cf_turnstyle(&next_content) {
                validated = true;
                // Challenge markers are gone: wait with a 4 second delay and
                // re-read the HTML, keeping the earlier read if that fails.
                wait_for.delay = crate::features::chrome_common::WaitForDelay::new(Some(
                    core::time::Duration::from_secs(4),
                ))
                .into();
                page_wait(&page, &Some(wait_for)).await;
                match page.outer_html_bytes().await {
                    Ok(nc) => nc,
                    _ => next_content,
                }
            } else {
                next_content
            };

            *b = next_content;
        }
    })
    .await;

    match page_result {
        Ok(_) => Ok(validated),
        _ => Err(chromiumoxide::error::CdpError::Timeout),
    }
}
293
/// No-op stand-in for `cf_handle` when `chrome` is enabled without
/// `real_browser`; always returns `Ok(())` and leaves both arguments untouched.
///
/// NOTE(review): the `real_browser` variant returns `Result<bool, _>` while
/// this one returns `Result<(), _>` — callers must not depend on the `Ok` value.
#[cfg(all(feature = "chrome", not(feature = "real_browser")))]
async fn cf_handle(
    _b: &mut Vec<u8>,
    _page: &chromiumoxide::Page,
) -> Result<(), chromiumoxide::error::CdpError> {
    Ok(())
}
302
/// Aggregated result of fetching a page, over plain HTTP or through Chrome.
///
/// Several fields only exist when the matching cargo feature is enabled.
#[derive(Debug, Default)]
pub struct PageResponse {
    /// The page body as raw bytes, when one was obtained.
    pub content: Option<Box<Vec<u8>>>,
    /// The response headers, when captured.
    pub headers: Option<reqwest::header::HeaderMap>,
    /// The remote socket address of the response.
    #[cfg(feature = "remote_addr")]
    pub remote_addr: Option<core::net::SocketAddr>,
    /// Cookies for the response, stored as a header map.
    #[cfg(feature = "cookies")]
    pub cookies: Option<reqwest::header::HeaderMap>,
    /// The HTTP status code of the response.
    pub status_code: StatusCode,
    /// The final URL of the page (presumably after redirects — confirm with the producers).
    pub final_url: Option<String>,
    /// The outcome of the client's status check: the response itself or the request error.
    pub error_for_status: Option<Result<Response, RequestError>>,
    /// Screenshot image bytes, when one was taken.
    #[cfg(feature = "chrome")]
    pub screenshot_bytes: Option<Vec<u8>>,
    /// OpenAI usage records, one per request; appended by `handle_openai_credits`.
    #[cfg(feature = "openai")]
    pub openai_credits_used: Option<Vec<crate::features::openai_common::OpenAIUsage>>,
    /// Extra AI results, one per prompt; appended by `handle_extra_ai_data`.
    #[cfg(feature = "openai")]
    pub extra_ai_data: Option<Vec<crate::page::AIResults>>,
    /// Whether a web-application-firewall check was flagged for this response.
    pub waf_check: bool,
    /// Number of bytes transferred for the page, when tracked.
    pub bytes_transferred: Option<f64>,
    /// A numeric signature for the page (how it is computed is outside this section).
    pub signature: Option<u64>,
    /// Per-key `f64` values recorded for responses (presumably URL -> bytes — confirm).
    #[cfg(feature = "chrome")]
    pub response_map: Option<HashMap<String, f64>>,
    /// Per-key `f64` values recorded for requests (presumably URL -> timing or bytes — confirm).
    #[cfg(feature = "chrome")]
    pub request_map: Option<HashMap<String, f64>>,
    /// The anti-bot technology detected for the page, if any.
    pub anti_bot_tech: crate::page::AntiBotTech,
    /// Page metadata, when extracted.
    pub metadata: Option<Box<Metadata>>,
}
348
/// Waits until the page emits one event of type `T`.
///
/// Subscribes to `T` on `page` and polls the listener, racing each poll
/// against a sleep taken from [`WAIT_TIMEOUTS`] (cycled). Returns as soon as an
/// event arrives. With `timeout` set, the wait is abandoned silently once that
/// duration elapses; without it the wait is unbounded. Returns immediately if
/// the listener cannot be created.
#[cfg(feature = "chrome")]
pub async fn wait_for_event<T>(page: &chromiumoxide::Page, timeout: Option<core::time::Duration>)
where
    T: chromiumoxide::cdp::IntoEventKind + Unpin + std::fmt::Debug,
{
    let mut events = match page.event_listener::<T>().await {
        Ok(listener) => listener,
        Err(_) => return,
    };

    let first_event = async {
        for &pause_ms in WAIT_TIMEOUTS.iter().cycle() {
            let pause = tokio::time::sleep(tokio::time::Duration::from_millis(pause_ms));

            tokio::select! {
                _ = pause => (),
                received = events.next() => {
                    if received.is_some() {
                        break;
                    }
                }
            }
        }
    };

    match timeout {
        Some(limit) => {
            // Elapsing is not an error here; the caller simply stops waiting.
            let _ = tokio::time::timeout(limit, first_event).await;
        }
        None => first_event.await,
    }
}
381
/// Waits for an element matching `selector` to be found on `page`.
///
/// Each lookup is raced against a sleep taken from [`WAIT_TIMEOUTS`] (cycled)
/// and retried until it succeeds. Returns `true` once the element is found,
/// and `false` when `timeout` is set and elapses first. Without a timeout the
/// wait is unbounded.
#[cfg(feature = "chrome")]
pub async fn wait_for_selector(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) -> bool {
    let mut found = false;

    let lookup_loop = async {
        for &pause_ms in WAIT_TIMEOUTS.iter().cycle() {
            let pause = tokio::time::sleep(tokio::time::Duration::from_millis(pause_ms));

            tokio::select! {
                _ = pause => (),
                lookup = page.find_element(selector) => {
                    if lookup.is_ok() {
                        found = true;
                        break;
                    }
                }
            }
        }
    };

    match timeout {
        Some(limit) => {
            if tokio::time::timeout(limit, lookup_loop).await.is_err() {
                found = false;
            }
        }
        None => lookup_loop.await,
    };

    found
}
422
/// Waits until the DOM check script for `selector` evaluates to `true`.
///
/// The script is generated once from `selector` and the maximum duration
/// (`timeout`, or 500 ms when `None`) and then evaluated repeatedly on `page`.
/// Between evaluations the loop sleeps for the next interval in
/// [`DOM_WAIT_TIMEOUTS`]. The whole wait is bounded by the maximum duration.
///
/// Differences from a naive retry loop:
/// * the back-off index wraps on `DOM_WAIT_TIMEOUTS.len()`, the array it indexes;
/// * a failed evaluation or a non-boolean result also backs off instead of
///   re-evaluating immediately;
/// * the millisecond budget handed to the script saturates at `u32::MAX`
///   instead of silently truncating.
#[cfg(feature = "chrome")]
pub async fn wait_for_dom(
    page: &chromiumoxide::Page,
    timeout: Option<core::time::Duration>,
    selector: &str,
) {
    let max_duration = timeout.unwrap_or_else(|| core::time::Duration::from_millis(500));
    let mut deadline = tokio::time::Instant::now() + max_duration;

    let script = crate::features::chrome_common::generate_wait_for_dom_js_code_with_selector_base(
        u32::try_from(max_duration.as_millis()).unwrap_or(u32::MAX),
        selector,
    );

    let wait_until = async {
        let mut index = 0;

        loop {
            if tokio::time::Instant::now() >= deadline {
                break;
            }

            let current_timeout = DOM_WAIT_TIMEOUTS[index];

            // `Some(bool)` when the script produced a boolean, `None` otherwise
            // (evaluation error, no value, or a non-boolean value).
            let ready = match page.evaluate(script.clone()).await {
                Ok(result) => result.value().and_then(|value| value.as_bool()),
                Err(_) => None,
            };

            match ready {
                Some(true) => break,
                Some(false) => {
                    tokio::time::sleep(tokio::time::Duration::from_millis(current_timeout)).await;
                    // A definite "not ready yet" pushes the local deadline
                    // forward; the outer timeout below still caps the total wait.
                    deadline = tokio::time::Instant::now() + max_duration;
                }
                None => {
                    // Back off before retrying so a failing evaluation does
                    // not turn into a tight retry loop.
                    tokio::time::sleep(tokio::time::Duration::from_millis(current_timeout)).await;
                }
            }

            index = (index + 1) % DOM_WAIT_TIMEOUTS.len();
        }
    };

    let _ = tokio::time::timeout(max_duration, wait_until).await;
}
470
/// Builds the output file path for `target_url` under `base`.
///
/// The file name is the URL with every non-alphanumeric byte percent-encoded,
/// followed by `format` verbatim. The parent directory of the resulting path
/// is created (errors ignored) and the path is returned as a display string.
#[cfg(feature = "chrome")]
pub async fn create_output_path(
    base: &std::path::PathBuf,
    target_url: &str,
    format: &str,
) -> String {
    let encoded_url = percent_encoding::percent_encode(
        target_url.as_bytes(),
        percent_encoding::NON_ALPHANUMERIC,
    )
    .to_string();

    let file_name = string_concat!(&encoded_url, format);
    let full_path = base.join(&file_name);

    if let Some(parent_dir) = full_path.parent() {
        // Best effort: a failure here surfaces later when the file is written.
        let _ = tokio::fs::create_dir_all(&parent_dir).await;
    }

    full_path.display().to_string()
}
495
/// Applies every wait configured in `wait_for` to `page`, in a fixed order:
/// idle network, selector, DOM, then fixed delay. Does nothing when
/// `wait_for` is `None`; each individual wait is skipped when not configured.
#[cfg(feature = "chrome")]
pub async fn page_wait(
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
) {
    let config = match wait_for {
        Some(config) => config,
        None => return,
    };

    if let Some(idle) = config.idle_network.as_ref() {
        wait_for_event::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
            page,
            idle.timeout,
        )
        .await;
    }

    if let Some(by_selector) = config.selector.as_ref() {
        wait_for_selector(page, by_selector.timeout, &by_selector.selector).await;
    }

    if let Some(by_dom) = config.dom.as_ref() {
        wait_for_dom(page, by_dom.timeout, &by_dom.selector).await;
    }

    if let Some(delay) = config.delay.as_ref() {
        if let Some(duration) = delay.timeout {
            tokio::time::sleep(duration).await
        }
    }
}
530
/// Structured payload parsed from an OpenAI response by `handle_ai_data`.
#[derive(Debug, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg(feature = "openai")]
pub struct JsonResponse {
    /// Content strings returned by the model.
    content: Vec<String>,
    /// JavaScript returned by the model; evaluated on the page by `run_openai_request`.
    js: String,
    /// Error message, if any; defaults to `None` when absent from the input.
    #[cfg_attr(feature = "serde", serde(default))]
    error: Option<String>,
}
544
/// Records one OpenAI usage entry on the page response, creating the usage
/// list on first use.
#[cfg(feature = "openai")]
pub fn handle_openai_credits(
    page_response: &mut PageResponse,
    tokens_used: crate::features::openai_common::OpenAIUsage,
) {
    page_response
        .openai_credits_used
        .get_or_insert_with(Vec::new)
        .push(tokens_used);
}
556
/// No-op stand-in for `handle_openai_credits` when the `openai` feature is
/// disabled; both arguments are ignored.
#[cfg(not(feature = "openai"))]
pub fn handle_openai_credits(
    _page_response: &mut PageResponse,
    _tokens_used: crate::features::openai_common::OpenAIUsage,
) {
}
564
/// Appends one AI result to the page response.
///
/// The result pairs `prompt` with the model's JavaScript, its content strings
/// (leading whitespace trimmed from each), the optional screenshot and the
/// optional error. The result list is created on first use.
#[cfg(feature = "openai")]
pub fn handle_extra_ai_data(
    page_response: &mut PageResponse,
    prompt: &str,
    x: JsonResponse,
    screenshot_output: Option<Vec<u8>>,
    error: Option<String>,
) {
    let content_output = x
        .content
        .iter()
        .map(|entry| entry.trim_start().into())
        .collect::<Vec<_>>();

    let ai_response = crate::page::AIResults {
        input: prompt.into(),
        js_output: x.js,
        content_output,
        screenshot_output,
        error,
    };

    page_response
        .extra_ai_data
        .get_or_insert_with(Vec::new)
        .push(ai_response);
}
591
/// Borrowed view over response headers held in one of two representations,
/// so header-based detection can accept either.
pub enum HeaderSource<'a> {
    /// Headers stored in the HTTP client's `HeaderMap`.
    HeaderMap(&'a crate::client::header::HeaderMap),
    /// Headers stored as a plain string-to-string map.
    Map(&'a std::collections::HashMap<String, String>),
}
599
600pub fn detect_anti_bot_from_headers(headers: &HeaderSource) -> Option<AntiBotTech> {
602 macro_rules! has_key {
603 ($key:expr) => {
604 match headers {
605 HeaderSource::HeaderMap(hm) => hm.contains_key($key),
606 HeaderSource::Map(map) => map.contains_key($key),
607 }
608 };
609 }
610
611 if has_key!("cf-chl-bypass") || has_key!("cf-ray") {
612 return Some(AntiBotTech::Cloudflare);
613 }
614 if has_key!("x-captcha-endpoint") {
615 return Some(AntiBotTech::DataDome);
616 }
617 if has_key!("x-perimeterx") || has_key!("pxhd") {
618 return Some(AntiBotTech::PerimeterX);
619 }
620 if has_key!("x-akamaibot") {
621 return Some(AntiBotTech::AkamaiBotManager);
622 }
623 if has_key!("x-imperva-id") || has_key!("x-iinfo") {
624 return Some(AntiBotTech::Imperva);
625 }
626 if has_key!("x-reblaze-uuid") {
627 return Some(AntiBotTech::Reblaze);
628 }
629
630 None
631}
632
633pub fn detect_anti_bot_from_body(body: &Vec<u8>) -> Option<AntiBotTech> {
635 if body.len() < 30_000 {
637 if let Ok(finder) = AC_BODY_SCAN.try_find_iter(body) {
638 for mat in finder {
639 match mat.pattern().as_usize() {
640 0 => return Some(AntiBotTech::Cloudflare),
641 1 | 2 => return Some(AntiBotTech::DataDome),
642 3 => return Some(AntiBotTech::PerimeterX),
643 4 => return Some(AntiBotTech::ArkoseLabs),
644 5 => return Some(AntiBotTech::Imperva),
645 _ => (),
646 }
647 }
648 }
649 }
650
651 None
652}
653
654pub fn detect_antibot_from_url(url: &str) -> Option<AntiBotTech> {
656 if let Some(mat) = AC_URL_SCAN.find(url) {
657 let tech = match mat.pattern().as_usize() {
658 0 => AntiBotTech::Cloudflare,
659 1 | 2 => AntiBotTech::DataDome,
660 3 | 4 => AntiBotTech::PerimeterX,
661 5 | 6 => AntiBotTech::ArkoseLabs,
662 7 => AntiBotTech::Kasada,
663 8 | 9 => AntiBotTech::FingerprintJS,
664 10 | 11 => AntiBotTech::Imperva,
665 12 => AntiBotTech::RadwareBotManager,
666 13 => AntiBotTech::Reblaze,
667 14 => AntiBotTech::CHEQ,
668 _ => return None,
669 };
670 Some(tech)
671 } else {
672 None
673 }
674}
675pub fn detect_anti_bot_tech_response(
677 url: &str,
678 headers: &HeaderSource,
679 body: &Vec<u8>,
680 subject_name: Option<&str>,
681) -> AntiBotTech {
682 if let Some(subject) = subject_name {
684 if subject == "challenges.cloudflare.com" {
685 return AntiBotTech::Cloudflare;
686 }
687 }
688
689 if let Some(tech) = detect_anti_bot_from_headers(headers) {
690 return tech;
691 }
692
693 if let Some(tech) = detect_antibot_from_url(url) {
694 return tech;
695 }
696
697 if let Some(anti_bot) = detect_anti_bot_from_body(body) {
698 return anti_bot;
699 }
700
701 AntiBotTech::None
702}
703
/// Parses `js` as a JSON-encoded [`JsonResponse`], returning `None` when the
/// text does not deserialize.
#[cfg(feature = "openai")]
pub fn handle_ai_data(js: &str) -> Option<JsonResponse> {
    serde_json::from_str::<JsonResponse>(js).ok()
}
712
/// Summary of the main HTTP request/response observed during a Chrome
/// navigation; produced by `perform_chrome_http_request`.
#[cfg(feature = "chrome")]
#[derive(Default, Clone, Debug)]
pub struct ChromeHTTPReqRes {
    /// Whether the navigation was judged to have hit a firewall/WAF check.
    pub waf_check: bool,
    /// HTTP status code of the response.
    pub status_code: StatusCode,
    /// HTTP method of the request.
    pub method: String,
    /// Response headers as string pairs.
    pub response_headers: std::collections::HashMap<String, String>,
    /// Request headers as string pairs.
    pub request_headers: std::collections::HashMap<String, String>,
    /// Protocol reported for the response (e.g. `http/1.1`).
    pub protocol: String,
    /// Anti-bot technology detected for the response.
    pub anti_bot_tech: crate::page::AntiBotTech,
}
732
/// Navigates `page` to `source` and summarizes the resulting HTTP exchange.
///
/// Starts from defaults (`GET`, `http/1.1`, status `200 OK`, no WAF check) and
/// overwrites them with whatever the navigation reports. When the response URL
/// does not start with `source`, anti-bot detection runs on the response URL
/// and headers to decide whether a firewall check occurred.
///
/// # Errors
/// Returns the `CdpError` raised while resolving the main frame, building the
/// navigation request, or awaiting it.
#[cfg(feature = "chrome")]
pub async fn perform_chrome_http_request(
    page: &chromiumoxide::Page,
    source: &str,
) -> Result<ChromeHTTPReqRes, chromiumoxide::error::CdpError> {
    let mut waf_check = false;
    let mut status_code = StatusCode::OK;
    let mut method = String::from("GET");
    let mut response_headers: std::collections::HashMap<String, String> =
        std::collections::HashMap::default();
    let mut request_headers = std::collections::HashMap::default();
    let mut protocol = String::from("http/1.1");
    let mut anti_bot_tech = AntiBotTech::default();

    let frame_id = page.mainframe().await?;

    let page_base =
        page.http_future(chromiumoxide::cdp::browser_protocol::page::NavigateParams {
            url: source.to_string(),
            transition_type: Some(
                chromiumoxide::cdp::browser_protocol::page::TransitionType::Other,
            ),
            frame_id,
            referrer: None,
            referrer_policy: None,
        })?;

    match page_base.await {
        Ok(page_base) => {
            if let Some(http_request) = page_base {
                if let Some(http_method) = http_request.method.as_deref() {
                    method = http_method.into();
                }

                request_headers.clone_from(&http_request.headers);

                if let Some(ref response) = http_request.response {
                    if let Some(ref p) = response.protocol {
                        protocol.clone_from(p);
                    }

                    // Values are stringified with `to_string()` on the JSON
                    // value, so string headers keep their JSON quoting.
                    if let Some(res_headers) = response.headers.inner().as_object() {
                        for (k, v) in res_headers {
                            response_headers.insert(k.to_string(), v.to_string());
                        }
                    }

                    let mut firewall = false;

                    // Only inspect for anti-bot tech when the response URL is
                    // not the requested URL (or an extension of it).
                    if !response.url.starts_with(source) {
                        match &response.security_details {
                            Some(security_details) => {
                                anti_bot_tech = detect_anti_bot_tech_response(
                                    &response.url,
                                    &HeaderSource::Map(&response_headers),
                                    &Default::default(),
                                    Some(&security_details.subject_name),
                                );
                                firewall = true;
                            }
                            _ => {
                                anti_bot_tech = detect_anti_bot_tech_response(
                                    &response.url,
                                    &HeaderSource::Map(&response_headers),
                                    &Default::default(),
                                    None,
                                );
                                // For Cloudflare, require an extra header
                                // signal before treating it as a firewall hit.
                                // NOTE(review): the raw string below contains
                                // literal backslashes, and the second branch
                                // compares an "Accept-Encoding" response header
                                // against "cf-ray" — confirm both are intended.
                                if anti_bot_tech == AntiBotTech::Cloudflare {
                                    if let Some(xframe_options) =
                                        response_headers.get("x-frame-options")
                                    {
                                        if xframe_options == r#"\"DENY\""# {
                                            firewall = true;
                                        }
                                    } else if let Some(encoding) =
                                        response_headers.get("Accept-Encoding")
                                    {
                                        if encoding == r#"cf-ray"# {
                                            firewall = true;
                                        }
                                    }
                                } else {
                                    firewall = true;
                                }
                            }
                        };

                        waf_check = firewall && !matches!(anti_bot_tech, AntiBotTech::None);

                        // A "blob" protocol is also treated as a WAF check.
                        if !waf_check {
                            waf_check = match response.protocol {
                                Some(ref protocol) => protocol == "blob",
                                _ => false,
                            }
                        }
                    }

                    // Status values that are not valid codes map to 417.
                    status_code = StatusCode::from_u16(response.status as u16)
                        .unwrap_or_else(|_| StatusCode::EXPECTATION_FAILED);
                } else {
                    // No response at all: a generic network failure is
                    // treated as a WAF check.
                    if let Some(failure_text) = &http_request.failure_text {
                        if failure_text == "net::ERR_FAILED" {
                            waf_check = true;
                        }
                    }
                }
            }
        }
        Err(e) => return Err(e),
    }

    Ok(ChromeHTTPReqRes {
        waf_check,
        status_code,
        method,
        response_headers,
        request_headers,
        protocol,
        anti_bot_tech,
    })
}
855
/// No-op stand-in for `run_openai_request` when `chrome` is enabled without
/// `openai`; every argument is ignored.
#[cfg(all(feature = "chrome", not(feature = "openai")))]
pub async fn run_openai_request(
    _source: &str,
    _page: &chromiumoxide::Page,
    _wait_for: &Option<crate::configuration::WaitFor>,
    _openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    _page_response: &mut PageResponse,
    _ok: bool,
) {
}
867
/// Runs the configured OpenAI prompts against the page and applies the results.
///
/// For each prompt of the selected configuration:
/// 1. sends the current page content to OpenAI (skipped, yielding a default
///    result, when the model name is empty or `ok` is `false`);
/// 2. records the usage on `page_response`;
/// 3. evaluates any returned JavaScript on `page` and updates
///    `page_response.content` with the resulting HTML;
/// 4. when `extra_ai_data` is enabled, stores the structured result, with an
///    optional screenshot.
///
/// Does nothing when `openai_config` is `None` or no configuration applies to
/// `source`.
#[cfg(all(feature = "chrome", feature = "openai"))]
pub async fn run_openai_request(
    source: &str,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    mut page_response: &mut PageResponse,
    ok: bool,
) {
    if let Some(gpt_configs) = openai_config {
        // Pick the configuration: with a per-URL map, look up the full URL
        // first and, if `paths_map` is set, fall back to the URL's path;
        // without a map, use the top-level configuration.
        let gpt_configs = match gpt_configs.prompt_url_map {
            Some(ref h) => {
                let c = h.get::<case_insensitive_string::CaseInsensitiveString>(&source.into());

                if !c.is_some() && gpt_configs.paths_map {
                    h.get::<case_insensitive_string::CaseInsensitiveString>(
                        &get_path_from_url(&source).into(),
                    )
                } else {
                    c
                }
            }
            _ => Some(gpt_configs),
        };

        if let Some(gpt_configs) = gpt_configs {
            let mut prompts = gpt_configs.prompt.clone();

            while let Some(prompt) = prompts.next() {
                let gpt_results = if !gpt_configs.model.is_empty() && ok {
                    openai_request(
                        gpt_configs,
                        match page_response.content.as_ref() {
                            Some(html) => auto_encoder::auto_encode_bytes(html),
                            _ => Default::default(),
                        },
                        &source,
                        &prompt,
                    )
                    .await
                } else {
                    Default::default()
                };

                let js_script = gpt_results.response;
                let tokens_used = gpt_results.usage;
                let gpt_error = gpt_results.error;

                handle_openai_credits(&mut page_response, tokens_used);

                // With `extra_ai_data` the response is expected to be JSON;
                // otherwise the whole response is treated as JavaScript.
                let json_res = if gpt_configs.extra_ai_data {
                    match handle_ai_data(&js_script) {
                        Some(jr) => jr,
                        _ => {
                            let mut jr = JsonResponse::default();
                            jr.error = Some("An issue occured with serialization.".into());

                            jr
                        }
                    }
                } else {
                    let mut x = JsonResponse::default();
                    x.js = js_script;
                    x
                };

                if !json_res.js.is_empty() {
                    // Run the script inside an async function and capture the
                    // document's outer HTML afterwards.
                    let html: Option<Box<Vec<u8>>> = match page
                        .evaluate_function(string_concat!(
                            "async function() { ",
                            json_res.js,
                            "; return document.documentElement.outerHTML; }"
                        ))
                        .await
                    {
                        Ok(h) => match h.into_value() {
                            Ok(hh) => Some(hh),
                            _ => None,
                        },
                        _ => None,
                    };

                    if html.is_some() {
                        page_wait(&page, &wait_for).await;
                        // Short scripts that touch `window.location` may have
                        // navigated, so re-read the HTML from the page instead
                        // of using the value captured by the script.
                        if json_res.js.len() <= 400 && json_res.js.contains("window.location") {
                            if let Ok(b) = page.outer_html_bytes().await {
                                page_response.content = Some(b.into());
                            }
                        } else {
                            page_response.content = html;
                        }
                    }
                }

                if gpt_configs.extra_ai_data {
                    // Screenshot only when requested and a script actually ran.
                    let screenshot_bytes = if gpt_configs.screenshot && !json_res.js.is_empty() {
                        let format = chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png;

                        let screenshot_configs = chromiumoxide::page::ScreenshotParams::builder()
                            .format(format)
                            .full_page(true)
                            .quality(45)
                            .omit_background(false);

                        match page.screenshot(screenshot_configs.build()).await {
                            Ok(b) => {
                                log::debug!("took screenshot: {:?}", source);
                                Some(b)
                            }
                            Err(e) => {
                                log::error!("failed to take screenshot: {:?} - {:?}", e, source);
                                None
                            }
                        }
                    } else {
                        None
                    };

                    handle_extra_ai_data(
                        page_response,
                        &prompt,
                        json_res,
                        screenshot_bytes,
                        gpt_error,
                    );
                }
            }
        }
    }
}
1002
/// HTTP protocol version of a cached response.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum HttpVersion {
    /// HTTP/0.9
    Http09,
    /// HTTP/1.0
    Http10,
    /// HTTP/1.1
    Http11,
    /// HTTP/2
    H2,
    /// HTTP/3
    H3,
}
1018
/// Plain-data HTTP response handed to `put_hybrid_cache` for storage.
#[derive(Debug, Clone)]
pub struct HttpResponse {
    /// Response body bytes.
    pub body: Vec<u8>,
    /// Response headers as string pairs.
    pub headers: std::collections::HashMap<String, String>,
    /// Numeric HTTP status code.
    pub status: u16,
    /// URL of the response.
    pub url: url::Url,
    /// HTTP protocol version of the response.
    pub version: HttpVersion,
}
1033
/// Minimal request description implementing `RequestLike`, used to build a
/// cache policy.
#[cfg(feature = "cache_chrome_hybrid")]
pub struct HttpRequestLike {
    /// Request URI.
    pub uri: http::uri::Uri,
    /// Request method.
    pub method: reqwest::Method,
    /// Request headers.
    pub headers: http::HeaderMap,
}
1044
/// Minimal response description implementing `ResponseLike`, used to build a
/// cache policy.
#[cfg(feature = "cache_chrome_hybrid")]
pub struct HttpResponseLike {
    /// Response status code.
    pub status: StatusCode,
    /// Response headers.
    pub headers: http::HeaderMap,
}
1053
/// Exposes the stored request fields through the `RequestLike` trait.
#[cfg(feature = "cache_chrome_hybrid")]
impl RequestLike for HttpRequestLike {
    /// Returns a clone of the request URI.
    fn uri(&self) -> http::uri::Uri {
        self.uri.clone()
    }
    /// Returns `true` when `other` equals the stored URI.
    fn is_same_uri(&self, other: &http::Uri) -> bool {
        &self.uri == other
    }
    /// Returns the request method.
    fn method(&self) -> &reqwest::Method {
        &self.method
    }
    /// Returns the request headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
}
1069
/// Exposes the stored response fields through the `ResponseLike` trait.
#[cfg(feature = "cache_chrome_hybrid")]
impl ResponseLike for HttpResponseLike {
    /// Returns the response status code.
    fn status(&self) -> StatusCode {
        self.status
    }
    /// Returns the response headers.
    fn headers(&self) -> &http::HeaderMap {
        &self.headers
    }
}
1079
/// Converts a string-to-string header map into a `reqwest` `HeaderMap`.
///
/// Entries whose name or value is not valid for an HTTP header are skipped.
/// At most the first 1002 entries yielded by the map's iterator are
/// considered; anything beyond that is ignored.
#[cfg(any(
    feature = "cache_chrome_hybrid",
    feature = "headers",
    feature = "cookies"
))]
pub fn convert_headers(
    headers: &std::collections::HashMap<String, String>,
) -> reqwest::header::HeaderMap {
    let mut converted = reqwest::header::HeaderMap::new();

    for (name, value) in headers.iter().take(1002) {
        let header_value = match reqwest::header::HeaderValue::from_str(value) {
            Ok(header_value) => header_value,
            Err(_) => continue,
        };

        if let Ok(header_name) = name.parse::<reqwest::header::HeaderName>() {
            converted.insert(header_name, header_value);
        }
    }

    converted
}
1106
/// Stores `http_response` in the hybrid cache under `cache_key`.
///
/// A cache policy is computed from a request description (URI, `method`,
/// `http_request_headers`) and a response description (status and the
/// response's own headers), then the response is written through
/// `CACACHE_MANAGER`. Nothing is stored when the response URL cannot be parsed
/// as a URI; a failing cache write is ignored.
///
/// The request description uses the REQUEST headers and the response
/// description uses the RESPONSE headers, so the policy's freshness is derived
/// from the response's caching headers.
#[cfg(feature = "cache_chrome_hybrid")]
pub async fn put_hybrid_cache(
    cache_key: &str,
    http_response: HttpResponse,
    method: &str,
    http_request_headers: std::collections::HashMap<String, String>,
) {
    use crate::http_cache_reqwest::CacheManager;
    use http_cache_semantics::CachePolicy;

    let uri = match http_response.url.as_str().parse::<http::uri::Uri>() {
        Ok(uri) => uri,
        Err(_) => return,
    };

    let req = HttpRequestLike {
        uri,
        // Unknown method strings fall back to GET.
        method: reqwest::Method::from_bytes(method.as_bytes()).unwrap_or(reqwest::Method::GET),
        headers: convert_headers(&http_request_headers),
    };

    let res = HttpResponseLike {
        // Status values that are not valid codes map to 417.
        status: StatusCode::from_u16(http_response.status)
            .unwrap_or(StatusCode::EXPECTATION_FAILED),
        headers: convert_headers(&http_response.headers),
    };

    let policy = CachePolicy::new(&req, &res);

    let _ = crate::website::CACACHE_MANAGER
        .put(
            cache_key.into(),
            http_cache_reqwest::HttpResponse {
                url: http_response.url,
                body: http_response.body,
                headers: http_response.headers,
                version: match http_response.version {
                    HttpVersion::H2 => http_cache::HttpVersion::H2,
                    HttpVersion::Http10 => http_cache::HttpVersion::Http10,
                    HttpVersion::H3 => http_cache::HttpVersion::H3,
                    HttpVersion::Http09 => http_cache::HttpVersion::Http09,
                    HttpVersion::Http11 => http_cache::HttpVersion::Http11,
                },
                status: http_response.status,
            },
            policy,
        )
        .await;
}
1158
/// No-op stand-in for `put_hybrid_cache` when the `cache_chrome_hybrid`
/// feature is disabled; every argument is ignored.
#[cfg(not(feature = "cache_chrome_hybrid"))]
pub async fn put_hybrid_cache(
    _cache_key: &str,
    _http_response: HttpResponse,
    _method: &str,
    _http_request_headers: std::collections::HashMap<String, String>,
) {
}
1168
1169#[cfg(feature = "chrome")]
/// Returns the time left of `base_timeout` after `elapsed` has passed,
/// clamping to a zero duration when `elapsed` exceeds `base_timeout`.
fn sub_duration(
    base_timeout: std::time::Duration,
    elapsed: std::time::Duration,
) -> std::time::Duration {
    base_timeout.saturating_sub(elapsed)
}
1180
/// Navigates `page` to `url` and stores the observed request/response summary
/// in `chrome_http_req_res`.
///
/// # Errors
/// Propagates the `CdpError` from `perform_chrome_http_request`; in that case
/// `chrome_http_req_res` is left unchanged.
#[cfg(feature = "chrome")]
async fn navigate(
    page: &chromiumoxide::Page,
    url: &str,
    chrome_http_req_res: &mut ChromeHTTPReqRes,
) -> Result<(), chromiumoxide::error::CdpError> {
    let summary = perform_chrome_http_request(page, url).await?;

    *chrome_http_req_res = summary;

    Ok(())
}
1191
/// Moves the mouse across the page along generated coordinates.
///
/// Coordinates come from `GaussianMouse::generate_random_coordinates` for the
/// viewport size, defaulting to 1280x720 when no viewport is configured.
/// Failures of individual moves are ignored.
#[cfg(all(feature = "real_browser", feature = "chrome"))]
async fn perform_smart_mouse_movement(
    page: &chromiumoxide::Page,
    viewport: &Option<crate::configuration::Viewport>,
) {
    use crate::features::chrome_mouse_movements::GaussianMouse;
    use chromiumoxide::layout::Point;

    let (viewport_width, viewport_height) = viewport
        .as_ref()
        .map_or((1280.0, 720.0), |vp| (vp.width as f64, vp.height as f64));

    let coordinates = GaussianMouse::generate_random_coordinates(viewport_width, viewport_height);

    for (x, y) in coordinates {
        let _ = page.move_mouse(Point::new(x, y)).await;
    }
}
1208
/// No-op stand-in for `perform_smart_mouse_movement` when `chrome` is enabled
/// without `real_browser`; both arguments are ignored.
#[cfg(all(not(feature = "real_browser"), feature = "chrome"))]
async fn perform_smart_mouse_movement(
    _page: &chromiumoxide::Page,
    _viewport: &Option<crate::configuration::Viewport>,
) {
}
1215
/// Writes a Chrome-fetched page into the hybrid cache.
///
/// Builds an `HttpResponse` from the page content, status, protocol and
/// response headers, then stores it with `put_hybrid_cache` under the key
/// `GET:<target_url>`. Does nothing when `target_url` is not a valid URL.
#[cfg(all(feature = "chrome", feature = "cache_chrome_hybrid"))]
pub async fn cache_chrome_response(
    target_url: &str,
    page_response: &PageResponse,
    chrome_http_req_res: ChromeHTTPReqRes,
) {
    if let Ok(u) = url::Url::parse(target_url) {
        let http_response = HttpResponse {
            url: u,
            body: match page_response.content.as_ref() {
                Some(b) => b.into(),
                _ => Default::default(),
            },
            status: chrome_http_req_res.status_code.into(),
            // Unrecognized protocol strings are stored as HTTP/1.1.
            version: match chrome_http_req_res.protocol.as_str() {
                "http/0.9" => HttpVersion::Http09,
                "http/1" | "http/1.0" => HttpVersion::Http10,
                "http/1.1" => HttpVersion::Http11,
                "http/2.0" | "http/2" => HttpVersion::H2,
                "http/3.0" | "http/3" => HttpVersion::H3,
                _ => HttpVersion::Http11,
            },
            headers: chrome_http_req_res.response_headers,
        };
        // The method is always recorded as GET, regardless of
        // `chrome_http_req_res.method`.
        put_hybrid_cache(
            &string_concat!("GET", ":", target_url),
            http_response,
            &"GET",
            chrome_http_req_res.request_headers,
        )
        .await;
    }
}
1250
/// No-op stand-in for `cache_chrome_response` when `chrome` is enabled without
/// `cache_chrome_hybrid`; every argument is ignored.
#[cfg(all(feature = "chrome", not(feature = "cache_chrome_hybrid")))]
pub async fn cache_chrome_response(
    _target_url: &str,
    _page_response: &PageResponse,
    _chrome_http_req_res: ChromeHTTPReqRes,
) {
}
1259
/// Five minutes expressed in milliseconds.
pub(crate) const FIVE_MINUTES: u32 = 300000;

/// Maximum page timeout: five minutes.
#[cfg(feature = "chrome")]
const MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64);
/// Half of the maximum page timeout: two and a half minutes.
#[cfg(feature = "chrome")]
const HALF_MAX_PAGE_TIMEOUT: tokio::time::Duration =
    tokio::time::Duration::from_millis(FIVE_MINUTES as u64 / 2);
1271
/// Copies the page response's headers, when present, into
/// `chrome_http_req_res.response_headers` as a string map. Leaves the target
/// untouched when the page response has no headers.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn store_headers(page_response: &PageResponse, chrome_http_req_res: &mut ChromeHTTPReqRes) {
    let header_map = match page_response.headers.as_ref() {
        Some(header_map) => header_map,
        None => return,
    };

    chrome_http_req_res.response_headers =
        crate::utils::header_utils::header_map_to_hash_map(header_map);
}
1279
1280#[cfg(all(feature = "chrome", not(feature = "headers")))]
1281fn store_headers(_page_response: &PageResponse, _chrome_http_req_res: &mut ChromeHTTPReqRes) {}
1282
/// Load a page in Chrome and collect its rendered content plus network metadata.
///
/// The overall flow is:
/// 1. subscribe to network events and spawn a task that accumulates them,
/// 2. navigate to `source` (or inject it as HTML when `content` is true),
/// 3. optionally wait, run scripts, handle challenge pages, and read the DOM,
/// 4. close the page (unless `chrome_store_page` is enabled) and merge the
///    collected network data into the returned [`PageResponse`].
///
/// # Arguments
/// * `source` - URL to navigate to, or the raw HTML when `content` is true.
/// * `page` - the Chrome page to drive.
/// * `content` - treat `source` as HTML and set it as the document content.
/// * `wait_for_navigation` - wait for the navigation response and resolve the
///   last redirect into `final_url`.
/// * `wait_for` - optional extra wait configuration passed to `page_wait`.
/// * `screenshot` - optional screenshot configuration.
/// * `page_set` - when true, navigation/content injection and cache writes are
///   skipped.
/// * `openai_config` - optional configuration for `run_openai_request`.
/// * `url_target` - overrides `source` for asset detection, logging and as the
///   script target URL.
/// * `execution_scripts` / `automation_scripts` - optional scripts evaluated
///   after the page settled.
/// * `viewport` - viewport forwarded to the mouse-movement helper.
/// * `request_timeout` - overall budget, capped at `MAX_PAGE_TIMEOUT`.
/// * `track_events` - enables the per-request / per-response maps.
///
/// # Errors
/// The signature allows a `CdpError`, but every path in this body currently
/// ends in `Ok`; navigation failures are logged and reflected in the response.
#[cfg(feature = "chrome")]
pub async fn fetch_page_html_chrome_base(
    source: &str,
    page: &chromiumoxide::Page,
    content: bool,
    wait_for_navigation: bool,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    url_target: Option<&str>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
) -> Result<PageResponse, chromiumoxide::error::CdpError> {
    use crate::page::{is_asset_url, DOWNLOADABLE_MEDIA_TYPES, UNKNOWN_STATUS_ERROR};
    use chromiumoxide::{
        cdp::browser_protocol::network::{
            EventLoadingFailed, EventRequestWillBeSent, EventResponseReceived,
            GetResponseBodyParams, RequestId, ResourceType,
        },
        error::CdpError,
    };
    use hashbrown::HashMap;
    use std::time::Duration;
    use tokio::{
        sync::{oneshot, OnceCell},
        time::Instant,
    };

    /// Per-request information captured from `EventResponseReceived`.
    #[derive(Debug, Clone, Default)]
    struct ResponseMap {
        /// URL of the response.
        url: String,
        /// Whether the response is excluded from the byte accounting.
        skipped: bool,
        /// Encoded length reported on the response event.
        bytes_transferred: f64,
    }

    /// Aggregate produced by the response listener.
    #[derive(Debug, Clone, Default)]
    struct ResponseBase {
        /// Responses keyed by request id (only when response tracking is on).
        response_map: Option<HashMap<String, ResponseMap>>,
        /// Headers of the first non-redirect document response.
        headers: Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
        /// Status of the first non-redirect document response.
        status_code: Option<i64>,
    }

    let mut chrome_http_req_res = ChromeHTTPReqRes::default();

    // Never exceed the global page budget, whatever the caller asked for.
    let mut base_timeout = match request_timeout {
        Some(timeout) => (**timeout).min(MAX_PAGE_TIMEOUT),
        _ => MAX_PAGE_TIMEOUT,
    };

    // Immutable copy of the budget; the remaining time is always recomputed
    // from it and the elapsed time.
    let base_timeout_measurement = base_timeout;
    let target_url = url_target.unwrap_or(source);
    let asset = is_asset_url(target_url);

    // Channel used to announce that the media request of an asset finished.
    let (tx1, rx1) = if asset {
        let c = oneshot::channel::<Option<RequestId>>();

        (Some(c.0), Some(c.1))
    } else {
        (None, None)
    };

    let (track_requests, track_responses) = match track_events {
        Some(tracker) => (tracker.requests, tracker.responses),
        _ => (false, false),
    };

    // Subscribe before navigating so no event is missed. Optional listeners
    // resolve to an error when they are not needed.
    let (event_loading_listener, cancel_listener, received_listener, event_sent_listener) = tokio::join!(
        page.event_listener::<chromiumoxide::cdp::browser_protocol::network::EventLoadingFinished>(
        ),
        page.event_listener::<EventLoadingFailed>(),
        async {
            if asset || track_responses {
                page.event_listener::<EventResponseReceived>().await
            } else {
                Err(CdpError::NotFound)
            }
        },
        async {
            if track_requests {
                page.event_listener::<EventRequestWillBeSent>().await
            } else {
                Err(CdpError::NotFound)
            }
        },
    );

    // Signals that the main document request was aborted.
    let (tx, rx) = oneshot::channel::<bool>();

    let bytes_collected_handle = tokio::spawn(async move {
        // Request id of the downloadable media response (asset mode only).
        let finished_media: Option<OnceCell<RequestId>> =
            if asset { Some(OnceCell::new()) } else { None };

        // Sum the encoded bytes of every finished request.
        let f1 = async {
            let mut total = 0.0;

            let mut response_map: Option<HashMap<String, f64>> = if track_responses {
                Some(HashMap::new())
            } else {
                None
            };

            if let Ok(mut listener) = event_loading_listener {
                while let Some(event) = listener.next().await {
                    total += event.encoded_data_length;

                    if let Some(response_map) = response_map.as_mut() {
                        response_map
                            .entry(event.request_id.inner().clone())
                            .and_modify(|e| *e += event.encoded_data_length)
                            .or_insert(event.encoded_data_length);
                    }

                    // `finished_media` is only `Some` in asset mode, so this
                    // check is skipped entirely for regular pages.
                    if let Some(once) = &finished_media {
                        if let Some(request_id) = once.get() {
                            if request_id == &event.request_id {
                                if let Some(tx1) = tx1 {
                                    let _ = tx1.send(Some(request_id.clone()));
                                    break;
                                }
                            }
                        }
                    }
                }
            }

            (total, response_map)
        };

        // Watch for the main document being aborted by the browser.
        let f2 = async {
            if let Ok(mut listener) = cancel_listener {
                let mut net_aborted = false;

                while let Some(event) = listener.next().await {
                    if event.r#type == ResourceType::Document
                        && event.error_text == "net::ERR_ABORTED"
                        && event.canceled.unwrap_or_default()
                    {
                        net_aborted = true;
                        break;
                    }
                }

                if net_aborted {
                    let _ = tx.send(true);
                }
            }
        };

        // Record responses: status/headers of the document and, optionally,
        // a map of every response.
        let f3 = async {
            let mut response_map: Option<HashMap<String, ResponseMap>> = if track_responses {
                Some(HashMap::new())
            } else {
                None
            };

            let mut status_code = None;
            let mut headers = None;

            if asset || response_map.is_some() {
                if let Ok(mut listener) = received_listener {
                    let mut initial_asset = false;
                    let mut allow_download = false;
                    let mut initial_request = false;

                    while let Some(event) = listener.next().await {
                        // Keep the first document response that is not a redirect.
                        if !initial_request && event.r#type == ResourceType::Document {
                            let redirect =
                                event.response.status >= 300 && event.response.status <= 399;

                            if !redirect {
                                initial_request = true;
                                status_code = Some(event.response.status);
                                headers = Some(event.response.headers.clone());
                            }
                        }
                        if asset {
                            // Only the very first event decides whether the
                            // document is a downloadable media type.
                            if !initial_asset && event.r#type == ResourceType::Document {
                                allow_download =
                                    DOWNLOADABLE_MEDIA_TYPES.contains(&event.response.mime_type);
                            }
                            if event.r#type == ResourceType::Media && allow_download {
                                if let Some(once) = &finished_media {
                                    let _ = once.set(event.request_id.clone());
                                }
                            }
                            initial_asset = true;
                        }

                        if let Some(response_map) = response_map.as_mut() {
                            response_map.insert(
                                event.request_id.inner().clone(),
                                ResponseMap {
                                    url: event.response.url.clone(),
                                    bytes_transferred: event.response.encoded_data_length,
                                    skipped: *MASK_BYTES_INTERCEPTION
                                        && event.response.connection_id == 0.0
                                        && event.response.encoded_data_length <= 17.0,
                                },
                            );
                        }
                    }
                }
            }

            ResponseBase {
                response_map,
                status_code,
                headers,
            }
        };

        // Record the timestamp of every outgoing request, keyed by URL.
        let f4 = async {
            let mut request_map: Option<HashMap<String, f64>> = if track_requests {
                Some(HashMap::new())
            } else {
                None
            };

            if let Some(sent) = request_map.as_mut() {
                if let Ok(mut listener) = event_sent_listener {
                    while let Some(event) = listener.next().await {
                        sent.insert(event.request.url.clone(), *event.timestamp.inner());
                    }
                }
            }

            request_map
        };

        let (t1, _, res_map, req_map) = tokio::join!(f1, f2, f3, f4);

        (t1.0, t1.1, res_map, req_map)
    });

    // Set when a CDP timeout makes reading the page body pointless.
    let mut block_bytes = false;

    let page_navigation = async {
        if !page_set {
            if content {
                // `source` is HTML: inject it into the main frame.
                if let Ok(frame) = page.mainframe().await {
                    let html = rewrite_base_tag(&source, &url_target).await;

                    if let Err(e) = page
                        .execute(
                            chromiumoxide::cdp::browser_protocol::page::SetDocumentContentParams {
                                frame_id: frame.unwrap_or_default(),
                                html,
                            },
                        )
                        .await
                    {
                        log::info!(
                            "Set Content Error({:?}) - {:?}",
                            e,
                            &url_target.unwrap_or(source)
                        );
                        if let chromiumoxide::error::CdpError::Timeout = e {
                            block_bytes = true;
                        }
                    }
                }
            } else {
                if let Err(e) = navigate(page, source, &mut chrome_http_req_res).await {
                    log::info!(
                        "Navigation Error({:?}) - {:?}",
                        e,
                        &url_target.unwrap_or(source)
                    );
                    if let chromiumoxide::error::CdpError::Timeout = e {
                        block_bytes = true;
                    }
                    return Err(e);
                };
            }
        }

        Ok(())
    };

    let start_time = Instant::now();

    let mut request_cancelled = false;

    let page_navigate = async {
        if cfg!(feature = "real_browser") {
            // Move the mouse around while the navigation is in flight and
            // stop as soon as the navigation completes.
            let notify = tokio::sync::Notify::new();

            let mouse_loop = async {
                let mut index = 0;

                loop {
                    tokio::select! {
                        _ = notify.notified() => {
                            break;
                        }
                        _ = perform_smart_mouse_movement(&page, &viewport) => {
                            tokio::time::sleep(std::time::Duration::from_millis(WAIT_TIMEOUTS[index])).await;
                        }
                    }

                    index = (index + 1) % WAIT_TIMEOUTS.len();
                }
            };

            let navigation_loop = async {
                let result = page_navigation.await;
                notify.notify_waiters();
                result
            };

            let (result, _) = tokio::join!(navigation_loop, mouse_loop);

            result
        } else {
            page_navigation.await
        }
    };

    // Race the navigation against the abort signal from the event task.
    tokio::select! {
        v = tokio::time::timeout(base_timeout + Duration::from_millis(50), page_navigate) => {
            if v.is_err() {
                request_cancelled = true;
            }
        }
        v = rx => {
            if let Ok(v) = v {
                request_cancelled = !v;
            }
        }
    };

    base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());

    let final_url = if wait_for_navigation && !request_cancelled && !block_bytes {
        let last_redirect = tokio::time::timeout(base_timeout, async {
            match page.wait_for_navigation_response().await {
                Ok(u) => get_last_redirect(&source, &u, &page).await,
                _ => None,
            }
        })
        .await;
        base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
        match last_redirect {
            Ok(last) => last,
            _ => None,
        }
    } else {
        None
    };

    // Snapshot used by the asset branch below, which builds its own response.
    let chrome_http_req_res1 = if asset {
        Some(chrome_http_req_res.clone())
    } else {
        None
    };

    // Only run the post-navigation steps when there is time left, nothing was
    // cancelled, and the status is one worth rendering.
    let run_events = !base_timeout.is_zero()
        && !block_bytes
        && !request_cancelled
        && (!chrome_http_req_res.status_code.is_server_error()
            && !chrome_http_req_res.status_code.is_client_error()
            || chrome_http_req_res.status_code == *UNKNOWN_STATUS_ERROR
            || chrome_http_req_res.status_code == 404
            || chrome_http_req_res.status_code == 403
            || chrome_http_req_res.status_code.is_redirection()
            || chrome_http_req_res.status_code.is_success());

    block_bytes = chrome_http_req_res.status_code == StatusCode::REQUEST_TIMEOUT;

    let waf_check = chrome_http_req_res.waf_check;
    let mut status_code = chrome_http_req_res.status_code;
    let mut anti_bot_tech = chrome_http_req_res.anti_bot_tech;
    let mut validate_cf = false;

    let run_page_response = async move {
        let mut page_response = if run_events {
            if waf_check {
                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
                if let Err(elapsed) = tokio::time::timeout(
                    base_timeout,
                    perform_smart_mouse_movement(&page, &viewport),
                )
                .await
                {
                    log::warn!("mouse movement timeout exceeded {elapsed}");
                }
            }

            if wait_for.is_some() {
                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());
                if let Err(elapsed) =
                    tokio::time::timeout(base_timeout, page_wait(&page, &wait_for)).await
                {
                    log::warn!("max wait for timeout {elapsed}");
                }
            }

            base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());

            if execution_scripts.is_some() || automation_scripts.is_some() {
                // Prefer the resolved redirect, then the explicit target, then
                // the source itself.
                let script_url = final_url
                    .as_deref()
                    .or(url_target)
                    .unwrap_or(source)
                    .to_string();

                if let Err(elapsed) = tokio::time::timeout(base_timeout, async {
                    tokio::join!(
                        crate::features::chrome_common::eval_execution_scripts(
                            &page,
                            &script_url,
                            &execution_scripts
                        ),
                        crate::features::chrome_common::eval_automation_scripts(
                            &page,
                            &script_url,
                            &automation_scripts
                        )
                    );
                })
                .await
                {
                    log::warn!("eval scripts timeout exceeded {elapsed}");
                }
            }

            let xml_target = match &final_url {
                Some(f) => f.ends_with(".xml"),
                _ => target_url.ends_with(".xml"),
            };

            // XML targets try the XML reader first and fall back to the
            // outer HTML when it fails or yields nothing.
            let page_fn = async {
                if xml_target {
                    match page.content_bytes_xml().await {
                        Ok(page_bytes) => {
                            if page_bytes.is_empty() {
                                page.outer_html_bytes().await
                            } else {
                                Ok(page_bytes)
                            }
                        }
                        _ => page.outer_html_bytes().await,
                    }
                } else {
                    page.outer_html_bytes().await
                }
            };

            let results = tokio::time::timeout(base_timeout.max(HALF_MAX_PAGE_TIMEOUT), page_fn);

            let mut res: Box<Vec<u8>> = match results.await {
                Ok(v) => v.map(Box::new).unwrap_or_default(),
                _ => Default::default(),
            };

            // Byte signature of a known block page served behind a WAF.
            let forbidden = waf_check && res.starts_with(b"<html><head>\n <style global=") && res.ends_with(b";</script><iframe height=\"1\" width=\"1\" style=\"position: absolute; top: 0px; left: 0px; border: none; visibility: hidden;\"></iframe>\n\n</body></html>");

            if cfg!(feature = "real_browser") {
                if res.len() <= crate::page::TURNSTILE_WALL_PAGE_SIZE
                    && anti_bot_tech == AntiBotTech::Cloudflare
                    || waf_check
                {
                    if detect_cf_turnstyle(&res) {
                        if let Err(_e) = tokio::time::timeout(base_timeout, async {
                            if let Ok(success) = cf_handle(&mut res, &page).await {
                                if success {
                                    status_code = StatusCode::OK;
                                }
                            }
                        })
                        .await
                        {
                            // The handler ran out of time; re-check the body below.
                            validate_cf = true;
                        }
                    }
                }
            };

            let ok = !res.is_empty();

            if validate_cf && ok {
                if !detect_cf_turnstyle(&res) && status_code == StatusCode::FORBIDDEN {
                    status_code = StatusCode::OK;
                }
            }

            let mut page_response = set_page_response(
                ok,
                res,
                if forbidden {
                    StatusCode::FORBIDDEN
                } else {
                    status_code
                },
                final_url,
            );

            base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());

            let _ = tokio::time::timeout(
                base_timeout,
                set_page_response_cookies(&mut page_response, &page),
            )
            .await;

            if openai_config.is_some() && !base_timeout.is_zero() {
                base_timeout = sub_duration(base_timeout_measurement, start_time.elapsed());

                let openai_request = run_openai_request(
                    url_target.unwrap_or(source),
                    page,
                    wait_for,
                    openai_config,
                    &mut page_response,
                    ok,
                );

                let _ = tokio::time::timeout(base_timeout, openai_request).await;
            }

            if cfg!(feature = "chrome_screenshot") || screenshot.is_some() {
                let _ = tokio::time::timeout(
                    base_timeout + tokio::time::Duration::from_secs(30),
                    perform_screenshot(source, page, screenshot, &mut page_response),
                )
                .await;
            }

            page_response
        } else {
            // Reduced path: only grab whatever HTML and cookies are available.
            let res = if !block_bytes {
                let results = tokio::time::timeout(
                    base_timeout.max(HALF_MAX_PAGE_TIMEOUT),
                    page.outer_html_bytes(),
                );

                match results.await {
                    Ok(v) => v.map(Box::new).unwrap_or_default(),
                    _ => Default::default(),
                }
            } else {
                Default::default()
            };

            let mut page_response = set_page_response(!res.is_empty(), res, status_code, final_url);

            if !block_bytes {
                let _ = tokio::time::timeout(
                    base_timeout,
                    set_page_response_cookies(&mut page_response, &page),
                )
                .await;
            }

            if base_timeout.is_zero() && page_response.content.is_none() {
                page_response.status_code = StatusCode::REQUEST_TIMEOUT;
            }

            page_response
        };

        // Injected content has no meaningful final URL.
        if content {
            if let Some(final_url) = &page_response.final_url {
                if final_url == "about:blank" {
                    page_response.final_url = None;
                }
            }
        }

        page_response
    };

    // From here on `content` is the downloaded asset body, not the flag.
    let mut content: Option<Box<Vec<u8>>> = None;

    let page_response = match rx1 {
        Some(rx1) => {
            // Asset mode: whichever finishes first wins, the regular page
            // pipeline or the media download notification.
            tokio::select! {
                v = tokio::time::timeout(base_timeout, run_page_response) => {
                    v.map_err(|_| CdpError::Timeout)
                }
                c = rx1 => {
                    if let Ok(Some(request_id)) = c {
                        let params = GetResponseBodyParams::new(request_id);

                        if let Ok(body_response) = page.execute(params).await {
                            let media_file = if body_response.base64_encoded {
                                chromiumoxide::utils::base64::decode(
                                    &body_response.body,
                                )
                                .unwrap_or_default()
                            } else {
                                body_response.body.as_bytes().to_vec()
                            };
                            content = Some(media_file.into());
                        }
                    }

                    let mut page_response = PageResponse::default();

                    let _ = tokio::time::timeout(
                        base_timeout,
                        set_page_response_cookies(&mut page_response, &page),
                    )
                    .await;

                    if let Some(mut chrome_http_req_res1) = chrome_http_req_res1 {
                        set_page_response_headers(&mut chrome_http_req_res1, &mut page_response);

                        page_response.status_code = chrome_http_req_res1.status_code;
                        page_response.waf_check = chrome_http_req_res1.waf_check;

                        if !page_set {
                            let _ = tokio::time::timeout(
                                base_timeout,
                                cache_chrome_response(&source, &page_response, chrome_http_req_res1),
                            )
                            .await;
                        }

                    }

                    Ok(page_response)
                }
            }
        }
        _ => Ok(run_page_response.await),
    };

    // A timed-out asset pipeline degrades to an empty response.
    let mut page_response = page_response.unwrap_or_default();

    set_page_response_headers(&mut chrome_http_req_res, &mut page_response);
    page_response.status_code = chrome_http_req_res.status_code;
    page_response.waf_check = chrome_http_req_res.waf_check;

    if let Some(media) = content {
        page_response.content = Some(media.into());
    }

    if page_response.status_code == *UNKNOWN_STATUS_ERROR && page_response.content.is_some() {
        page_response.status_code = StatusCode::OK;
    }

    if cfg!(not(feature = "chrome_store_page")) {
        // Closing the page ends the event streams, which lets the collector
        // task finish.
        let _ = tokio::time::timeout(
            base_timeout.max(HALF_MAX_PAGE_TIMEOUT),
            page.execute(chromiumoxide::cdp::browser_protocol::page::CloseParams::default()),
        )
        .await;

        if let Ok((mut transferred, bytes_map, mut rs, request_map)) = bytes_collected_handle.await
        {
            if let Some(response_map) = rs.response_map {
                let mut url_bytes = HashMap::new();

                if let Some(bytes_map) = bytes_map {
                    let detect_anti_bots =
                        response_map.len() <= 4 && anti_bot_tech == AntiBotTech::None;

                    for (request_id, entry) in response_map {
                        if detect_anti_bots && entry.url.contains("_Incapsula_Resource?") {
                            anti_bot_tech = AntiBotTech::Imperva;
                        }

                        let b = if entry.skipped {
                            0.0
                        } else {
                            match bytes_map.get(&request_id) {
                                Some(f) => *f,
                                _ => 0.0,
                            }
                        };

                        // Skipped responses do not count towards the total.
                        if entry.skipped {
                            transferred -= entry.bytes_transferred;
                        }

                        url_bytes.insert(entry.url, b);
                    }
                }

                page_response.response_map = Some(url_bytes);

                // Prefer the status observed on the document response.
                if let Some(status) = rs.status_code {
                    if let Ok(scode) = status.try_into() {
                        if let Ok(status) = StatusCode::from_u16(scode) {
                            page_response.status_code = status;
                        }
                    }
                }

                set_page_response_headers_raw(&mut rs.headers, &mut page_response);
                store_headers(&page_response, &mut chrome_http_req_res);

                if anti_bot_tech == AntiBotTech::None {
                    let final_url = match &page_response.final_url {
                        Some(final_url) => final_url,
                        _ => target_url,
                    };
                    if let Some(h) = &page_response.headers {
                        if let Some(content) = &page_response.content {
                            anti_bot_tech = detect_anti_bot_tech_response(
                                &final_url,
                                &HeaderSource::HeaderMap(h),
                                &content,
                                None,
                            );
                        }
                    }
                }

                // A 403 from Cloudflare whose body is not a challenge page is
                // treated as a successful response.
                if let Some(content) = &page_response.content {
                    if anti_bot_tech == AntiBotTech::Cloudflare
                        && page_response.status_code == StatusCode::FORBIDDEN
                    {
                        let cf_turnstile = detect_cf_turnstyle(&content);

                        if !cf_turnstile {
                            page_response.status_code = StatusCode::OK;
                        }
                    }
                }

                if !page_set {
                    let _ = tokio::time::timeout(
                        base_timeout,
                        cache_chrome_response(&source, &page_response, chrome_http_req_res),
                    )
                    .await;
                }
            }
            if request_map.is_some() {
                page_response.request_map = request_map;
            }

            page_response.bytes_transferred = Some(transferred);
        }
    }

    page_response.anti_bot_tech = anti_bot_tech;

    Ok(page_response)
}
2086
/// Build a [`PageResponse`] from the rendered bytes.
///
/// `res` becomes the content only when `ok` is true; otherwise the content is
/// `None`. All remaining fields take their default values.
#[cfg(feature = "chrome")]
fn set_page_response(
    ok: bool,
    res: Box<Vec<u8>>,
    status_code: StatusCode,
    final_url: Option<String>,
) -> PageResponse {
    let content = match ok {
        true => Some(res.into()),
        false => None,
    };

    PageResponse {
        content,
        status_code,
        final_url,
        ..Default::default()
    }
}
2102
/// Convert the response headers recorded for the Chrome request and attach
/// them to `page_response`.
///
/// The page response is left unchanged when the converted map is empty.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn set_page_response_headers(
    chrome_http_req_res: &mut ChromeHTTPReqRes,
    page_response: &mut PageResponse,
) {
    let converted = convert_headers(&chrome_http_req_res.response_headers);

    if converted.is_empty() {
        return;
    }

    page_response.headers = Some(converted);
}

/// No-op variant used when the `headers` feature is disabled.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn set_page_response_headers(
    _chrome_http_req_res: &mut ChromeHTTPReqRes,
    _page_response: &mut PageResponse,
) {
    // Headers are not tracked in this configuration.
}
2123
/// Convert raw CDP network headers into a `HeaderMap` and attach it to
/// `page_response`.
///
/// Entries whose name or value is not a valid HTTP header are skipped. The
/// page response is left unchanged when no header could be converted or when
/// `chrome_http_req_res` is `None`.
#[cfg(all(feature = "chrome", feature = "headers"))]
fn set_page_response_headers_raw(
    chrome_http_req_res: &mut Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    page_response: &mut PageResponse,
) {
    use std::str::FromStr;

    if let Some(chrome_headers) = chrome_http_req_res {
        let mut header_map = reqwest::header::HeaderMap::new();

        if let Some(obj) = chrome_headers.inner().as_object() {
            for (index, (key, value)) in obj.iter().enumerate() {
                // JSON strings must be read with `as_str`: rendering them with
                // `to_string` produces the JSON form, i.e. the text wrapped in
                // literal double quotes. Non-string values keep their JSON
                // rendering.
                let text = match value.as_str() {
                    Some(raw) => std::borrow::Cow::Borrowed(raw),
                    None => std::borrow::Cow::Owned(value.to_string()),
                };

                if let (Ok(header_name), Ok(header_value)) = (
                    reqwest::header::HeaderName::from_str(key),
                    reqwest::header::HeaderValue::from_str(&text),
                ) {
                    header_map.insert(header_name, header_value);
                }

                // Safety valve against absurdly large header objects.
                if index > 1000 {
                    break;
                }
            }
        }

        if !header_map.is_empty() {
            page_response.headers = Some(header_map);
        }
    }
}

/// No-op variant used when the `headers` feature is disabled.
#[cfg(all(feature = "chrome", not(feature = "headers")))]
fn set_page_response_headers_raw(
    _chrome_http_req_res: &mut Option<chromiumoxide::cdp::browser_protocol::network::Headers>,
    _page_response: &mut PageResponse,
) {
}
2160
/// Read the page's cookies and attach them to `page_response`.
///
/// Cookies are flattened into a name -> value map (a later cookie with the
/// same name replaces an earlier one) and converted with `convert_headers`.
/// Nothing is set when the cookies cannot be fetched or the converted map is
/// empty.
#[cfg(all(feature = "chrome", feature = "cookies"))]
async fn set_page_response_cookies(page_response: &mut PageResponse, page: &chromiumoxide::Page) {
    let cookies = match page.get_cookies().await {
        Ok(found) => found,
        Err(_) => return,
    };

    let cookies_map: std::collections::HashMap<String, String> = cookies
        .into_iter()
        .map(|cookie| (cookie.name, cookie.value))
        .collect();

    let converted = convert_headers(&cookies_map);

    if !converted.is_empty() {
        page_response.cookies = Some(converted);
    }
}

/// No-op variant used when the `cookies` feature is disabled.
#[cfg(all(feature = "chrome", not(feature = "cookies")))]
async fn set_page_response_cookies(_page_response: &mut PageResponse, _page: &chromiumoxide::Page) {
}
2184
/// Capture a screenshot of `page`.
///
/// With a screenshot configuration, the image is captured through CDP and,
/// depending on the configuration, written to disk (`save`) and/or stored on
/// `page_response.screenshot_bytes` (`bytes`). Without one, a PNG is saved
/// using the `SCREENSHOT_DIRECTORY`, `SCREENSHOT_FULL_PAGE` and
/// `SCREENSHOT_OMIT_BACKGROUND` environment variables. Failures are logged,
/// never returned.
#[cfg(feature = "chrome")]
pub async fn perform_screenshot(
    target_url: &str,
    page: &chromiumoxide::Page,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_response: &mut PageResponse,
) {
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;

    if let Some(ss) = screenshot {
        // File extension derived from the configured format (PNG by default).
        let output_format = string_concat!(
            ".",
            ss.params
                .cdp_params
                .format
                .as_ref()
                .unwrap_or_else(|| &crate::configuration::CaptureScreenshotFormat::Png)
                .to_string()
        );
        let ss_params = chromiumoxide::page::ScreenshotParams::from(ss.params.clone());

        let full_page = ss_params.full_page.unwrap_or_default();
        let omit_background = ss_params.omit_background.unwrap_or_default();
        let mut cdp_params = ss_params.cdp_params;

        cdp_params.optimize_for_speed = Some(true);

        if full_page {
            cdp_params.capture_beyond_viewport = Some(true);
        }

        if omit_background {
            // Fully transparent override for the duration of the capture.
            let _ = page.execute(chromiumoxide::cdp::browser_protocol::emulation::SetDefaultBackgroundColorOverrideParams {
                color: Some(chromiumoxide::cdp::browser_protocol::dom::Rgba {
                    r: 0,
                    g: 0,
                    b: 0,
                    a: Some(0.),
                }),
            })
            .await;
        }

        match page.execute(cdp_params).await {
            Ok(captured) => {
                // Undecodable payloads are silently ignored.
                if let Ok(image) = STANDARD.decode(&captured.data) {
                    if ss.save {
                        let output_path = create_output_path(
                            &ss.output_dir.clone().unwrap_or_else(|| "./storage/".into()),
                            &target_url,
                            &output_format,
                        )
                        .await;
                        let _ = tokio::fs::write(output_path, &image).await;
                    }
                    if ss.bytes {
                        page_response.screenshot_bytes = Some(image);
                    }
                }
            }
            Err(e) => {
                log::error!("failed to take screenshot: {:?} - {:?}", e, target_url)
            }
        };

        if omit_background {
            // Remove the background override again.
            let _ = page.execute(chromiumoxide::cdp::browser_protocol::emulation::SetDefaultBackgroundColorOverrideParams { color: None })
                .await;
        }
    } else {
        let output_path = create_output_path(
            &std::env::var("SCREENSHOT_DIRECTORY")
                .unwrap_or_else(|_| "./storage/".to_string())
                .into(),
            &target_url,
            &".png",
        )
        .await;

        // A flag is on unless the variable is set to something other than "true".
        let env_flag = |name: &str| match std::env::var(name) {
            Ok(t) => t == "true",
            _ => true,
        };

        let params = chromiumoxide::page::ScreenshotParams::builder()
            .format(chromiumoxide::cdp::browser_protocol::page::CaptureScreenshotFormat::Png)
            .full_page(env_flag("SCREENSHOT_FULL_PAGE"))
            .omit_background(env_flag("SCREENSHOT_OMIT_BACKGROUND"))
            .build();

        match page.save_screenshot(params, &output_path).await {
            Ok(_) => log::debug!("saved screenshot: {:?}", output_path),
            Err(e) => log::error!("failed to save screenshot: {:?} - {:?}", e, output_path),
        };
    }
}
2294
/// Resolve the URL the page ended up on.
///
/// When the request has a redirect chain whose last entry carries a URL, that
/// URL is returned if it differs from `target_url`, and `None` if it is the
/// same. In every other case the page's current URL is used (`None` when it
/// cannot be read).
#[cfg(feature = "chrome")]
pub async fn get_last_redirect(
    target_url: &str,
    u: &Option<std::sync::Arc<chromiumoxide::handler::http::HttpRequest>>,
    page: &chromiumoxide::Page,
) -> Option<String> {
    let last_redirect_url = u
        .as_ref()
        .and_then(|http_request| http_request.redirect_chain.last())
        .and_then(|redirect| redirect.url.as_ref());

    match last_redirect_url {
        Some(url) => {
            if target_url != url {
                Some(url.clone())
            } else {
                None
            }
        }
        None => page.url().await.ok().flatten(),
    }
}
2315
2316#[cfg(feature = "cookies")]
2318pub fn get_cookies(res: &Response) -> Option<crate::client::header::HeaderMap> {
2319 use crate::client::header::{HeaderMap, HeaderName, HeaderValue};
2320
2321 let mut headers = HeaderMap::new();
2322
2323 for cookie in res.cookies() {
2324 if let Ok(h) = HeaderValue::from_str(cookie.value()) {
2325 if let Ok(n) = HeaderName::from_str(cookie.name()) {
2326 headers.insert(n, h);
2327 }
2328 }
2329 }
2330
2331 if !headers.is_empty() {
2332 Some(headers)
2333 } else {
2334 None
2335 }
2336}
2337
2338#[cfg(not(feature = "cookies"))]
2339pub fn get_cookies(res: &Response) -> Option<crate::client::header::HeaderMap> {
2341 None
2342}
2343
2344pub(crate) fn block_streaming(res: &Response, only_html: bool) -> bool {
2346 let mut block_streaming = false;
2347
2348 if only_html {
2349 if let Some(content_type) = res.headers().get(crate::client::header::CONTENT_TYPE) {
2350 if let Ok(content_type_str) = content_type.to_str() {
2351 if IGNORE_CONTENT_TYPES.contains(content_type_str) {
2352 block_streaming = true;
2353 }
2354 }
2355 }
2356 }
2357
2358 block_streaming
2359}
2360
2361pub async fn handle_response_bytes(
2363 res: Response,
2364 target_url: &str,
2365 only_html: bool,
2366) -> PageResponse {
2367 let u = res.url().as_str();
2368
2369 let rd = if target_url != u {
2370 Some(u.into())
2371 } else {
2372 None
2373 };
2374
2375 let status_code: StatusCode = res.status();
2376 let headers = res.headers().clone();
2377 #[cfg(feature = "remote_addr")]
2378 let remote_addr = res.remote_addr();
2379 let cookies = get_cookies(&res);
2380
2381 let mut content: Option<Box<Vec<u8>>> = None;
2382 let mut anti_bot_tech = AntiBotTech::default();
2383
2384 if !block_streaming(&res, only_html) {
2385 let mut data = match res.content_length() {
2386 Some(cap) if cap >= MAX_PRE_ALLOCATED_HTML_PAGE_SIZE => {
2387 Vec::with_capacity(cap.max(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE) as usize)
2388 }
2389 _ => Vec::with_capacity(MAX_PRE_ALLOCATED_HTML_PAGE_SIZE_USIZE),
2390 };
2391 let mut stream = res.bytes_stream();
2392 let mut first_bytes = true;
2393
2394 while let Some(item) = stream.next().await {
2395 match item {
2396 Ok(text) => {
2397 if only_html && first_bytes {
2398 first_bytes = false;
2399 if is_binary_file(&text) {
2400 break;
2401 }
2402 }
2403 let limit = *MAX_SIZE_BYTES;
2404
2405 if limit > 0 && data.len() + text.len() > limit {
2406 break;
2407 }
2408
2409 data.put(text)
2410 }
2411 Err(e) => {
2412 log::error!("{e} in {}", target_url);
2413 break;
2414 }
2415 }
2416 }
2417
2418 anti_bot_tech = detect_anti_bot_tech_response(
2419 &target_url,
2420 &HeaderSource::HeaderMap(&headers),
2421 &data,
2422 None,
2423 );
2424 content.replace(Box::new(data.into()));
2425 }
2426
2427 PageResponse {
2428 #[cfg(feature = "headers")]
2429 headers: Some(headers),
2430 #[cfg(feature = "remote_addr")]
2431 remote_addr,
2432 #[cfg(feature = "cookies")]
2433 cookies,
2434 content,
2435 final_url: rd,
2436 status_code,
2437 anti_bot_tech,
2438 ..Default::default()
2439 }
2440}
2441
2442pub async fn handle_response_bytes_writer<'h, O>(
2444 res: Response,
2445 target_url: &str,
2446 only_html: bool,
2447 rewriter: &mut HtmlRewriter<'h, O>,
2448 collected_bytes: &mut Vec<u8>,
2449) -> (PageResponse, bool)
2450where
2451 O: OutputSink + Send + 'static,
2452{
2453 let u = res.url().as_str();
2454
2455 let final_url: Option<String> = if target_url != u {
2456 Some(u.into())
2457 } else {
2458 None
2459 };
2460
2461 let status_code: StatusCode = res.status();
2462 let headers = res.headers().clone();
2463 #[cfg(feature = "remote_addr")]
2464 let remote_addr = res.remote_addr();
2465 let cookies = get_cookies(&res);
2466 let mut anti_bot_tech = AntiBotTech::default();
2467
2468 let mut rewrite_error = false;
2469
2470 if !block_streaming(&res, only_html) {
2471 let mut stream = res.bytes_stream();
2472 let mut first_bytes = true;
2473 let mut data_len = 0;
2474
2475 while let Some(item) = stream.next().await {
2476 match item {
2477 Ok(res_bytes) => {
2478 if only_html && first_bytes {
2479 first_bytes = false;
2480 if is_binary_file(&res_bytes) {
2481 break;
2482 }
2483 }
2484 let limit = *MAX_SIZE_BYTES;
2485 let bytes_len = res_bytes.len();
2486
2487 if limit > 0 && data_len + bytes_len > limit {
2488 break;
2489 }
2490
2491 data_len += bytes_len;
2492
2493 if !rewrite_error {
2494 if rewriter.write(&res_bytes).is_err() {
2495 rewrite_error = true;
2496 }
2497 }
2498
2499 collected_bytes.put(res_bytes);
2500 }
2501 Err(e) => {
2502 log::error!("{e} in {}", target_url);
2503 break;
2504 }
2505 }
2506 }
2507
2508 anti_bot_tech = detect_anti_bot_tech_response(
2509 &target_url,
2510 &HeaderSource::HeaderMap(&headers),
2511 &collected_bytes,
2512 None,
2513 );
2514 }
2515
2516 (
2517 PageResponse {
2518 #[cfg(feature = "headers")]
2519 headers: Some(headers),
2520 #[cfg(feature = "remote_addr")]
2521 remote_addr,
2522 #[cfg(feature = "cookies")]
2523 cookies,
2524 final_url,
2525 status_code,
2526 anti_bot_tech,
2527 ..Default::default()
2528 },
2529 rewrite_error,
2530 )
2531}
2532
2533pub(crate) fn valid_parsing_status(res: &Response) -> bool {
2535 res.status().is_success() || res.status() == 404
2536}
2537
2538async fn fetch_page_html_raw_base(
2540 target_url: &str,
2541 client: &Client,
2542 only_html: bool,
2543) -> PageResponse {
2544 match client.get(target_url).send().await {
2545 Ok(res) if valid_parsing_status(&res) => {
2546 handle_response_bytes(res, target_url, only_html).await
2547 }
2548 Ok(res) => handle_response_bytes(res, target_url, only_html).await,
2549 Err(err) => {
2550 log::info!("error fetching {}", target_url);
2551 let mut page_response = PageResponse::default();
2552
2553 if let Some(status_code) = err.status() {
2554 page_response.status_code = status_code;
2555 } else {
2556 page_response.status_code = crate::page::get_error_http_status_code(&err);
2557 }
2558
2559 page_response.error_for_status = Some(Err(err));
2560 page_response
2561 }
2562 }
2563}
2564
/// Perform a network request to a resource and collect the streamed body.
///
/// Delegates to `fetch_page_html_raw_base` with `only_html` set to `false`.
pub async fn fetch_page_html_raw(target_url: &str, client: &Client) -> PageResponse {
    fetch_page_html_raw_base(target_url, client, false).await
}
2569
2570pub async fn fetch_page_html_raw_only_html(target_url: &str, client: &Client) -> PageResponse {
2572 fetch_page_html_raw_base(target_url, client, false).await
2573}
2574
#[cfg(feature = "decentralized")]
/// Perform a network request to a resource and return the whole body as bytes.
///
/// Returns `None` when the request fails, when the status is rejected by
/// `valid_parsing_status`, or when reading the body fails.
pub async fn fetch_page(target_url: &str, client: &Client) -> Option<Vec<u8>> {
    let res = match client.get(target_url).send().await {
        Ok(res) => res,
        Err(_) => {
            log("- error parsing html bytes {}", &target_url);
            return None;
        }
    };

    if !valid_parsing_status(&res) {
        return None;
    }

    match res.bytes().await {
        Ok(text) => Some(text.into()),
        Err(_) => {
            log("- error fetching {}", &target_url);
            None
        }
    }
}
2593
#[cfg(all(feature = "decentralized", feature = "headers"))]
/// Outcome of a page fetch that also reports the response headers.
pub enum FetchPageResult {
    /// The response status was accepted; holds the response headers and the body
    /// bytes (`None` when the body could not be read).
    Success(reqwest::header::HeaderMap, Option<Vec<u8>>),
    /// A response was received but its status was not accepted; holds the headers.
    NoSuccess(reqwest::header::HeaderMap),
    /// The request itself failed, so there are no headers or body.
    FetchError,
}
2604
#[cfg(all(feature = "decentralized", feature = "headers"))]
/// Perform a network request to a resource and return the headers with the body bytes.
///
/// * `FetchPageResult::Success` - the status passed `valid_parsing_status`; the body
///   is `None` when reading it failed.
/// * `FetchPageResult::NoSuccess` - a response arrived with a rejected status.
/// * `FetchPageResult::FetchError` - the request could not be sent.
pub async fn fetch_page_and_headers(target_url: &str, client: &Client) -> FetchPageResult {
    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            // Headers must be cloned first: reading the body consumes the response.
            let headers = res.headers().clone();
            let b = match res.bytes().await {
                // `Success` stores `Option<Vec<u8>>`, so convert the `Bytes` buffer
                // the same way `fetch_page` does.
                Ok(text) => Some(text.into()),
                Err(_) => {
                    log("- error fetching {}", &target_url);
                    None
                }
            };
            FetchPageResult::Success(headers, b)
        }
        Ok(res) => FetchPageResult::NoSuccess(res.headers().clone()),
        Err(_) => {
            log("- error parsing html bytes {}", &target_url);
            FetchPageResult::FetchError
        }
    }
}
2627
#[cfg(all(not(feature = "fs"), not(feature = "chrome")))]
/// Perform a network request to a resource and collect the streamed body.
///
/// Built when neither the `fs` nor the `chrome` feature is enabled; forwards to
/// `fetch_page_html_raw`.
pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
    fetch_page_html_raw(target_url, client).await
}
2633
#[cfg(all(feature = "fs", not(feature = "chrome")))]
/// Perform a network request to a resource, spilling the streamed body to a temporary file.
///
/// Chunks are kept in memory while the buffer capacity is below 8192 bytes. After
/// that a file named `TMP_DIR` + percent-encoded `target_url` is created, the
/// buffered bytes are written to it, and later chunks are appended to the file. When
/// a file was used, the content is read back from it and the file is removed after a
/// successful read.
///
/// Responses whose status is rejected by `valid_parsing_status` yield metadata only.
/// A failed request yields a response carrying the error and a derived status code.
pub async fn fetch_page_html(target_url: &str, client: &Client) -> PageResponse {
    use crate::bytes::BufMut;
    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

    match client.get(target_url).send().await {
        Ok(res) if valid_parsing_status(&res) => {
            let u = res.url().as_str();

            // Only record a final url when the response url differs from the request.
            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            // Everything read from `res` must be captured before the stream consumes it.
            let status_code = res.status();
            let cookies = get_cookies(&res);
            #[cfg(feature = "headers")]
            let headers = res.headers().clone();
            #[cfg(feature = "remote_addr")]
            let remote_addr = res.remote_addr();
            let mut stream = res.bytes_stream();
            let mut data = Vec::new();
            let mut file: Option<tokio::fs::File> = None;
            let mut file_path = String::new();

            while let Some(item) = stream.next().await {
                match item {
                    Ok(text) => {
                        let wrote_disk = file.is_some();

                        // Stay in memory while the buffer is still small.
                        if !wrote_disk && data.capacity() < 8192 {
                            data.put(text);
                        } else {
                            if !wrote_disk {
                                file_path = string_concat!(
                                    TMP_DIR,
                                    &utf8_percent_encode(target_url, NON_ALPHANUMERIC).to_string()
                                );
                                match tokio::fs::File::create(&file_path).await {
                                    Ok(f) => {
                                        let file = file.insert(f);

                                        data.put(text);

                                        // Move the buffered bytes to disk; the memory
                                        // buffer is cleared only if that write worked.
                                        if let Ok(_) = file.write_all(&data.as_ref()).await {
                                            data.clear();
                                        }
                                    }
                                    // The file could not be created: keep buffering.
                                    _ => data.put(text),
                                };
                            } else {
                                if let Some(f) = file.as_mut() {
                                    // Keep the chunk in memory only if the write failed.
                                    if let Err(_) = f.write_all(&text).await {
                                        data.put(text)
                                    }
                                }
                            }
                        }
                    }
                    Err(e) => {
                        log::error!("{e} in {}", target_url);
                        break;
                    }
                }
            }

            // tokio's `File` completes writes in the background; flush so every chunk
            // has reached the file before it is read back through a second handle.
            if let Some(f) = file.as_mut() {
                let _ = f.flush().await;
            }

            PageResponse {
                #[cfg(feature = "headers")]
                headers: Some(headers),
                #[cfg(feature = "remote_addr")]
                remote_addr,
                #[cfg(feature = "cookies")]
                cookies,
                content: Some(if file.is_some() {
                    let mut buffer = vec![];

                    if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
                        // The temporary file is removed only after a successful read.
                        if let Ok(_) = b.read_to_end(&mut buffer).await {
                            let _ = tokio::fs::remove_file(file_path).await;
                        }
                    }

                    Box::new(buffer.into())
                } else {
                    Box::new(data.into())
                }),
                status_code,
                final_url: rd,
                ..Default::default()
            }
        }
        Ok(res) => {
            let u = res.url().as_str();

            let rd = if target_url != u {
                Some(u.into())
            } else {
                None
            };

            PageResponse {
                #[cfg(feature = "headers")]
                headers: Some(res.headers().clone()),
                #[cfg(feature = "remote_addr")]
                remote_addr: res.remote_addr(),
                #[cfg(feature = "cookies")]
                cookies: get_cookies(&res),
                status_code: res.status(),
                final_url: rd,
                ..Default::default()
            }
        }
        Err(err) => {
            log::info!("error fetching {}", target_url);
            let mut page_response = PageResponse::default();

            if let Some(status_code) = err.status() {
                page_response.status_code = status_code;
            } else {
                page_response.status_code = crate::page::get_error_http_status_code(&err);
            }

            page_response.error_for_status = Some(Err(err));
            page_response
        }
    }
}
2765
#[cfg(all(feature = "fs", feature = "chrome"))]
/// Fetch a page with Chrome, falling back to a raw HTTP request that spills to disk.
///
/// The page is first requested through `fetch_page_html_chrome_base`. If that fails,
/// a plain GET is issued with `client`. Its chunks are kept in memory while the
/// buffer capacity is below 8192 bytes, then written to a file named `TMP_DIR` +
/// percent-encoded `target_url`. When a file was used, the content is read back from
/// it and the file is removed after a successful read.
///
/// Fallback responses whose status is rejected by `valid_parsing_status` yield
/// metadata only. A failed fallback request yields a response carrying the error and
/// a derived status code.
pub async fn fetch_page_html(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
) -> PageResponse {
    use crate::bytes::BufMut;
    use crate::tokio::io::{AsyncReadExt, AsyncWriteExt};
    use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

    match fetch_page_html_chrome_base(
        &target_url,
        &page,
        false,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        None,
        execution_scripts,
        automation_scripts,
        &viewport,
        &request_timeout,
        &track_events,
    )
    .await
    {
        Ok(page) => page,
        _ => {
            log::info!(
                "- error fetching chrome page defaulting to raw http request {}",
                &target_url,
            );

            match client.get(target_url).send().await {
                Ok(res) if valid_parsing_status(&res) => {
                    // Everything read from `res` must be captured before
                    // `bytes_stream` consumes it.
                    #[cfg(feature = "headers")]
                    let headers = res.headers().clone();
                    #[cfg(feature = "remote_addr")]
                    let remote_addr = res.remote_addr();
                    let cookies = get_cookies(&res);
                    let status_code = res.status();
                    let mut stream = res.bytes_stream();
                    let mut data = Vec::new();

                    let mut file: Option<tokio::fs::File> = None;
                    let mut file_path = String::new();

                    while let Some(item) = stream.next().await {
                        match item {
                            Ok(text) => {
                                let wrote_disk = file.is_some();

                                // Stay in memory while the buffer is still small.
                                if !wrote_disk && data.capacity() < 8192 {
                                    data.put(text);
                                } else {
                                    if !wrote_disk {
                                        file_path = string_concat!(
                                            TMP_DIR,
                                            &utf8_percent_encode(target_url, NON_ALPHANUMERIC)
                                                .to_string()
                                        );
                                        match tokio::fs::File::create(&file_path).await {
                                            Ok(f) => {
                                                let file = file.insert(f);

                                                data.put(text);

                                                // Move the buffered bytes to disk; the
                                                // memory buffer is cleared only if that
                                                // write worked.
                                                if let Ok(_) = file.write_all(&data.as_ref()).await
                                                {
                                                    data.clear();
                                                }
                                            }
                                            // The file could not be created: keep buffering.
                                            _ => data.put(text),
                                        };
                                    } else {
                                        if let Some(f) = file.as_mut() {
                                            // Keep the chunk in memory only if the write
                                            // failed; buffering on success would hold
                                            // the whole body in memory as well.
                                            if let Err(_) = f.write_all(&text).await {
                                                data.put(text)
                                            }
                                        }
                                    }
                                }
                            }
                            Err(e) => {
                                log::error!("{e} in {}", target_url);
                                break;
                            }
                        }
                    }

                    // tokio's `File` completes writes in the background; flush so every
                    // chunk has reached the file before it is read back.
                    if let Some(f) = file.as_mut() {
                        let _ = f.flush().await;
                    }

                    PageResponse {
                        #[cfg(feature = "headers")]
                        headers: Some(headers),
                        #[cfg(feature = "remote_addr")]
                        remote_addr,
                        #[cfg(feature = "cookies")]
                        cookies,
                        content: Some(if file.is_some() {
                            let mut buffer = vec![];

                            if let Ok(mut b) = tokio::fs::File::open(&file_path).await {
                                // The temporary file is removed only after a successful read.
                                if let Ok(_) = b.read_to_end(&mut buffer).await {
                                    let _ = tokio::fs::remove_file(file_path).await;
                                }
                            }

                            Box::new(buffer.into())
                        } else {
                            Box::new(data.into())
                        }),
                        status_code,
                        ..Default::default()
                    }
                }

                Ok(res) => PageResponse {
                    #[cfg(feature = "headers")]
                    headers: Some(res.headers().clone()),
                    #[cfg(feature = "remote_addr")]
                    remote_addr: res.remote_addr(),
                    #[cfg(feature = "cookies")]
                    cookies: get_cookies(&res),
                    status_code: res.status(),
                    ..Default::default()
                },
                Err(err) => {
                    log::info!("error fetching {}", target_url);
                    let mut page_response = PageResponse::default();

                    if let Some(status_code) = err.status() {
                        page_response.status_code = status_code;
                    } else {
                        page_response.status_code =
                            crate::page::get_error_http_status_code(&err);
                    }

                    page_response.error_for_status = Some(Err(err));
                    page_response
                }
            }
        }
    }
}
2930
#[cfg(all(not(feature = "fs"), feature = "chrome"))]
/// Fetch a page with Chrome, falling back to a raw HTTP request on failure.
///
/// The Chrome error is logged before `fetch_page_html_raw` is used as the fallback.
pub async fn fetch_page_html(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
) -> PageResponse {
    let chrome_err = match fetch_page_html_chrome_base(
        &target_url,
        &page,
        false,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        None,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
    )
    .await
    {
        Ok(page_response) => return page_response,
        Err(err) => err,
    };

    log::error!("{:?}", chrome_err);

    fetch_page_html_raw(&target_url, &client).await
}
2972
#[cfg(feature = "chrome")]
/// Fetch a page with Chrome, falling back to an in-memory raw HTTP request.
///
/// When `fetch_page_html_chrome_base` fails, the error is logged and a plain GET is
/// issued with `client`. The fallback body is collected in memory and stops growing
/// once a non-zero `MAX_SIZE_BYTES` would be exceeded. Fallback responses whose
/// status is rejected by `valid_parsing_status` yield metadata only. A failed
/// fallback request yields a response carrying the error and a derived status code.
pub async fn fetch_page_html_chrome(
    target_url: &str,
    client: &Client,
    page: &chromiumoxide::Page,
    wait_for: &Option<crate::configuration::WaitFor>,
    screenshot: &Option<crate::configuration::ScreenShotConfig>,
    page_set: bool,
    openai_config: &Option<Box<crate::configuration::GPTConfigs>>,
    execution_scripts: &Option<ExecutionScripts>,
    automation_scripts: &Option<AutomationScripts>,
    viewport: &Option<crate::configuration::Viewport>,
    request_timeout: &Option<Box<std::time::Duration>>,
    track_events: &Option<crate::configuration::ChromeEventTracker>,
) -> PageResponse {
    use crate::bytes::BufMut;

    let chrome_err = match fetch_page_html_chrome_base(
        &target_url,
        &page,
        false,
        true,
        wait_for,
        screenshot,
        page_set,
        openai_config,
        None,
        execution_scripts,
        automation_scripts,
        viewport,
        request_timeout,
        track_events,
    )
    .await
    {
        Ok(page_response) => return page_response,
        Err(err) => err,
    };

    log::error!(
        "{:?}. Error requesting: {} - defaulting to raw http request",
        chrome_err,
        target_url
    );

    let res = match client.get(target_url).send().await {
        Ok(res) => res,
        Err(err) => {
            log::info!("error fetching {}", target_url);

            let status_code = err
                .status()
                .unwrap_or_else(|| crate::page::get_error_http_status_code(&err));

            return PageResponse {
                status_code,
                error_for_status: Some(Err(err)),
                ..Default::default()
            };
        }
    };

    // Rejected statuses only report the response metadata.
    if !valid_parsing_status(&res) {
        return PageResponse {
            #[cfg(feature = "headers")]
            headers: Some(res.headers().clone()),
            #[cfg(feature = "remote_addr")]
            remote_addr: res.remote_addr(),
            #[cfg(feature = "cookies")]
            cookies: get_cookies(&res),
            status_code: res.status(),
            ..Default::default()
        };
    }

    // Everything read from `res` must be captured before the stream consumes it.
    #[cfg(feature = "headers")]
    let headers = res.headers().clone();
    #[cfg(feature = "remote_addr")]
    let remote_addr = res.remote_addr();
    let cookies = get_cookies(&res);
    let status_code = res.status();
    let mut stream = res.bytes_stream();
    let mut body = Vec::new();

    while let Some(item) = stream.next().await {
        let chunk = match item {
            Ok(chunk) => chunk,
            Err(e) => {
                log::error!("{e} in {}", target_url);
                break;
            }
        };

        let limit = *MAX_SIZE_BYTES;

        // A limit of zero disables the size cap.
        if limit > 0 && body.len() + chunk.len() > limit {
            break;
        }

        body.put(chunk);
    }

    PageResponse {
        #[cfg(feature = "headers")]
        headers: Some(headers),
        #[cfg(feature = "remote_addr")]
        remote_addr,
        #[cfg(feature = "cookies")]
        cookies,
        content: Some(Box::new(body.into())),
        status_code,
        ..Default::default()
    }
}
3089
#[cfg(not(feature = "openai"))]
/// Placeholder used when the `openai` feature is disabled.
///
/// All arguments are ignored and the default `OpenAIReturn` is returned.
pub async fn openai_request(
    _gpt_configs: &crate::configuration::GPTConfigs,
    _resource: String,
    _url: &str,
    _prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    Default::default()
}
3100
#[cfg(feature = "openai")]
lazy_static! {
    // `cl100k_base` tokenizer, used for token counting when no tokenizer can be
    // resolved for the configured model.
    static ref CORE_BPE_TOKEN_COUNT: tiktoken_rs::CoreBPE = tiktoken_rs::cl100k_base().unwrap();
    // Semaphore bounding concurrent OpenAI requests, sized from the CPU counts.
    static ref SEM: tokio::sync::Semaphore = {
        let logical = num_cpus::get();
        let physical = num_cpus::get_physical();

        // Logical cores per physical core when they differ, else the logical count.
        let sem_limit = if logical > physical {
            (logical) / (physical)
        } else {
            logical
        };

        let (sem_limit, sem_max) = if logical == physical {
            (sem_limit * physical, 20)
        } else {
            (sem_limit * 4, 10)
        };
        let sem_limit = sem_limit / 3;
        // NOTE(review): despite its name, `sem_max` is applied with `.max()`, so it
        // is the minimum number of permits rather than a cap - confirm the intent.
        tokio::sync::Semaphore::const_new(sem_limit.max(sem_max))
    };
    // Shared OpenAI client built with the library's default configuration.
    static ref CLIENT: async_openai::Client<async_openai::config::OpenAIConfig> =
        async_openai::Client::new();
}
3125
#[cfg(feature = "openai")]
/// Perform a chat completion request to OpenAI for a page resource.
///
/// A permit from `SEM` is held for the duration of the request. The resource is
/// passed through `clean_html`, and through `clean_html_slim` and `clean_html_full`
/// when its token count together with the prompt still exceeds the computed token
/// budget. The request is built from the system prompt(s), an assistant message
/// holding the url and cleaned HTML, and the user prompt when it is not empty.
///
/// # Arguments
///
/// * `gpt_configs` - model, token limit, sampling options, optional api key and schema.
/// * `resource` - the HTML of the page.
/// * `url` - the page url, included in the assistant message.
/// * `prompt` - the user prompt; skipped when empty.
///
/// Returns the first choice's content and the reported token usage. `error` is set
/// when the semaphore cannot be acquired or the assistant message fails to build; API
/// and request-build failures produce an empty response instead.
pub async fn openai_request_base(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    match SEM.acquire().await {
        Ok(permit) => {
            // Base request builder carrying the model and optional sampling settings.
            let mut chat_completion_defaults =
                async_openai::types::CreateChatCompletionRequestArgs::default();
            let gpt_base = chat_completion_defaults
                .max_tokens(gpt_configs.max_tokens)
                .model(&gpt_configs.model);
            let gpt_base = match gpt_configs.user {
                Some(ref user) => gpt_base.user(user),
                _ => gpt_base,
            };
            let gpt_base = match gpt_configs.temperature {
                Some(temp) => gpt_base.temperature(temp),
                _ => gpt_base,
            };
            let gpt_base = match gpt_configs.top_p {
                Some(tp) => gpt_base.top_p(tp),
                _ => gpt_base,
            };

            // Model specific tokenizer; `None` falls back to `CORE_BPE_TOKEN_COUNT`.
            let core_bpe = match tiktoken_rs::get_bpe_from_model(&gpt_configs.model) {
                Ok(bpe) => Some(bpe),
                _ => None,
            };

            let (tokens, prompt_tokens) = match core_bpe {
                Some(ref core_bpe) => (
                    core_bpe.encode_with_special_tokens(&resource),
                    core_bpe.encode_with_special_tokens(&prompt),
                ),
                _ => (
                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&resource),
                    CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                ),
            };

            // Combined token count of the raw resource and the prompt.
            let output_tokens_count = tokens.len() + prompt_tokens.len();

            let mut max_tokens = crate::features::openai::calculate_max_tokens(
                &gpt_configs.model,
                gpt_configs.max_tokens,
                &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                &resource,
                &prompt,
            );

            // Clean the HTML in increasingly aggressive steps while it does not fit
            // the token budget; the budget is recomputed after each step.
            let resource = if output_tokens_count > max_tokens {
                let r = clean_html(&resource);

                max_tokens = crate::features::openai::calculate_max_tokens(
                    &gpt_configs.model,
                    gpt_configs.max_tokens,
                    &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                    &r,
                    &prompt,
                );

                let (tokens, prompt_tokens) = match core_bpe {
                    Some(ref core_bpe) => (
                        core_bpe.encode_with_special_tokens(&r),
                        core_bpe.encode_with_special_tokens(&prompt),
                    ),
                    _ => (
                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
                        CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                    ),
                };

                let output_tokens_count = tokens.len() + prompt_tokens.len();

                if output_tokens_count > max_tokens {
                    let r = clean_html_slim(&r);

                    max_tokens = crate::features::openai::calculate_max_tokens(
                        &gpt_configs.model,
                        gpt_configs.max_tokens,
                        &&crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT_COMPLETION.clone(),
                        &r,
                        &prompt,
                    );

                    let (tokens, prompt_tokens) = match core_bpe {
                        Some(ref core_bpe) => (
                            core_bpe.encode_with_special_tokens(&r),
                            core_bpe.encode_with_special_tokens(&prompt),
                        ),
                        _ => (
                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&r),
                            CORE_BPE_TOKEN_COUNT.encode_with_special_tokens(&prompt),
                        ),
                    };

                    let output_tokens_count = tokens.len() + prompt_tokens.len();

                    // Last resort: the most aggressive cleaning pass.
                    if output_tokens_count > max_tokens {
                        clean_html_full(&r)
                    } else {
                        r
                    }
                } else {
                    r
                }
            } else {
                // The resource fits already; it is still cleaned once.
                clean_html(&resource)
            };

            let mut tokens_used = crate::features::openai_common::OpenAIUsage::default();
            let json_mode = gpt_configs.extra_ai_data;

            // JSON object mode when extra AI data is requested, plain text otherwise;
            // a parseable configured schema overrides both.
            let response_format = {
                let mut mode = if json_mode {
                    async_openai::types::ResponseFormat::JsonObject
                } else {
                    async_openai::types::ResponseFormat::Text
                };

                if let Some(ref structure) = gpt_configs.json_schema {
                    if let Some(ref schema) = structure.schema {
                        if let Ok(mut schema) =
                            crate::features::serde_json::from_str::<serde_json::Value>(&schema)
                        {
                            if json_mode {
                                if let Some(properties) = schema.get_mut("properties") {
                                    // Add a string `js` property to the schema.
                                    if let Some(properties_map) = properties.as_object_mut() {
                                        properties_map.insert(
                                            "js".to_string(),
                                            serde_json::json!({
                                                "type": "string"
                                            }),
                                        );
                                    }
                                }
                            }

                            mode = async_openai::types::ResponseFormat::JsonSchema {
                                json_schema: async_openai::types::ResponseFormatJsonSchema {
                                    description: structure.description.clone(),
                                    name: structure.name.clone(),
                                    schema: if schema.is_null() { None } else { Some(schema) },
                                    strict: structure.strict,
                                },
                            }
                        }
                    }
                }

                mode
            };

            match async_openai::types::ChatCompletionRequestAssistantMessageArgs::default()
                .content(string_concat!("URL: ", url, "\n", "HTML: ", resource))
                .build()
            {
                Ok(resource_completion) => {
                    let mut messages: Vec<async_openai::types::ChatCompletionRequestMessage> =
                        vec![crate::features::openai::BROWSER_ACTIONS_SYSTEM_PROMPT.clone()];

                    if json_mode {
                        messages.push(
                            crate::features::openai::BROWSER_ACTIONS_SYSTEM_EXTRA_PROMPT.clone(),
                        );
                    }

                    messages.push(resource_completion.into());

                    if !prompt.is_empty() {
                        messages.push(
                            match async_openai::types::ChatCompletionRequestUserMessageArgs::default()
                                .content(prompt)
                                .build()
                            {
                                Ok(o) => o,
                                _ => Default::default(),
                            }
                            .into()
                        )
                    }

                    let v = match gpt_base
                        .max_tokens(max_tokens as u32)
                        .messages(messages)
                        .response_format(response_format)
                        .build()
                    {
                        Ok(request) => {
                            // A non-empty configured api key gets its own client;
                            // otherwise the shared `CLIENT` is used.
                            let res = match gpt_configs.api_key {
                                Some(ref key) => {
                                    if !key.is_empty() {
                                        let conf = CLIENT.config().to_owned();
                                        async_openai::Client::with_config(conf.with_api_key(key))
                                            .chat()
                                            .create(request)
                                            .await
                                    } else {
                                        CLIENT.chat().create(request).await
                                    }
                                }
                                _ => CLIENT.chat().create(request).await,
                            };

                            match res {
                                Ok(mut response) => {
                                    let mut choice = response.choices.first_mut();

                                    if let Some(usage) = response.usage.take() {
                                        tokens_used.prompt_tokens = usage.prompt_tokens;
                                        tokens_used.completion_tokens = usage.completion_tokens;
                                        tokens_used.total_tokens = usage.total_tokens;
                                    }

                                    // Only the first choice's content is returned.
                                    match choice.as_mut() {
                                        Some(c) => match c.message.content.take() {
                                            Some(content) => content,
                                            _ => Default::default(),
                                        },
                                        _ => Default::default(),
                                    }
                                }
                                Err(err) => {
                                    log::error!("{:?}", err);
                                    Default::default()
                                }
                            }
                        }
                        _ => Default::default(),
                    };

                    // Release the permit as soon as the request has completed.
                    drop(permit);

                    crate::features::openai_common::OpenAIReturn {
                        response: v,
                        usage: tokens_used,
                        error: None,
                    }
                }
                Err(e) => {
                    let mut d = crate::features::openai_common::OpenAIReturn::default();

                    d.error = Some(e.to_string());

                    d
                }
            }
        }
        Err(e) => {
            let mut d = crate::features::openai_common::OpenAIReturn::default();

            d.error = Some(e.to_string());

            d
        }
    }
}
3390
#[cfg(all(feature = "openai", not(feature = "cache_openai")))]
/// Perform a request to OpenAI Chat without caching.
///
/// Forwards all arguments to `openai_request_base`.
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    openai_request_base(gpt_configs, resource, url, prompt).await
}
3401
#[cfg(all(feature = "openai", feature = "cache_openai"))]
/// Perform a request to OpenAI Chat, answering from the configured cache when possible.
///
/// The cache key is a hash of the url, prompt, model, max tokens, the
/// `extra_ai_data` flag and the resource. A cached entry is returned with
/// `usage.cached` set to `true`; a miss performs the request and stores the result.
/// Without a configured cache the request is performed directly.
pub async fn openai_request(
    gpt_configs: &crate::configuration::GPTConfigs,
    resource: String,
    url: &str,
    prompt: &str,
) -> crate::features::openai_common::OpenAIReturn {
    use std::hash::{Hash, Hasher};

    let Some(cache) = &gpt_configs.cache else {
        return openai_request_base(gpt_configs, resource, url, prompt).await;
    };

    let mut hasher = ahash::AHasher::default();

    url.hash(&mut hasher);
    prompt.hash(&mut hasher);
    gpt_configs.model.hash(&mut hasher);
    gpt_configs.max_tokens.hash(&mut hasher);
    gpt_configs.extra_ai_data.hash(&mut hasher);
    resource.hash(&mut hasher);

    let key = hasher.finish();

    if let Some(mut hit) = cache.get(&key).await {
        hit.usage.cached = true;
        return hit;
    }

    let fresh = openai_request_base(gpt_configs, resource, url, prompt).await;
    let _ = cache.insert(key, fresh.clone()).await;

    fresh
}
3441
/// Return the HTML unchanged as an owned `String`.
pub fn clean_html_raw(html: &str) -> String {
    String::from(html)
}
3446
#[cfg(feature = "openai")]
/// Clean the HTML of content that is of little use to the model.
///
/// Removes `script`, `style`, `link` and `iframe` elements, elements whose `style`
/// attribute contains `display:none`, elements whose `id` or `class` contains `ad`
/// or `tracking`, every `meta` element whose `name` is not `title` or `description`,
/// and all comments. If rewriting fails the input is returned unchanged.
pub fn clean_html_base(html: &str) -> String {
    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};

    // Handler that removes every element matching the selector.
    macro_rules! strip {
        ($selector:literal) => {
            element!($selector, |el| {
                el.remove();
                Ok(())
            })
        };
    }

    let settings = RewriteStrSettings {
        element_content_handlers: vec![
            strip!("script"),
            strip!("style"),
            strip!("link"),
            strip!("iframe"),
            strip!("[style*='display:none']"),
            strip!("[id*='ad']"),
            strip!("[class*='ad']"),
            strip!("[id*='tracking']"),
            strip!("[class*='tracking']"),
            element!("meta", |el| {
                // Only the title and description meta elements are kept.
                let keep = matches!(
                    el.get_attribute("name").as_deref(),
                    Some("title") | Some("description")
                );

                if !keep {
                    el.remove();
                }

                Ok(())
            }),
        ],
        document_content_handlers: vec![doc_comments!(|c| {
            c.remove();
            Ok(())
        })],
        ..RewriteStrSettings::default()
    };

    rewrite_str(html, settings).unwrap_or_else(|_| html.into())
}
3514
#[cfg(feature = "chrome")]
/// Make sure the document carries a `<base>` element with an absolute http(s) `href`.
///
/// * An existing `<base>` whose `href` starts with `http://` or `https://` is kept
///   and the original HTML is returned unchanged.
/// * Any other `<base>` seen before one has been accepted or inserted is removed.
/// * Otherwise `<base href="{base_url}">` is inserted before `</head>`, or
///   `<head><base href="{base_url}"></head>` before `</html>` when no head end tag
///   inserted one. A missing `base_url` yields an empty `href`.
///
/// Returns an empty string for empty input.
pub async fn rewrite_base_tag(html: &str, base_url: &Option<&str>) -> String {
    use lol_html::{element, html_content::ContentType};
    use std::sync::{Arc, OnceLock};

    if html.is_empty() {
        return Default::default();
    }

    // The end-tag handlers must own their captures, so the flag is shared through an
    // `Arc`. Cloning the `OnceLock` itself would give each handler an independent
    // cell, and the `</html>` fallback would insert a second `<base>` after `</head>`
    // had already inserted one.
    let base_tag_inserted: Arc<OnceLock<bool>> = Arc::new(OnceLock::new());
    let already_present = OnceLock::new();

    let base_url_len = base_url.map(|s| s.len());

    let rewriter_settings: lol_html::Settings<'_, '_, lol_html::send::SendHandlerTypes> =
        lol_html::send::Settings {
            element_content_handlers: vec![
                // Keep a valid existing <base>; drop the others.
                element!("base", {
                    |el| {
                        if base_tag_inserted.get().is_none() {
                            if let Some(attr) = el.get_attribute("href") {
                                let valid_http =
                                    attr.starts_with("http://") || attr.starts_with("https://");

                                if valid_http {
                                    let _ = base_tag_inserted.set(true);
                                    let _ = already_present.set(true);
                                } else {
                                    el.remove();
                                }
                            } else {
                                el.remove();
                            }
                        }

                        Ok(())
                    }
                }),
                // Insert the <base> right before </head> when none was accepted.
                element!("head", {
                    |el: &mut lol_html::send::Element| {
                        if let Some(handlers) = el.end_tag_handlers() {
                            let base_tag_inserted = Arc::clone(&base_tag_inserted);
                            let base_url =
                                format!(r#"<base href="{}">"#, base_url.unwrap_or_default());

                            handlers.push(Box::new(move |end| {
                                if base_tag_inserted.get().is_none() {
                                    let _ = base_tag_inserted.set(true);
                                    end.before(&base_url, ContentType::Html);
                                }
                                Ok(())
                            }))
                        }
                        Ok(())
                    }
                }),
                // Fallback: add a whole <head> before </html> when nothing was inserted.
                element!("html", {
                    |el: &mut lol_html::send::Element| {
                        if let Some(handlers) = el.end_tag_handlers() {
                            let base_tag_inserted = Arc::clone(&base_tag_inserted);
                            let base_url = format!(
                                r#"<head><base href="{}"></head>"#,
                                base_url.unwrap_or_default()
                            );

                            handlers.push(Box::new(move |end| {
                                if base_tag_inserted.get().is_none() {
                                    let _ = base_tag_inserted.set(true);
                                    end.before(&base_url, ContentType::Html);
                                }
                                Ok(())
                            }))
                        }
                        Ok(())
                    }
                }),
            ],
            ..lol_html::send::Settings::new_for_handler_types()
        };

    // Reserve room for the input plus the inserted markup around the base url.
    let mut buffer = Vec::with_capacity(
        html.len()
            + match base_url_len {
                Some(l) => l + 29,
                _ => 0,
            },
    );

    let mut rewriter = lol_html::send::HtmlRewriter::new(rewriter_settings, |c: &[u8]| {
        buffer.extend_from_slice(c);
    });

    let mut stream = tokio_stream::iter(html.as_bytes().chunks(*STREAMING_CHUNK_SIZE));

    let mut wrote_error = false;

    while let Some(chunk) = stream.next().await {
        // A valid <base> was found: the original HTML is returned, so stop rewriting.
        if already_present.get().is_some() {
            break;
        }
        if rewriter.write(chunk).is_err() {
            wrote_error = true;
            break;
        }
    }

    if !wrote_error {
        let _ = rewriter.end();
    }

    if already_present.get().is_some() {
        html.to_string()
    } else {
        auto_encoder::auto_encode_bytes(&buffer)
    }
}
3639
#[cfg(feature = "openai")]
/// Clean the HTML to a slim form.
///
/// Removes `script`, `style`, `svg`, `noscript`, `link`, `iframe`, `canvas` and
/// `video` elements, `img` and `picture` elements whose `src` starts with
/// `data:image`, elements whose `style` attribute contains `display:none`, elements
/// whose `id` or `class` contains `ad` or `tracking`, every `meta` element whose
/// `name` is not `title` or `description`, and all comments. If rewriting fails the
/// input is returned unchanged.
pub fn clean_html_slim(html: &str) -> String {
    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};

    // Handler that removes every element matching the selector.
    macro_rules! strip {
        ($selector:literal) => {
            element!($selector, |el| {
                el.remove();
                Ok(())
            })
        };
    }

    // Handler that removes matching elements whose `src` is an inline data image.
    macro_rules! strip_data_image {
        ($selector:literal) => {
            element!($selector, |el| {
                let inline_image = matches!(
                    el.get_attribute("src"),
                    Some(src) if src.starts_with("data:image")
                );

                if inline_image {
                    el.remove();
                }

                Ok(())
            })
        };
    }

    let settings = RewriteStrSettings {
        element_content_handlers: vec![
            strip!("script"),
            strip!("style"),
            strip!("svg"),
            strip!("noscript"),
            strip!("link"),
            strip!("iframe"),
            strip!("canvas"),
            strip!("video"),
            strip_data_image!("img"),
            strip_data_image!("picture"),
            strip!("[style*='display:none']"),
            strip!("[id*='ad']"),
            strip!("[class*='ad']"),
            strip!("[id*='tracking']"),
            strip!("[class*='tracking']"),
            element!("meta", |el| {
                // Only the title and description meta elements are kept.
                let keep = matches!(
                    el.get_attribute("name").as_deref(),
                    Some("title") | Some("description")
                );

                if !keep {
                    el.remove();
                }

                Ok(())
            }),
        ],
        document_content_handlers: vec![doc_comments!(|c| {
            c.remove();
            Ok(())
        })],
        ..RewriteStrSettings::default()
    };

    rewrite_str(html, settings).unwrap_or_else(|_| html.into())
}
3738
#[cfg(feature = "openai")]
/// Clean the HTML to its most reduced form.
///
/// Removes `nav` and `footer` elements, every `meta` element whose `name` is not
/// `viewport` or `charset` (compared in lowercase), all comments, and every
/// attribute except `id`, `class` and `data-*`. If rewriting fails the input is
/// returned unchanged.
pub fn clean_html_full(html: &str) -> String {
    use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};

    match rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![
                element!("nav, footer", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("meta", |el| {
                    let name = el.get_attribute("name").map(|n| n.to_lowercase());

                    if !matches!(name.as_deref(), Some("viewport") | Some("charset")) {
                        el.remove();
                    }

                    Ok(())
                }),
                element!("*", |el| {
                    // `data-` is a prefix: an exact comparison against the literal
                    // "data-" never matches a real attribute and stripped every
                    // `data-*` attribute that was meant to be kept.
                    let keep =
                        |name: &str| matches!(name, "id" | "class") || name.starts_with("data-");

                    // Collect names first; attributes cannot be removed while iterated.
                    let mut remove_list = Vec::new();

                    for attr in el.attributes().iter() {
                        let name = attr.name();

                        if !keep(&name) {
                            remove_list.push(name);
                        }
                    }

                    for attr in remove_list {
                        el.remove_attribute(&attr);
                    }

                    Ok(())
                }),
            ],
            document_content_handlers: vec![doc_comments!(|c| {
                c.remove();
                Ok(())
            })],
            ..RewriteStrSettings::default()
        },
    ) {
        Ok(r) => r,
        _ => html.into(),
    }
}
3790
#[cfg(not(feature = "openai"))]
/// Clean the HTML. Without the `openai` feature this delegates to `clean_html_raw`.
pub fn clean_html(html: &str) -> String {
    clean_html_raw(html)
}
3796
#[cfg(all(feature = "openai", not(feature = "openai_slim_fit")))]
/// Clean the HTML. With `openai` enabled and `openai_slim_fit` disabled this
/// delegates to `clean_html_base`.
pub fn clean_html(html: &str) -> String {
    clean_html_base(html)
}
3802
#[cfg(all(feature = "openai", feature = "openai_slim_fit"))]
/// Clean the HTML. With both `openai` and `openai_slim_fit` enabled this
/// delegates to `clean_html_slim`.
pub fn clean_html(html: &str) -> String {
    clean_html_slim(html)
}
3808
#[cfg(not(feature = "openai"))]
/// Fallback used when the `openai` feature is disabled: no cleaning is
/// performed and an owned copy of the input is returned as-is.
pub fn clean_html_slim(html: &str) -> String {
    String::from(html)
}
3814
3815pub fn log(message: &'static str, data: impl AsRef<str>) {
3817 if log_enabled!(Level::Info) {
3818 info!("{message} - {}", data.as_ref());
3819 }
3820}
3821
#[cfg(feature = "control")]
#[derive(PartialEq, Debug)]
/// Control signal broadcast through `CONTROLLER` together with a target string.
pub enum Handler {
    /// Initial value of the control channel; also sent by `reset`.
    Start,
    /// Sent by `pause`.
    Pause,
    /// Sent by `resume`.
    Resume,
    /// Sent by `shutdown`.
    Shutdown,
}
3835
#[cfg(feature = "control")]
lazy_static! {
    /// Global control channel: a tokio `watch` sender/receiver pair carrying
    /// `(target, Handler)` messages, wrapped in an `Arc<RwLock<..>>`.
    /// The channel is seeded with `("handles", Handler::Start)`.
    pub static ref CONTROLLER: std::sync::Arc<tokio::sync::RwLock<(tokio::sync::watch::Sender<(String, Handler)>,
        tokio::sync::watch::Receiver<(String, Handler)>)>> =
        std::sync::Arc::new(tokio::sync::RwLock::new(tokio::sync::watch::channel(("handles".to_string(), Handler::Start))));
}
3843
#[cfg(feature = "control")]
/// Broadcast `Handler::Pause` for `target` on the global `CONTROLLER` channel.
///
/// A failed send is ignored on purpose (best effort).
pub async fn pause(target: &str) {
    let controller = CONTROLLER.write().await;
    let _ = controller.0.send((target.into(), Handler::Pause));
}
3856
#[cfg(feature = "control")]
/// Broadcast `Handler::Resume` for `target` on the global `CONTROLLER` channel.
///
/// A failed send is ignored on purpose (best effort).
pub async fn resume(target: &str) {
    let controller = CONTROLLER.write().await;
    let _ = controller.0.send((target.into(), Handler::Resume));
}
3869
#[cfg(feature = "control")]
/// Broadcast `Handler::Shutdown` for `target` on the global `CONTROLLER` channel.
///
/// A failed send is ignored on purpose (best effort).
pub async fn shutdown(target: &str) {
    let controller = CONTROLLER.write().await;
    let _ = controller.0.send((target.into(), Handler::Shutdown));
}
3882
#[cfg(feature = "control")]
/// Broadcast `Handler::Start` for `target` on the global `CONTROLLER` channel,
/// returning the target to the channel's initial handler value.
///
/// A failed send is ignored on purpose (best effort).
pub async fn reset(target: &str) {
    let controller = CONTROLLER.write().await;
    let _ = controller.0.send((target.into(), Handler::Start));
}
3895
3896pub(crate) fn setup_website_selectors(url: &str, allowed: AllowedDomainTypes) -> RelativeSelectors {
3898 let subdomains = allowed.subdomains;
3899 let tld = allowed.tld;
3900
3901 crate::page::get_page_selectors_base(url, subdomains, tld)
3902}
3903
#[derive(Debug, Default, Clone, Copy)]
/// Flags describing which related domains are allowed; both default to `false`.
pub struct AllowedDomainTypes {
    /// Subdomains flag, forwarded to the selector setup.
    pub subdomains: bool,
    /// TLD flag, forwarded to the selector setup.
    pub tld: bool,
}
3912
impl AllowedDomainTypes {
    /// Create the flags from explicit `subdomains` and `tld` values.
    pub fn new(subdomains: bool, tld: bool) -> Self {
        Self { subdomains, tld }
    }
}
3919
3920pub(crate) fn modify_selectors(
3922 prior_domain: &Option<Box<Url>>,
3923 domain: &str,
3924 domain_parsed: &mut Option<Box<Url>>,
3925 url: &mut Box<CaseInsensitiveString>,
3926 base: &mut RelativeSelectors,
3927 allowed: AllowedDomainTypes,
3928) {
3929 *domain_parsed = parse_absolute_url(domain);
3930 *url = Box::new(domain.into());
3931 let s = setup_website_selectors(url.inner(), allowed);
3932 base.0 = s.0;
3933 base.1 = s.1;
3934 if let Some(prior_domain) = prior_domain {
3935 if let Some(dname) = prior_domain.host_str() {
3936 base.2 = dname.into();
3937 }
3938 }
3939}
3940
/// Return everything after the last `/` in `path`.
///
/// Yields an empty string when `path` ends with `/`, and the whole input when
/// it contains no `/` at all.
pub fn get_last_segment(path: &str) -> &str {
    match path.rsplit_once('/') {
        Some((_, segment)) => segment,
        None => path,
    }
}
3954
/// Return the part of `url` starting at the first `/` that follows the first `//`.
///
/// Falls back to `"/"` when `url` has no `//` or nothing but a host after it.
pub(crate) fn get_path_from_url(url: &str) -> &str {
    let authority_start = match url.find("//") {
        Some(marker) => marker + 2,
        None => return "/",
    };

    match url[authority_start..].find('/') {
        Some(offset) => &url[authority_start + offset..],
        None => "/",
    }
}
3970
/// Return the host portion of `url`.
///
/// Everything up to and including the first `//` is skipped when present; the
/// result then runs up to (not including) the next `/`, or to the end of the
/// input when there is none.
pub(crate) fn get_domain_from_url(url: &str) -> &str {
    let after_scheme = match url.find("//") {
        Some(marker) => &url[marker + 2..],
        None => url,
    };

    match after_scheme.find('/') {
        Some(end) => &after_scheme[..end],
        None => after_scheme,
    }
}
3989
/// Report whether `url` starts with one of the supported schemes:
/// `https://`, `http://`, `file://` or `ftp://` (case-sensitive).
pub fn networking_capable(url: &str) -> bool {
    const SCHEMES: [&str; 4] = ["https://", "http://", "file://", "ftp://"];

    SCHEMES.iter().any(|scheme| url.starts_with(*scheme))
}
3997
/// Force `u` onto the `https://` scheme.
///
/// Everything up to and including the first `://` is replaced by `https://`;
/// when `u` has no `://`, `https://` is simply prepended.
pub fn prepare_url(u: &str) -> String {
    let rest = match u.find("://") {
        // `find` yields a byte offset and `://` is three ASCII bytes, so
        // `index + 3` is always a valid char boundary to slice at. Counting
        // characters instead would cut at the wrong place for non-ASCII input.
        Some(index) => &u[index + 3..],
        None => u,
    };

    format!("https://{}", rest)
}
4012
/// Produce a normalized copy of `html`.
///
/// The rewrite strips `href` from anchors, removes `script`, `style`,
/// `iframe`, `base` and `noscript` elements, and drops every attribute except
/// `id`, `class` and `data-*`. Input is fed to the rewriter in
/// `STREAMING_CHUNK_SIZE` chunks. If a write fails, processing stops and
/// whatever output was produced so far is returned.
pub(crate) async fn normalize_html(html: &[u8]) -> Vec<u8> {
    use lol_html::{element, send::Settings};

    let mut output = Vec::new();

    let mut rewriter = HtmlRewriter::new(
        Settings {
            element_content_handlers: vec![
                element!("a[href]", |el| {
                    el.remove_attribute("href");
                    Ok(())
                }),
                element!("script, style, iframe, base, noscript", |el| {
                    el.remove();
                    Ok(())
                }),
                element!("*", |el| {
                    // Collect names first: attributes cannot be removed while
                    // the attribute list is still being iterated.
                    let mut remove_attr = vec![];

                    for attr in el.attributes() {
                        let name = attr.name();
                        let remove =
                            !(name.starts_with("data-") || name == "id" || name == "class");
                        if remove {
                            remove_attr.push(name);
                        }
                    }

                    for name in remove_attr {
                        el.remove_attribute(&name);
                    }

                    Ok(())
                }),
            ],
            ..Settings::new_send()
        },
        // Output sink: append every rewritten chunk to `output`.
        |c: &[u8]| output.extend_from_slice(c),
    );

    let chunks = html.chunks(*STREAMING_CHUNK_SIZE);
    let mut stream = tokio_stream::iter(chunks);
    let mut wrote_error = false;

    while let Some(chunk) = stream.next().await {
        if rewriter.write(chunk).is_err() {
            wrote_error = true;
            break;
        }
    }

    // Only finalize after a clean run; an error from `end` is ignored.
    if !wrote_error {
        let _ = rewriter.end();
    }

    output
}
4071
4072pub(crate) async fn hash_html(html: &[u8]) -> u64 {
4074 let normalized_html = normalize_html(html).await;
4075
4076 if !normalized_html.is_empty() {
4077 use std::hash::{Hash, Hasher};
4078 let mut s = ahash::AHasher::default();
4079 normalized_html.hash(&mut s);
4080 let key = s.finish();
4081 key
4082 } else {
4083 Default::default()
4084 }
4085}
4086
#[cfg(feature = "tracing")]
/// Spawn `future` as a tokio task named `task_name`.
///
/// # Panics
///
/// Panics with "failed to spawn task" if the task builder returns an error.
pub(crate) fn spawn_task<F>(task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
where
    F: std::future::Future + Send + 'static,
    F::Output: Send + 'static,
{
    tokio::task::Builder::new()
        .name(task_name)
        .spawn(future)
        .expect("failed to spawn task")
}
4099
#[cfg(not(feature = "tracing"))]
#[allow(unused)]
/// Spawn `future` as a tokio task. The task name is ignored without the
/// `tracing` feature.
pub(crate) fn spawn_task<F>(_task_name: &str, future: F) -> tokio::task::JoinHandle<F::Output>
where
    F: std::future::Future + Send + 'static,
    F::Output: Send + 'static,
{
    tokio::task::spawn(future)
}
4110
#[cfg(feature = "tracing")]
/// Spawn `future` onto the join `set` as a task named `task_name` and return
/// its abort handle.
///
/// # Panics
///
/// Panics with "set should spawn" if the task builder returns an error.
pub(crate) fn spawn_set<F, T>(
    task_name: &str,
    set: &mut tokio::task::JoinSet<T>,
    future: F,
) -> tokio::task::AbortHandle
where
    F: Future<Output = T>,
    F: Send + 'static,
    T: Send + 'static,
{
    set.build_task()
        .name(task_name)
        .spawn(future)
        .expect("set should spawn")
}
4128
#[cfg(not(feature = "tracing"))]
/// Spawn `future` onto the join `set` and return its abort handle. The task
/// name is ignored without the `tracing` feature.
pub(crate) fn spawn_set<F, T>(
    _task_name: &str,
    set: &mut tokio::task::JoinSet<T>,
    future: F,
) -> tokio::task::AbortHandle
where
    F: Future<Output = T>,
    F: Send + 'static,
    T: Send + 'static,
{
    set.spawn(future)
}
4143
#[cfg(feature = "balance")]
/// Delay (100 ms) applied by `get_semaphore` when the detected CPU state is `2`.
const REBALANCE_TIME: std::time::Duration = std::time::Duration::from_millis(100);
4147
#[cfg(feature = "balance")]
/// Pick the semaphore to use based on the detected CPU state.
///
/// When `detect` is `false` the state is treated as `0` and the caller's
/// `semaphore` is returned. A state of `1` or more selects the shared
/// `crate::website::SEM_SHARED` semaphore; a state of exactly `2` additionally
/// sleeps for `REBALANCE_TIME` first.
pub async fn get_semaphore(semaphore: &Arc<Semaphore>, detect: bool) -> &Arc<Semaphore> {
    // NOTE(review): presumably higher values mean higher CPU load — confirm
    // against `detect_system::get_global_cpu_state`.
    let cpu_load = if detect {
        crate::utils::detect_system::get_global_cpu_state().await
    } else {
        0
    };

    if cpu_load == 2 {
        tokio::time::sleep(REBALANCE_TIME).await;
    }

    if cpu_load >= 1 {
        &*crate::website::SEM_SHARED
    } else {
        semaphore
    }
}
4167
/// Report whether the crawl has run for at least `crawl_timeout`.
///
/// Returns `false` when either the timeout or the start instant is missing.
pub fn crawl_duration_expired(crawl_timeout: &Option<Duration>, start: &Option<Instant>) -> bool {
    match (crawl_timeout, start) {
        (Some(limit), Some(started)) => started.elapsed() >= *limit,
        _ => false,
    }
}
4174
/// Lowercase byte markers that `is_html_content_check` searches for
/// (case-insensitively) to decide whether content is HTML.
static HTML_TAGS: phf::Set<&'static [u8]> = phf_set! {
    b"<!doctype html",
    b"<html",
    b"<document",
};
4181
4182pub fn is_html_content_check(bytes: &[u8]) -> bool {
4184 let check_bytes = if bytes.len() > 1024 {
4185 &bytes[..1024]
4186 } else {
4187 bytes
4188 };
4189
4190 for tag in HTML_TAGS.iter() {
4191 if check_bytes
4192 .windows(tag.len())
4193 .any(|window| window.eq_ignore_ascii_case(tag))
4194 {
4195 return true;
4196 }
4197 }
4198
4199 false
4200}
4201
#[cfg(not(feature = "balance"))]
/// Return the caller's `semaphore` unchanged. Without the `balance` feature no
/// CPU detection takes place and `_detect` is ignored.
pub async fn get_semaphore(semaphore: &Arc<Semaphore>, _detect: bool) -> &Arc<Semaphore> {
    semaphore
}
4207
#[derive(Debug)]
#[cfg(feature = "smart")]
/// Output sink that buffers rewritten HTML and hands the finished buffer over
/// through a oneshot channel.
pub(crate) struct HtmlOutputSink {
    /// Bytes accumulated from the chunks received so far.
    pub(crate) data: Vec<u8>,
    /// Channel used to deliver `data`; `None` once the buffer has been sent.
    pub(crate) sender: Option<tokio::sync::oneshot::Sender<Vec<u8>>>,
}
4217
#[cfg(feature = "smart")]
impl HtmlOutputSink {
    /// Create a sink with an empty buffer that will deliver its data through `sender`.
    pub(crate) fn new(sender: tokio::sync::oneshot::Sender<Vec<u8>>) -> Self {
        let sender = Some(sender);

        Self {
            data: Vec::new(),
            sender,
        }
    }
}
4228
#[cfg(feature = "smart")]
impl OutputSink for HtmlOutputSink {
    /// Append `chunk` to the buffer. On an empty chunk the accumulated buffer
    /// is moved out and sent through the channel (at most once; a failed send
    /// is ignored).
    fn handle_chunk(&mut self, chunk: &[u8]) {
        self.data.extend_from_slice(chunk);

        if !chunk.is_empty() {
            return;
        }

        if let Some(sender) = self.sender.take() {
            let _ = sender.send(std::mem::take(&mut self.data));
        }
    }
}
4241
#[cfg(feature = "tracing")]
/// Emit an info-level `fetch <link>` event through `tracing`.
pub fn emit_log(link: &str) {
    tracing::info!("fetch {}", &link);
}
#[cfg(not(feature = "tracing"))]
/// Emit an info-level `fetch <link>` message through `log`.
pub fn emit_log(link: &str) {
    log::info!("fetch {}", &link);
}
4252
#[cfg(feature = "tracing")]
/// Emit an info-level `shutdown <link>` event through `tracing`.
pub fn emit_log_shutdown(link: &str) {
    tracing::info!("shutdown {}", &link);
}
#[cfg(not(feature = "tracing"))]
/// Emit an info-level `shutdown <link>` message through `log`.
pub fn emit_log_shutdown(link: &str) {
    log::info!("shutdown {}", &link);
}