1use crate::errors::{Result, SpiderError};
8use crate::protocol::protocol_adapter::ProtocolAdapter;
9use arc_swap::ArcSwap;
10use serde_json::Value;
11use std::sync::Arc;
12use std::time::{Duration, Instant};
13use tokio::time::sleep;
14
/// Page handle that forwards browser operations to a swappable [`ProtocolAdapter`].
///
/// The adapter lives in an [`ArcSwap`], so it can be replaced through `&self`
/// (see `set_adapter` / `set_adapter_arc`) while other methods keep loading it.
pub struct SpiderPage {
    /// Currently active protocol adapter; every operation loads it afresh.
    adapter: ArcSwap<ProtocolAdapter>,
}
24
/// Describes what to read from the first element matched by a CSS selector.
///
/// A plain `&str` converts into [`FieldSelector::Text`] through `From`.
/// The type only borrows string slices, so comparing and hashing it is cheap.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FieldSelector<'a> {
    /// Read the element's text; the payload is the CSS selector.
    Text(&'a str),
    /// Read a named attribute of the matched element.
    Attr {
        /// CSS selector locating the element.
        selector: &'a str,
        /// Name of the attribute whose value is wanted.
        attribute: &'a str,
    },
}
45
46impl<'a> From<&'a str> for FieldSelector<'a> {
47 fn from(s: &'a str) -> Self {
48 Self::Text(s)
49 }
50}
51
52impl SpiderPage {
/// Creates a page that owns `adapter`, placing it behind a freshly allocated `Arc`.
pub fn new(adapter: ProtocolAdapter) -> Self {
    Self {
        adapter: ArcSwap::from_pointee(adapter),
    }
}
63
/// Creates a page around an adapter that is already shared behind an `Arc`.
pub fn from_arc(adapter: Arc<ProtocolAdapter>) -> Self {
    Self {
        adapter: ArcSwap::from(adapter),
    }
}
70
/// Loads the adapter that is currently installed on this page.
///
/// The returned guard dereferences to the stored `Arc<ProtocolAdapter>`.
#[inline]
pub(crate) fn adapter(&self) -> arc_swap::Guard<Arc<ProtocolAdapter>> {
    self.adapter.load()
}
76
/// Navigates the page to `url` by delegating to the adapter's `navigate`.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn goto(&self, url: &str) -> Result<()> {
    self.adapter().navigate(url).await
}
85
/// Navigates to `url` through the adapter's `navigate_fast` variant.
///
/// NOTE(review): how this differs from `goto` is decided by the adapter and is
/// not visible here — confirm against `ProtocolAdapter::navigate_fast`.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn goto_fast(&self, url: &str) -> Result<()> {
    self.adapter().navigate_fast(url).await
}
92
/// Navigates to `url` through the adapter's `navigate_dom` variant.
///
/// NOTE(review): presumably returns once the DOM is available rather than on
/// full load — confirm against `ProtocolAdapter::navigate_dom`.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn goto_dom(&self, url: &str) -> Result<()> {
    self.adapter().navigate_dom(url).await
}
100
/// Steps one entry back in session history by evaluating `window.history.back()`.
///
/// No explicit wait for the resulting navigation is performed here.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn go_back(&self) -> Result<()> {
    self.adapter().evaluate("window.history.back()").await?;
    Ok(())
}
106
/// Steps one entry forward in session history by evaluating `window.history.forward()`.
///
/// No explicit wait for the resulting navigation is performed here.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn go_forward(&self) -> Result<()> {
    self.adapter().evaluate("window.history.forward()").await?;
    Ok(())
}
112
/// Reloads the current document by evaluating `window.location.reload()`.
///
/// No explicit wait for the reload to finish is performed here.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn reload(&self) -> Result<()> {
    self.adapter().evaluate("window.location.reload()").await?;
    Ok(())
}
118
/// Returns the page HTML, optionally waiting for the page to settle first.
///
/// With `wait_ms == 0` the current HTML is returned immediately and none of the
/// checks below run. Otherwise the method:
///
/// 1. returns at once if the HTML is already at least `min_length` bytes and is
///    neither an interstitial nor a rate-limit page;
/// 2. waits for network idle, passing `wait_ms` as the timeout;
/// 3. if an interstitial is showing, re-polls on a fixed back-off schedule
///    (30 s in total, independent of `wait_ms`);
/// 4. if the HTML is still shorter than `min_length`, polls a little longer and
///    keeps the longest snapshot seen.
///
/// A failing `get_html` is treated as an empty string throughout.
///
/// # Errors
/// Returns `SpiderError::Blocked` when the page is still an interstitial after
/// the back-off schedule or looks like a rate-limit page, and propagates any
/// error from `wait_for_network_idle`.
pub async fn content(&self, wait_ms: u64, min_length: usize) -> Result<String> {
    if wait_ms > 0 {
        // Fast path: skip all waiting when the page already looks like real content.
        let early_html = self.adapter().get_html().await.unwrap_or_default();
        if early_html.len() >= min_length
            && !Self::is_interstitial_content(&early_html)
            && !Self::is_rate_limit_content(&early_html)
        {
            return Ok(early_html);
        }
        self.wait_for_network_idle(wait_ms).await?;
    }

    let mut html = self.adapter().get_html().await.unwrap_or_default();

    if wait_ms > 0 && Self::is_interstitial_content(&html) {
        // Give a challenge page time to resolve itself; re-check after each pause.
        let interstitial_waits: &[u64] = &[2000, 2000, 3000, 4000, 5000, 7000, 7000];
        for &wait in interstitial_waits {
            sleep(Duration::from_millis(wait)).await;
            html = self.adapter().get_html().await.unwrap_or_default();
            if !Self::is_interstitial_content(&html) {
                break;
            }
            // NOTE(review): `is_interstitial_content` already returns false above
            // 15 000 bytes, so this branch looks unreachable after the check above.
            if html.len() > 15_000 {
                break;
            }
        }
        if Self::is_interstitial_content(&html) {
            return Err(SpiderError::Blocked(
                "Page stuck on interstitial challenge".into(),
            ));
        }
    }

    if wait_ms > 0 && Self::is_rate_limit_content(&html) {
        return Err(SpiderError::Blocked(
            "Rate limit exceeded (site-level)".into(),
        ));
    }

    if wait_ms > 0 && html.len() < min_length {
        // Short page: re-sample with growing pauses, only ever keeping a longer snapshot.
        let increments: &[u64] = &[300, 500, 800, 1200];
        for &extra in increments {
            sleep(Duration::from_millis(extra)).await;
            let updated = self.adapter().get_html().await.unwrap_or_default();
            if updated.len() > html.len() {
                html = updated;
            }
            if html.len() >= min_length {
                break;
            }
        }
        if html.len() < min_length {
            // Last resort: poll once per second for roughly three more seconds.
            let poll_deadline = Instant::now() + Duration::from_millis(3000);
            while Instant::now() < poll_deadline {
                sleep(Duration::from_millis(1000)).await;
                let polled = self.adapter().get_html().await.unwrap_or_default();
                if polled.len() > html.len() {
                    html = polled;
                }
                if html.len() >= min_length {
                    break;
                }
            }
        }
    }

    // Best effort: the HTML is returned even if it never reached `min_length`.
    Ok(html)
}
227
/// Returns the adapter's current HTML snapshot with no waiting or checks.
///
/// # Errors
/// Unlike `content`, a failing `get_html` is reported to the caller.
pub async fn raw_content(&self) -> Result<String> {
    self.adapter().get_html().await
}
233
/// Polls the page HTML until it looks like real content or `max_wait_ms` elapses.
///
/// A snapshot is accepted as soon as it is at least `min_content_length` bytes
/// long and is neither an interstitial nor a rate-limit page. Between polls the
/// task sleeps for `poll_interval_ms`, clipped to the time left before the
/// deadline. When the deadline passes, one final snapshot is fetched and
/// returned as-is, so this method never returns an error; a failing `get_html`
/// is treated as an empty string.
pub async fn content_with_early_return(
    &self,
    max_wait_ms: u64,
    min_content_length: usize,
    poll_interval_ms: u64,
) -> Result<String> {
    let deadline = Instant::now() + Duration::from_millis(max_wait_ms);
    while Instant::now() < deadline {
        let html = self.adapter().get_html().await.unwrap_or_default();
        if html.len() >= min_content_length
            && !Self::is_interstitial_content(&html)
            && !Self::is_rate_limit_content(&html)
        {
            return Ok(html);
        }
        // Fetching the HTML takes time, so re-measure what is left before sleeping.
        let remaining = deadline.saturating_duration_since(Instant::now());
        if remaining.is_zero() {
            break;
        }
        let wait = Duration::from_millis(poll_interval_ms).min(remaining);
        sleep(wait).await;
    }
    // Timed out: hand back whatever the page holds now, without further checks.
    Ok(self.adapter().get_html().await.unwrap_or_default())
}
270
/// Returns the page HTML after waiting, within `max_wait_ms`, for the page to settle.
///
/// Phases, each bounded by the overall deadline:
///
/// 1. return immediately if the HTML is already at least `min_content_length`
///    bytes and is neither an interstitial nor a rate-limit page;
/// 2. wait up to 5 s for `document.readyState` to reach `interactive` or `complete`;
/// 3. run an in-page idle check (no resource or DOM activity for 400 ms), capped
///    at 8 s and skipped when 500 ms or less remains;
/// 4. re-sample the HTML and return it if it now qualifies;
/// 5. if an interstitial is showing, back off for at most `interstitial_budget_ms`;
/// 6. if the HTML is still too short, poll once per second until the deadline.
///
/// A failing `get_html` is treated as an empty string throughout.
///
/// # Errors
/// Returns `SpiderError::Blocked` when the page is still an interstitial after
/// phase 5, or when it looks like a rate-limit page.
pub async fn content_with_network_idle(
    &self,
    max_wait_ms: u64,
    min_content_length: usize,
    interstitial_budget_ms: u64,
) -> Result<String> {
    let deadline = Instant::now() + Duration::from_millis(max_wait_ms);

    let mut html = self.adapter().get_html().await.unwrap_or_default();
    if html.len() >= min_content_length
        && !Self::is_interstitial_content(&html)
        && !Self::is_rate_limit_content(&html)
    {
        return Ok(html);
    }

    // Phase 2: wait for the DOM to be at least interactive (evaluate errors just retry).
    let dom_deadline = deadline.min(Instant::now() + Duration::from_millis(5000));
    while Instant::now() < dom_deadline {
        let state = self.adapter().evaluate("document.readyState").await;
        if let Ok(val) = state {
            let s = val.as_str().unwrap_or("");
            if s == "interactive" || s == "complete" {
                break;
            }
        }
        sleep(Duration::from_millis(200)).await;
    }

    // Phase 3: the script resolves once nothing has happened for `idle_ms`, or
    // once its own deadline (`idle_check_ms` from now) passes.
    let idle_ms: u64 = 400;
    let idle_check_ms = {
        let remaining = deadline.saturating_duration_since(Instant::now());
        remaining.as_millis().min(8000) as u64
    };
    if idle_check_ms > 500 {
        let js = format!(
            r#"
            new Promise((resolve) => {{
                let lastActivity = Date.now();
                const idleThreshold = {idle_ms};
                const deadline = Date.now() + {idle_check_ms};
                const perfObs = new PerformanceObserver(() => {{ lastActivity = Date.now(); }});
                try {{ perfObs.observe({{ entryTypes: ['resource'] }}); }} catch(e) {{}}
                const mutObs = new MutationObserver(() => {{ lastActivity = Date.now(); }});
                mutObs.observe(document.documentElement, {{ childList: true, subtree: true, attributes: true }});
                const check = () => {{
                    const now = Date.now();
                    if (now >= deadline || (now - lastActivity >= idleThreshold)) {{
                        perfObs.disconnect(); mutObs.disconnect(); resolve(true); return;
                    }}
                    setTimeout(check, 100);
                }};
                setTimeout(check, idleThreshold);
            }})
            "#
        );
        // If the script cannot be evaluated, fall back to a short fixed pause.
        if self.adapter().evaluate(&js).await.is_err() {
            sleep(Duration::from_millis(500)).await;
        }
    }

    html = self.adapter().get_html().await.unwrap_or_default();
    if html.len() >= min_content_length
        && !Self::is_interstitial_content(&html)
        && !Self::is_rate_limit_content(&html)
    {
        return Ok(html);
    }

    if Self::is_interstitial_content(&html) {
        // Phase 5: back-off bounded by both the interstitial budget and the deadline.
        let i_deadline =
            deadline.min(Instant::now() + Duration::from_millis(interstitial_budget_ms));
        let waits: &[u64] = &[2000, 2000, 3000, 4000, 5000, 7000, 10000];
        for &wait in waits {
            if Instant::now() >= i_deadline {
                break;
            }
            let remaining = i_deadline.saturating_duration_since(Instant::now());
            let actual_wait = Duration::from_millis(wait).min(remaining);
            sleep(actual_wait).await;
            html = self.adapter().get_html().await.unwrap_or_default();
            if !Self::is_interstitial_content(&html) {
                break;
            }
            // NOTE(review): `is_interstitial_content` already returns false above
            // 15 000 bytes, so this branch looks unreachable after the check above.
            if html.len() > 15_000 {
                break;
            }
        }
        if Self::is_interstitial_content(&html) {
            return Err(SpiderError::Blocked(
                "Page stuck on interstitial challenge".into(),
            ));
        }
    }

    if Self::is_rate_limit_content(&html) {
        return Err(SpiderError::Blocked(
            "Rate limit exceeded (site-level)".into(),
        ));
    }

    if html.len() < min_content_length {
        // Phase 6: keep the longest snapshot seen. The fixed 1 s sleep is not
        // clipped, so the last iteration can end slightly after the deadline.
        while Instant::now() < deadline {
            sleep(Duration::from_millis(1000)).await;
            let polled = self.adapter().get_html().await.unwrap_or_default();
            if polled.len() > html.len() {
                html = polled;
            }
            if html.len() >= min_content_length {
                break;
            }
        }
    }

    // Best effort: the HTML is returned even if it never reached `min_content_length`.
    Ok(html)
}
414
415 pub async fn title(&self) -> Result<String> {
421 let val = self.adapter().evaluate("document.title").await?;
422 Ok(val.as_str().unwrap_or("").to_string())
423 }
424
425 pub async fn url(&self) -> Result<String> {
427 let val = self.adapter().evaluate("window.location.href").await?;
428 Ok(val.as_str().unwrap_or("").to_string())
429 }
430
/// Captures a screenshot through the adapter and returns it as a `String`.
///
/// NOTE(review): the string is presumably encoded image data (e.g. base64) —
/// confirm against `ProtocolAdapter::capture_screenshot`.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn screenshot(&self) -> Result<String> {
    self.adapter().capture_screenshot().await
}
435
/// Evaluates `expression` through the adapter and returns the resulting JSON value.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn evaluate(&self, expression: &str) -> Result<Value> {
    self.adapter().evaluate(expression).await
}
440
/// Clicks the centre of the first element matching `selector`.
///
/// The element is scrolled into view while its centre is being located.
///
/// # Errors
/// Fails when no element matches, or when the adapter reports an error.
pub async fn click(&self, selector: &str) -> Result<()> {
    let (x, y) = self.get_element_center(selector).await?;
    self.adapter().click_point(x, y).await
}
450
/// Clicks at the point (`x`, `y`), passed unchanged to the adapter's `click_point`.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn click_at(&self, x: f64, y: f64) -> Result<()> {
    self.adapter().click_point(x, y).await
}
455
/// Double-clicks the centre of the first element matching `selector`.
///
/// # Errors
/// Fails when no element matches, or when the adapter reports an error.
pub async fn dblclick(&self, selector: &str) -> Result<()> {
    let (x, y) = self.get_element_center(selector).await?;
    self.adapter().double_click_point(x, y).await
}
461
/// Right-clicks the centre of the first element matching `selector`.
///
/// # Errors
/// Fails when no element matches, or when the adapter reports an error.
pub async fn right_click(&self, selector: &str) -> Result<()> {
    let (x, y) = self.get_element_center(selector).await?;
    self.adapter().right_click_point(x, y).await
}
467
/// Presses on the centre of the first element matching `selector` and holds.
///
/// `hold_ms` is forwarded to the adapter's `click_hold_point`.
///
/// # Errors
/// Fails when no element matches, or when the adapter reports an error.
pub async fn click_and_hold(&self, selector: &str, hold_ms: u64) -> Result<()> {
    let (x, y) = self.get_element_center(selector).await?;
    self.adapter().click_hold_point(x, y, hold_ms).await
}
479
/// Presses at the point (`x`, `y`) and holds; all arguments go straight to the
/// adapter's `click_hold_point`.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn click_and_hold_at(&self, x: f64, y: f64, hold_ms: u64) -> Result<()> {
    self.adapter().click_hold_point(x, y, hold_ms).await
}
488
489 pub async fn click_all(&self, selector: &str) -> Result<()> {
491 let escaped = serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
492 let js = format!(
493 r#"
494 (function() {{
495 const els = document.querySelectorAll({escaped});
496 return Array.from(els).map(el => {{
497 const r = el.getBoundingClientRect();
498 return {{ x: r.x + r.width / 2, y: r.y + r.height / 2 }};
499 }});
500 }})()
501 "#
502 );
503 let result = self.adapter().evaluate(&js).await?;
504 if let Some(points) = result.as_array() {
505 for pt in points {
506 let x = pt.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0);
507 let y = pt.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0);
508 self.adapter().click_point(x, y).await?;
509 sleep(Duration::from_millis(100)).await;
510 }
511 }
512 Ok(())
513 }
514
/// Replaces the value of the first element matching `selector` with `value`.
///
/// Steps, in order: focus the element and blank its value from script, click
/// its centre, insert `value` through the adapter, then dispatch bubbling
/// `input` and `change` events on it. When no element matches, both scripts
/// are no-ops and the click is skipped, but the text is still inserted.
///
/// # Errors
/// Propagates errors from either script evaluation and from `insert_text`.
/// Failures while locating or clicking the element are ignored.
pub async fn fill(&self, selector: &str, value: &str) -> Result<()> {
    let escaped_sel =
        serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
    let clear_js = format!(
        r#"
        (function() {{
            const el = document.querySelector({escaped_sel});
            if (el) {{ el.focus(); el.value = ''; }}
        }})()
        "#
    );
    self.adapter().evaluate(&clear_js).await?;

    // Best-effort click on top of the scripted focus; both error paths are discarded.
    if let Ok((x, y)) = self.get_element_center(selector).await {
        let _ = self.adapter().click_point(x, y).await;
    }

    self.adapter().insert_text(value).await?;

    // Fire the events explicitly so page listeners observe the new value.
    let dispatch_js = format!(
        r#"
        (function() {{
            const el = document.querySelector({escaped_sel});
            if (el) {{
                el.dispatchEvent(new Event('input', {{ bubbles: true }}));
                el.dispatchEvent(new Event('change', {{ bubbles: true }}));
            }}
        }})()
        "#
    );
    self.adapter().evaluate(&dispatch_js).await?;
    Ok(())
}
557
/// Inserts `value` through the adapter's `insert_text`, with no element targeting.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn type_text(&self, value: &str) -> Result<()> {
    self.adapter().insert_text(value).await
}
562
/// Presses `key` through the adapter's `press_key`.
///
/// NOTE(review): the accepted key names are defined by the adapter — confirm there.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn press(&self, key: &str) -> Result<()> {
    self.adapter().press_key(key).await
}
567
568 pub async fn clear(&self, selector: &str) -> Result<()> {
570 let escaped =
571 serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
572 let js = format!("document.querySelector({escaped}).value = ''");
573 self.adapter().evaluate(&js).await?;
574 Ok(())
575 }
576
/// Sets the `value` of the first element matching `selector` and dispatches a
/// bubbling `change` event on it.
///
/// Both arguments are JSON-encoded before being embedded in the script. Does
/// nothing when no element matches.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn select(&self, selector: &str, value: &str) -> Result<()> {
    let escaped_sel =
        serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
    let escaped_val =
        serde_json::to_string(value).unwrap_or_else(|_| format!("\"{}\"", value));
    let js = format!(
        r#"
        (function() {{
            const el = document.querySelector({escaped_sel});
            if (el) {{
                el.value = {escaped_val};
                el.dispatchEvent(new Event('change', {{ bubbles: true }}));
            }}
        }})()
        "#
    );
    self.adapter().evaluate(&js).await?;
    Ok(())
}
597
/// Gives focus to the first element matching `selector`; no-op when nothing matches.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn focus(&self, selector: &str) -> Result<()> {
    let escaped =
        serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
    let js = format!("document.querySelector({escaped})?.focus()");
    self.adapter().evaluate(&js).await?;
    Ok(())
}
610
/// Removes focus from the first element matching `selector`; no-op when nothing matches.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn blur(&self, selector: &str) -> Result<()> {
    let escaped =
        serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
    let js = format!("document.querySelector({escaped})?.blur()");
    self.adapter().evaluate(&js).await?;
    Ok(())
}
619
/// Moves the pointer over the centre of the first element matching `selector`.
///
/// # Errors
/// Fails when no element matches, or when the adapter reports an error.
pub async fn hover(&self, selector: &str) -> Result<()> {
    let (x, y) = self.get_element_center(selector).await?;
    self.adapter().hover_point(x, y).await
}
625
/// Drags from the centre of `from_selector`'s first match to the centre of
/// `to_selector`'s first match.
///
/// NOTE(review): locating each element scrolls it into view, so locating the
/// target may move the viewport after the source coordinates were measured.
/// The source point looks reliable only when both elements fit in one
/// viewport — confirm.
///
/// # Errors
/// Fails when either selector matches nothing, or when the adapter reports an error.
pub async fn drag(&self, from_selector: &str, to_selector: &str) -> Result<()> {
    let (fx, fy) = self.get_element_center(from_selector).await?;
    let (tx, ty) = self.get_element_center(to_selector).await?;
    self.adapter().drag_point(fx, fy, tx, ty).await
}
636
/// Scrolls the window vertically by `pixels` via `window.scrollBy(0, pixels)`.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn scroll_y(&self, pixels: i64) -> Result<()> {
    let js = format!("window.scrollBy(0, {pixels})");
    self.adapter().evaluate(&js).await?;
    Ok(())
}
647
/// Scrolls the window horizontally by `pixels` via `window.scrollBy(pixels, 0)`.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn scroll_x(&self, pixels: i64) -> Result<()> {
    let js = format!("window.scrollBy({pixels}, 0)");
    self.adapter().evaluate(&js).await?;
    Ok(())
}
654
/// Smoothly scrolls the first element matching `selector` to the vertical
/// centre of the viewport; no-op when nothing matches.
///
/// The script only requests a smooth scroll; this method does not wait for
/// the scrolling to finish.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn scroll_to(&self, selector: &str) -> Result<()> {
    let escaped =
        serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
    let js = format!(
        "document.querySelector({escaped})?.scrollIntoView({{ behavior: 'smooth', block: 'center' }})"
    );
    self.adapter().evaluate(&js).await?;
    Ok(())
}
665
/// Scrolls the window to the position (`x`, `y`) via `window.scrollTo`.
///
/// The coordinates are formatted into the script with Rust's default `f64`
/// formatting; callers should pass finite values.
///
/// # Errors
/// Propagates any error from evaluating the script.
pub async fn scroll_to_point(&self, x: f64, y: f64) -> Result<()> {
    let js = format!("window.scrollTo({x}, {y})");
    self.adapter().evaluate(&js).await?;
    Ok(())
}
672
673 pub async fn wait_for_selector(&self, selector: &str, timeout_ms: u64) -> Result<()> {
679 let interval: u64 = 100;
680 let max_iter = (timeout_ms + interval - 1) / interval; let escaped =
682 serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
683 let check_js = format!("!!document.querySelector({escaped})");
684 for _ in 0..max_iter {
685 let found = self.adapter().evaluate(&check_js).await?;
686 if found.as_bool().unwrap_or(false) {
687 return Ok(());
688 }
689 sleep(Duration::from_millis(interval)).await;
690 }
691 Err(SpiderError::Timeout(format!(
692 "Timeout waiting for selector: {selector}"
693 )))
694 }
695
/// Pauses for `timeout_ms`, capped at one second, and then returns `Ok(())`.
///
/// NOTE(review): this does not observe any navigation event; it is only a
/// bounded sleep. Use `wait_for_ready` or `wait_for_selector` when an actual
/// page state must be reached.
pub async fn wait_for_navigation(&self, timeout_ms: u64) -> Result<()> {
    let wait = timeout_ms.min(1000);
    sleep(Duration::from_millis(wait)).await;
    Ok(())
}
702
/// Waits for the document to finish loading and for its markup to stop changing.
///
/// Two phases share the one `timeout_ms` budget: first poll until
/// `document.readyState` is `complete`, then poll the length of
/// `document.documentElement.innerHTML` until it has stayed the same for 500 ms.
/// Both phases poll every 200 ms.
///
/// Always returns `Ok(())`, including when the timeout runs out; evaluation
/// errors are tolerated in both phases.
pub async fn wait_for_ready(&self, timeout_ms: u64) -> Result<()> {
    let start = Instant::now();
    let poll_interval: u64 = 200;
    let stable_threshold = Duration::from_millis(500);
    let timeout = Duration::from_millis(timeout_ms);

    // Phase 1: wait for the load to complete; a failed evaluation simply retries.
    while start.elapsed() < timeout {
        let state = self.adapter().evaluate("document.readyState").await;
        if let Ok(val) = state {
            if val.as_str() == Some("complete") {
                break;
            }
        }
        sleep(Duration::from_millis(poll_interval)).await;
    }

    // Phase 2: the markup counts as stable once its length has not changed for
    // `stable_threshold`. A failed or non-integer evaluation counts as length 0.
    let mut last_length: i64 = 0;
    let mut stable_since = Instant::now();

    while start.elapsed() < timeout {
        let length = self
            .adapter()
            .evaluate("document.documentElement.innerHTML.length")
            .await
            .ok()
            .and_then(|v| v.as_i64())
            .unwrap_or(0);

        if length != last_length {
            // Any change restarts the stability window.
            last_length = length;
            stable_since = Instant::now();
        } else if stable_since.elapsed() >= stable_threshold {
            return Ok(());
        }

        sleep(Duration::from_millis(poll_interval)).await;
    }

    Ok(())
}
753
/// Polls every 200 ms until `document.documentElement.innerHTML` is at least
/// `min_length` long, or until `timeout_ms` elapses.
///
/// Always returns `Ok(())`: reaching the timeout is not reported as an error,
/// so callers cannot tell from the result whether the length was reached. A
/// failed or non-integer evaluation counts as length 0.
pub async fn wait_for_content(&self, min_length: usize, timeout_ms: u64) -> Result<()> {
    let start = Instant::now();
    let timeout = Duration::from_millis(timeout_ms);
    while start.elapsed() < timeout {
        let length = self
            .adapter()
            .evaluate("document.documentElement.innerHTML.length")
            .await
            .ok()
            .and_then(|v| v.as_u64())
            .unwrap_or(0) as usize;
        if length >= min_length {
            return Ok(());
        }
        sleep(Duration::from_millis(200)).await;
    }
    Ok(())
}
774
/// Waits for the document to load and then for a quiet period in the page.
///
/// First polls every 250 ms until `document.readyState` is `complete` or
/// `timeout_ms` elapses. Then evaluates an in-page promise that resolves once
/// no resource entry or DOM mutation has been observed for 400 ms, or once its
/// own deadline passes.
///
/// The in-page deadline is the time left in the budget but never less than
/// 1000 ms, so the whole call can run past `timeout_ms`.
///
/// Always returns `Ok(())`; if the idle script cannot be evaluated, a 500 ms
/// pause is used instead.
pub async fn wait_for_network_idle(&self, timeout_ms: u64) -> Result<()> {
    let start = Instant::now();
    let poll_interval: u64 = 250;
    let timeout = Duration::from_millis(timeout_ms);

    // A failed evaluation is simply retried on the next poll.
    while start.elapsed() < timeout {
        let state = self.adapter().evaluate("document.readyState").await;
        if let Ok(val) = state {
            if val.as_str() == Some("complete") {
                break;
            }
        }
        sleep(Duration::from_millis(poll_interval)).await;
    }

    let idle_ms: u64 = 400;
    // Budget for the in-page check, in milliseconds, floored at one second.
    let remaining = {
        let elapsed = start.elapsed();
        if timeout > elapsed {
            (timeout - elapsed).as_millis().max(1000) as u64
        } else {
            1000
        }
    };
    let js = format!(
        r#"
        new Promise((resolve) => {{
            let lastActivity = Date.now();
            const idleThreshold = {idle_ms};
            const deadline = Date.now() + {remaining};

            const perfObs = new PerformanceObserver(() => {{ lastActivity = Date.now(); }});
            try {{ perfObs.observe({{ entryTypes: ['resource'] }}); }} catch(e) {{}}

            const mutObs = new MutationObserver(() => {{ lastActivity = Date.now(); }});
            mutObs.observe(document.documentElement, {{
                childList: true, subtree: true, attributes: true
            }});

            const check = () => {{
                const now = Date.now();
                if (now >= deadline || (now - lastActivity >= idleThreshold)) {{
                    perfObs.disconnect();
                    mutObs.disconnect();
                    resolve(true);
                    return;
                }}
                setTimeout(check, 100);
            }};
            setTimeout(check, idleThreshold);
        }})
        "#
    );
    if self.adapter().evaluate(&js).await.is_err() {
        sleep(Duration::from_millis(500)).await;
    }

    Ok(())
}
852
/// Applies viewport settings by forwarding every argument to the adapter's `set_viewport`.
///
/// # Errors
/// Returns whatever error the adapter reports.
pub async fn set_viewport(
    &self,
    width: u32,
    height: u32,
    device_scale_factor: f64,
    mobile: bool,
) -> Result<()> {
    self.adapter()
        .set_viewport(width, height, device_scale_factor, mobile)
        .await
}
869
870 pub async fn query_selector(&self, selector: &str) -> Result<Option<String>> {
876 let escaped =
877 serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
878 let js = format!("document.querySelector({escaped})?.outerHTML ?? null");
879 let val = self.adapter().evaluate(&js).await?;
880 if val.is_null() {
881 Ok(None)
882 } else {
883 Ok(val.as_str().map(|s| s.to_string()))
884 }
885 }
886
887 pub async fn query_selector_all(&self, selector: &str) -> Result<Vec<String>> {
889 let escaped =
890 serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
891 let js = format!(
892 "Array.from(document.querySelectorAll({escaped})).map(el => el.outerHTML)"
893 );
894 let val = self.adapter().evaluate(&js).await?;
895 let items = val
896 .as_array()
897 .map(|arr| {
898 arr.iter()
899 .filter_map(|v| v.as_str().map(|s| s.to_string()))
900 .collect()
901 })
902 .unwrap_or_default();
903 Ok(items)
904 }
905
906 pub async fn text_content(&self, selector: &str) -> Result<Option<String>> {
908 let escaped =
909 serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
910 let js = format!("document.querySelector({escaped})?.textContent ?? null");
911 let val = self.adapter().evaluate(&js).await?;
912 if val.is_null() {
913 Ok(None)
914 } else {
915 Ok(val.as_str().map(|s| s.to_string()))
916 }
917 }
918
919 pub async fn extract_fields(
941 &self,
942 fields: &[(&str, FieldSelector<'_>)],
943 ) -> Result<std::collections::HashMap<String, Option<String>>> {
944 let field_map: Vec<Value> = fields
946 .iter()
947 .map(|(key, sel)| {
948 let (css, attr) = match sel {
949 FieldSelector::Text(s) => (*s, None),
950 FieldSelector::Attr {
951 selector,
952 attribute,
953 } => (*selector, Some(*attribute)),
954 };
955 serde_json::json!({
956 "key": key,
957 "selector": css,
958 "attribute": attr,
959 })
960 })
961 .collect();
962
963 let field_json = serde_json::to_string(&field_map)
964 .unwrap_or_else(|_| "[]".to_string());
965
966 let js = format!(
967 r#"
968 (() => {{
969 const fields = {field_json};
970 const result = {{}};
971 for (const f of fields) {{
972 const el = document.querySelector(f.selector);
973 result[f.key] = el
974 ? (f.attribute ? el.getAttribute(f.attribute) : el.textContent?.trim()) ?? null
975 : null;
976 }}
977 return JSON.stringify(result);
978 }})()
979 "#
980 );
981
982 let val = self.adapter().evaluate(&js).await?;
983 let raw = val.as_str().unwrap_or("{}");
984 let parsed: std::collections::HashMap<String, Option<String>> =
985 serde_json::from_str(raw).unwrap_or_default();
986 Ok(parsed)
987 }
988
989 async fn get_element_center(&self, selector: &str) -> Result<(f64, f64)> {
996 let escaped =
997 serde_json::to_string(selector).unwrap_or_else(|_| format!("\"{}\"", selector));
998 let js = format!(
999 r#"
1000 (function() {{
1001 const el = document.querySelector({escaped});
1002 if (!el) return null;
1003 el.scrollIntoView({{ block: 'center', behavior: 'instant' }});
1004 const r = el.getBoundingClientRect();
1005 return {{ x: r.x + r.width / 2, y: r.y + r.height / 2 }};
1006 }})()
1007 "#
1008 );
1009 let result = self.adapter().evaluate(&js).await?;
1010
1011 if result.is_null() {
1012 return Err(SpiderError::Other(format!(
1013 "Element not found: {selector}"
1014 )));
1015 }
1016
1017 let x = result
1018 .get("x")
1019 .and_then(|v| v.as_f64())
1020 .ok_or_else(|| SpiderError::Other(format!("Element not found: {selector}")))?;
1021 let y = result
1022 .get("y")
1023 .and_then(|v| v.as_f64())
1024 .ok_or_else(|| SpiderError::Other(format!("Element not found: {selector}")))?;
1025
1026 Ok((x, y))
1027 }
1028
/// Hands an incoming protocol message to the current adapter's `route_message`.
pub fn route_message(&self, data: &str) {
    self.adapter.load().route_message(data);
}
1033
/// Calls `destroy` on the current adapter.
///
/// The adapter stays installed on the page afterwards; what `destroy`
/// releases is defined by the adapter.
pub fn destroy(&self) {
    self.adapter.load().destroy();
}
1038
/// Installs `adapter` as the page's adapter, wrapping it in a new `Arc`.
///
/// Later loads of the adapter observe the new value.
pub fn set_adapter(&self, adapter: ProtocolAdapter) {
    self.adapter.store(Arc::new(adapter));
}
1047
/// Installs an already shared `adapter` as the page's adapter.
///
/// Later loads of the adapter observe the new value.
pub fn set_adapter_arc(&self, adapter: Arc<ProtocolAdapter>) {
    self.adapter.store(adapter);
}
1052
/// Heuristically decides whether `html` is a challenge or placeholder page
/// rather than real content.
///
/// Matching is case-insensitive. Documents longer than 15 000 bytes are never
/// flagged. Known challenge markers flag any smaller document; generic
/// "loading" / "please wait" placeholders only count when the document is
/// under 5 000 bytes, and "please wait" is ignored if "article" also appears.
pub fn is_interstitial_content(html: &str) -> bool {
    /// Lower-case substrings that identify a challenge page.
    const CHALLENGE_MARKERS: [&str; 13] = [
        "just a moment",
        "checking your browser",
        "please wait while we verify",
        "verifying the device",
        "available after verification",
        "ddos-guard",
        "challenge-platform",
        "px-captcha",
        "_cf_chl_opt",
        "managed_challenge",
        "datadome",
        "ak_bmsc",
        "please enable cookies",
    ];

    let size = html.len();
    if size > 15_000 {
        return false;
    }

    let lowered = html.to_lowercase();
    if CHALLENGE_MARKERS
        .iter()
        .any(|&marker| lowered.contains(marker))
    {
        return true;
    }

    // Generic placeholder text is too common to trust on anything but a tiny page.
    let has = |needle: &str| lowered.contains(needle);
    size < 5_000
        && (has("loading...")
            || has("loading results")
            || (has("please wait") && !has("article")))
}
1096
/// Heuristically decides whether `html` is a site-level rate-limit page.
///
/// Matching is case-insensitive. Documents longer than 20 000 bytes are never
/// flagged. A document is flagged when it contains "rate limit exceeded" or
/// "too many requests", or both "rate limit" and "please try again".
pub fn is_rate_limit_content(html: &str) -> bool {
    if html.len() > 20_000 {
        return false;
    }

    let lowered = html.to_lowercase();
    let has = |needle: &str| lowered.contains(needle);

    if has("rate limit exceeded") || has("too many requests") {
        return true;
    }
    has("rate limit") && has("please try again")
}
1110}
1111
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the interstitial detector under test.
    fn interstitial(html: &str) -> bool {
        SpiderPage::is_interstitial_content(html)
    }

    /// Shorthand for the rate-limit detector under test.
    fn rate_limited(html: &str) -> bool {
        SpiderPage::is_rate_limit_content(html)
    }

    // --- Interstitial detection: challenge markers ---

    #[test]
    fn interstitial_cloudflare() {
        assert!(interstitial("<html><body>Just a moment...</body></html>"));
    }

    #[test]
    fn interstitial_checking_browser() {
        assert!(interstitial(
            "<html><body>Checking your browser before accessing</body></html>"
        ));
    }

    #[test]
    fn interstitial_perimeterx() {
        assert!(interstitial("<html><body>Verifying the device...</body></html>"));
    }

    #[test]
    fn interstitial_ddos_guard() {
        assert!(interstitial(
            "<html><head></head><body>ddos-guard check</body></html>"
        ));
    }

    #[test]
    fn interstitial_challenge_platform() {
        assert!(interstitial(
            "<html><body class='challenge-platform'>wait</body></html>"
        ));
    }

    #[test]
    fn interstitial_px_captcha() {
        assert!(interstitial(
            "<html><body><div id='px-captcha'></div></body></html>"
        ));
    }

    #[test]
    fn interstitial_cf_chl_opt() {
        assert!(interstitial(
            "<html><body><script>var _cf_chl_opt={}</script></body></html>"
        ));
    }

    #[test]
    fn interstitial_managed_challenge() {
        assert!(interstitial("<html><body>managed_challenge page</body></html>"));
    }

    #[test]
    fn interstitial_datadome() {
        assert!(interstitial("<html><body>DataDome verification</body></html>"));
    }

    #[test]
    fn interstitial_akamai() {
        assert!(interstitial(
            "<html><body><script>ak_bmsc=cookie</script></body></html>"
        ));
    }

    #[test]
    fn interstitial_enable_cookies() {
        assert!(interstitial(
            "<html><body>Please enable cookies to continue</body></html>"
        ));
    }

    // --- Interstitial detection: small-page placeholders ---

    #[test]
    fn interstitial_loading_small() {
        assert!(interstitial("<html><body>Loading...</body></html>"));
    }

    #[test]
    fn interstitial_loading_results() {
        assert!(interstitial("<html><body>Loading results</body></html>"));
    }

    #[test]
    fn interstitial_please_wait_small() {
        assert!(interstitial("<html><body>Please wait</body></html>"));
    }

    // --- Interstitial detection: negatives ---

    #[test]
    fn interstitial_please_wait_with_article_not_detected() {
        assert!(!interstitial(
            "<html><body>Please wait for this article</body></html>"
        ));
    }

    #[test]
    fn interstitial_large_page_not_detected() {
        let page = "x".repeat(16_000);
        assert!(!interstitial(&page));
    }

    #[test]
    fn interstitial_normal_page_not_detected() {
        assert!(!interstitial(
            "<html><body><h1>Welcome</h1><p>Normal content here.</p></body></html>"
        ));
    }

    // --- Rate-limit detection ---

    #[test]
    fn rate_limit_exceeded() {
        assert!(rate_limited("<html><body>Rate limit exceeded</body></html>"));
    }

    #[test]
    fn rate_limit_too_many_requests() {
        assert!(rate_limited("<html><body>Too many requests</body></html>"));
    }

    #[test]
    fn rate_limit_try_again() {
        assert!(rate_limited(
            "<html><body>Rate limit hit. Please try again later.</body></html>"
        ));
    }

    #[test]
    fn rate_limit_large_page_not_detected() {
        let page = format!("<html><body>{}</body></html>", "x".repeat(21_000));
        assert!(!rate_limited(&page));
    }

    #[test]
    fn rate_limit_normal_page_not_detected() {
        assert!(!rate_limited("<html><body><h1>Normal page</h1></body></html>"));
    }
}