reasonkit_web/browser/
navigation.rs

1//! Page navigation functionality
2//!
3//! This module handles URL navigation with retry logic, timeout handling,
4//! and human-like behavior simulation.
5
6use crate::browser::PageHandle;
7use crate::error::{Error, NavigationError, Result};
8use std::time::Duration;
9use tracing::{debug, info, instrument, warn};
10
11/// Options for page navigation
12#[derive(Debug, Clone)]
13pub struct NavigationOptions {
14    /// Timeout in milliseconds (default: 30000)
15    pub timeout_ms: u64,
16    /// Wait until condition (default: networkidle0)
17    pub wait_until: WaitUntil,
18    /// Number of retry attempts (default: 3)
19    pub retries: u32,
20    /// Delay between retries in ms (default: 1000)
21    pub retry_delay_ms: u64,
22    /// Simulate human-like behavior (default: true)
23    pub human_like: bool,
24}
25
26impl Default for NavigationOptions {
27    fn default() -> Self {
28        Self {
29            timeout_ms: 30000,
30            wait_until: WaitUntil::NetworkIdle0,
31            retries: 3,
32            retry_delay_ms: 1000,
33            human_like: true,
34        }
35    }
36}
37
/// Readiness condition awaited once a navigation has been issued.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WaitUntil {
    /// Ready as soon as the `load` event has fired
    Load,
    /// Ready as soon as the `DOMContentLoaded` event has fired
    DomContentLoaded,
    /// Ready when the network is idle (0 connections for 500ms)
    NetworkIdle0,
    /// Ready when the network is idle (max 2 connections for 500ms)
    NetworkIdle2,
}
50
/// Outcome reported for a completed navigation.
#[derive(Debug)]
pub struct NavigationResult {
    /// URL the page ended up on once redirects were followed
    pub final_url: String,
    /// HTTP status code, when one is available
    pub status: Option<u16>,
    /// Title of the loaded page, when one is available
    pub title: Option<String>,
    /// How long the navigation took, in milliseconds
    pub duration_ms: u64,
}
63
/// URL validation utilities
///
/// A stateless collection of helpers for checking a URL before navigation and
/// for picking its host apart. All functions work on the raw string; no
/// normalisation or percent-decoding is performed.
pub struct UrlValidator;

impl UrlValidator {
    /// URL prefixes (scheme plus `://`) accepted by [`UrlValidator::validate`].
    const ALLOWED_PREFIXES: [&'static str; 3] = ["http://", "https://", "file://"];

    /// Maximum accepted URL length. Compared against `str::len`, i.e. the
    /// number of UTF-8 bytes.
    const MAX_URL_LEN: usize = 2048;

    /// Validate a URL for navigation
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when the URL is empty, does not start
    /// with `http://`, `https://` or `file://`, or is longer than 2048 bytes.
    /// Localhost URLs are allowed.
    pub fn validate(url: &str) -> std::result::Result<(), String> {
        if url.is_empty() {
            return Err("URL cannot be empty".to_string());
        }

        let has_allowed_prefix = Self::ALLOWED_PREFIXES
            .iter()
            .any(|prefix| url.starts_with(*prefix));
        if !has_allowed_prefix {
            return Err(format!(
                "URL must start with http://, https://, or file://: {}",
                url
            ));
        }

        if url.len() > Self::MAX_URL_LEN {
            return Err("URL exceeds maximum length of 2048 characters".to_string());
        }

        Ok(())
    }

    /// Check if URL points to localhost
    ///
    /// The decision is made on the extracted host only, compared
    /// case-insensitively against `localhost`, `127.0.0.1`, `[::1]` and
    /// `0.0.0.0`. Occurrences of those names in the path, query, user-info or
    /// as a prefix of a longer host name (`localhost.example.com`) do not
    /// count. Returns `false` when no host can be extracted.
    pub fn is_localhost(url: &str) -> bool {
        Self::extract_host(url).map_or(false, |host| {
            matches!(
                host.to_ascii_lowercase().as_str(),
                "localhost" | "127.0.0.1" | "[::1]" | "0.0.0.0"
            )
        })
    }

    /// Check if URL is external (not localhost)
    pub fn is_external(url: &str) -> bool {
        !Self::is_localhost(url)
    }

    /// Extract host from URL
    ///
    /// Returns the host component without user-info and without port. A
    /// bracketed IPv6 literal is returned with its brackets (`[::1]`). Returns
    /// `None` when the string has no `scheme://` prefix. A URL with an empty
    /// authority, such as `file:///path`, yields `Some("")`.
    pub fn extract_host(url: &str) -> Option<String> {
        let (scheme, rest) = url.split_once("://")?;
        // A "://" that shows up later in the string (e.g. inside a query
        // parameter) is not a scheme separator; require a well-formed scheme.
        if !Self::is_valid_scheme(scheme) {
            return None;
        }

        // The authority ends at the first path, query or fragment delimiter.
        let authority_len = rest
            .find(|c: char| matches!(c, '/' | '?' | '#'))
            .unwrap_or(rest.len());
        let authority = &rest[..authority_len];

        // Drop an optional "user:password@" prefix.
        let host_port = match authority.rsplit_once('@') {
            Some((_userinfo, host_port)) => host_port,
            None => authority,
        };

        let host = if host_port.starts_with('[') {
            // IPv6 literal: its colons are part of the address, so cut at the
            // closing bracket rather than at a colon.
            match host_port.find(']') {
                Some(close) => &host_port[..=close],
                None => host_port,
            }
        } else {
            // Anything after the first colon is the port.
            host_port.split(':').next().unwrap_or(host_port)
        };

        Some(host.to_string())
    }

    /// Whether `scheme` is a letter followed by letters, digits, `+`, `-` or `.`.
    fn is_valid_scheme(scheme: &str) -> bool {
        let mut chars = scheme.chars();
        match chars.next() {
            Some(first) if first.is_ascii_alphabetic() => {
                chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
            }
            _ => false,
        }
    }
}
138
/// Simple rate limiter for requests
///
/// Fixed-window counter: at most `max_requests` calls to
/// [`RateLimiter::check`] succeed per window of `window_secs` seconds. The
/// window restarts lazily, on the first `check` made after it has elapsed.
#[derive(Debug)]
pub struct RateLimiter {
    /// Maximum requests per window
    max_requests: u32,
    /// Window duration in seconds
    window_secs: u64,
    /// Current request count in window
    request_count: u32,
    /// Window start time
    window_start: std::time::Instant,
}

impl RateLimiter {
    /// Create a new rate limiter whose first window starts now
    pub fn new(max_requests: u32, window_secs: u64) -> Self {
        Self {
            max_requests,
            window_secs,
            request_count: 0,
            window_start: std::time::Instant::now(),
        }
    }

    /// Whether the current window has run its full length as of `now`.
    ///
    /// Whole elapsed seconds are compared, so a `window_secs` of 0 is always
    /// expired.
    fn window_expired_at(&self, now: std::time::Instant) -> bool {
        now.duration_since(self.window_start).as_secs() >= self.window_secs
    }

    /// Check if a request is allowed (and count it if so)
    ///
    /// Returns `true` and consumes one slot when the window still has
    /// capacity, `false` otherwise. An expired window is restarted first.
    pub fn check(&mut self) -> bool {
        let now = std::time::Instant::now();

        if self.window_expired_at(now) {
            self.window_start = now;
            self.request_count = 0;
        }

        if self.request_count < self.max_requests {
            self.request_count += 1;
            true
        } else {
            false
        }
    }

    /// Get remaining requests in current window
    ///
    /// Once the window has elapsed the stored count no longer applies (the
    /// next `check` would start from zero), so the full quota is reported
    /// instead of a stale number.
    pub fn remaining(&self) -> u32 {
        if self.window_expired_at(std::time::Instant::now()) {
            self.max_requests
        } else {
            self.max_requests.saturating_sub(self.request_count)
        }
    }

    /// Reset the rate limiter: clear the count and start a new window now
    pub fn reset(&mut self) {
        self.request_count = 0;
        self.window_start = std::time::Instant::now();
    }
}
193
194/// Page navigator with advanced navigation capabilities
195pub struct PageNavigator;
196
197impl PageNavigator {
198    /// Navigate to a URL with default options
199    #[instrument(skip(page))]
200    pub async fn goto(
201        page: &PageHandle,
202        url: &str,
203        options: Option<NavigationOptions>,
204    ) -> Result<NavigationResult> {
205        let opts = options.unwrap_or_default();
206        let start = std::time::Instant::now();
207
208        // Validate URL
209        if !url.starts_with("http://")
210            && !url.starts_with("https://")
211            && !url.starts_with("file://")
212        {
213            return Err(NavigationError::InvalidUrl(format!(
214                "URL must start with http://, https://, or file://: {}",
215                url
216            ))
217            .into());
218        }
219
220        info!("Navigating to: {}", url);
221
222        let mut last_error = None;
223        for attempt in 0..=opts.retries {
224            if attempt > 0 {
225                warn!("Navigation retry attempt {} of {}", attempt, opts.retries);
226                tokio::time::sleep(Duration::from_millis(opts.retry_delay_ms)).await;
227            }
228
229            match Self::navigate_once(&page.page, url, &opts).await {
230                Ok(result) => {
231                    // Update page URL
232                    page.set_url(result.final_url.clone()).await;
233
234                    // Apply human-like behavior if enabled
235                    if opts.human_like {
236                        Self::simulate_human_behavior(&page.page).await?;
237                    }
238
239                    let duration_ms = start.elapsed().as_millis() as u64;
240                    return Ok(NavigationResult {
241                        final_url: result.final_url,
242                        status: result.status,
243                        title: result.title,
244                        duration_ms,
245                    });
246                }
247                Err(e) => {
248                    warn!("Navigation attempt {} failed: {}", attempt + 1, e);
249                    last_error = Some(e);
250                }
251            }
252        }
253
254        Err(last_error.unwrap_or_else(|| {
255            NavigationError::LoadFailed("Navigation failed after all retries".to_string()).into()
256        }))
257    }
258
259    /// Perform a single navigation attempt
260    async fn navigate_once(
261        page: &chromiumoxide::Page,
262        url: &str,
263        opts: &NavigationOptions,
264    ) -> Result<NavigationResult> {
265        // Navigate with timeout
266        let timeout = Duration::from_millis(opts.timeout_ms);
267
268        let nav_future = page.goto(url);
269        let _response = tokio::time::timeout(timeout, nav_future)
270            .await
271            .map_err(|_| NavigationError::Timeout(opts.timeout_ms))?
272            .map_err(|e| NavigationError::LoadFailed(e.to_string()))?;
273
274        // Wait for page to be ready based on wait_until option
275        Self::wait_for_ready(page, opts).await?;
276
277        // Get final URL and title
278        let final_url = page
279            .url()
280            .await
281            .map_err(|e| Error::cdp(e.to_string()))?
282            .unwrap_or_else(|| url.to_string());
283
284        let title = page
285            .evaluate("document.title")
286            .await
287            .ok()
288            .and_then(|v| v.into_value::<String>().ok());
289
290        // Navigation doesn't return status directly in chromiumoxide
291        let status: Option<u16> = None;
292
293        debug!("Navigation complete: {} -> {}", url, final_url);
294
295        Ok(NavigationResult {
296            final_url,
297            status,
298            title,
299            duration_ms: 0, // Will be set by caller
300        })
301    }
302
303    /// Wait for page to be ready based on wait_until condition
304    async fn wait_for_ready(page: &chromiumoxide::Page, opts: &NavigationOptions) -> Result<()> {
305        let script = match opts.wait_until {
306            WaitUntil::Load => {
307                r#"
308                    new Promise(resolve => {
309                        if (document.readyState === 'complete') {
310                            resolve(true);
311                        } else {
312                            window.addEventListener('load', () => resolve(true));
313                        }
314                    })
315                "#
316            }
317            WaitUntil::DomContentLoaded => {
318                r#"
319                    new Promise(resolve => {
320                        if (document.readyState !== 'loading') {
321                            resolve(true);
322                        } else {
323                            document.addEventListener('DOMContentLoaded', () => resolve(true));
324                        }
325                    })
326                "#
327            }
328            WaitUntil::NetworkIdle0 | WaitUntil::NetworkIdle2 => {
329                // For network idle, we'll just wait a short time after load
330                // A more sophisticated approach would monitor actual network activity
331                r#"
332                    new Promise(resolve => {
333                        if (document.readyState === 'complete') {
334                            setTimeout(() => resolve(true), 500);
335                        } else {
336                            window.addEventListener('load', () => {
337                                setTimeout(() => resolve(true), 500);
338                            });
339                        }
340                    })
341                "#
342            }
343        };
344
345        let timeout = Duration::from_millis(opts.timeout_ms);
346        tokio::time::timeout(timeout, page.evaluate(script))
347            .await
348            .map_err(|_| NavigationError::Timeout(opts.timeout_ms))?
349            .map_err(|e| Error::cdp(e.to_string()))?;
350
351        Ok(())
352    }
353
354    /// Simulate human-like behavior after navigation
355    async fn simulate_human_behavior(page: &chromiumoxide::Page) -> Result<()> {
356        // Random small delay
357        let delay = rand::random::<u64>() % 500 + 200;
358        tokio::time::sleep(Duration::from_millis(delay)).await;
359
360        // Gentle scroll
361        let scroll_script = r#"
362            window.scrollTo({
363                top: Math.random() * 100 + 50,
364                behavior: 'smooth'
365            });
366        "#;
367
368        let _ = page.evaluate(scroll_script).await;
369
370        // Small delay after scroll
371        tokio::time::sleep(Duration::from_millis(200)).await;
372
373        Ok(())
374    }
375
376    /// Go back in browser history
377    #[instrument(skip(page))]
378    pub async fn back(page: &PageHandle) -> Result<()> {
379        page.page
380            .evaluate("window.history.back()")
381            .await
382            .map_err(|e| Error::cdp(e.to_string()))?;
383
384        tokio::time::sleep(Duration::from_millis(500)).await;
385        Ok(())
386    }
387
388    /// Go forward in browser history
389    #[instrument(skip(page))]
390    pub async fn forward(page: &PageHandle) -> Result<()> {
391        page.page
392            .evaluate("window.history.forward()")
393            .await
394            .map_err(|e| Error::cdp(e.to_string()))?;
395
396        tokio::time::sleep(Duration::from_millis(500)).await;
397        Ok(())
398    }
399
400    /// Reload the current page
401    #[instrument(skip(page))]
402    pub async fn reload(page: &PageHandle) -> Result<()> {
403        page.page
404            .reload()
405            .await
406            .map_err(|e| Error::cdp(e.to_string()))?;
407
408        Ok(())
409    }
410
411    /// Wait for a specific element to appear
412    #[instrument(skip(page))]
413    pub async fn wait_for_selector(
414        page: &PageHandle,
415        selector: &str,
416        timeout_ms: u64,
417    ) -> Result<()> {
418        let script = format!(
419            r#"
420                new Promise((resolve, reject) => {{
421                    const timeout = {};
422                    const start = Date.now();
423
424                    function check() {{
425                        const el = document.querySelector('{}');
426                        if (el) {{
427                            resolve(true);
428                        }} else if (Date.now() - start > timeout) {{
429                            reject(new Error('Timeout waiting for selector'));
430                        }} else {{
431                            requestAnimationFrame(check);
432                        }}
433                    }}
434                    check();
435                }})
436            "#,
437            timeout_ms,
438            selector.replace('\'', "\\'")
439        );
440
441        let timeout = Duration::from_millis(timeout_ms + 1000);
442        tokio::time::timeout(timeout, page.page.evaluate(script.as_str()))
443            .await
444            .map_err(|_| NavigationError::Timeout(timeout_ms))?
445            .map_err(|e| Error::cdp(e.to_string()))?;
446
447        Ok(())
448    }
449}
450
#[cfg(test)]
mod tests {
    //! Unit tests for the pure helpers of this module: option defaults, URL
    //! validation, localhost detection, host extraction and rate limiting.

    use super::*;

    // ------------------------------------------------------------------
    // NavigationOptions
    // ------------------------------------------------------------------

    #[test]
    fn test_navigation_options_default() {
        let NavigationOptions {
            timeout_ms,
            retries,
            retry_delay_ms,
            human_like,
            ..
        } = NavigationOptions::default();

        assert_eq!(timeout_ms, 30000);
        assert_eq!(retries, 3);
        assert!(human_like);
        assert_eq!(retry_delay_ms, 1000);
    }

    #[test]
    fn test_wait_until_variants() {
        let idle = WaitUntil::NetworkIdle0;
        assert_ne!(WaitUntil::Load, WaitUntil::DomContentLoaded);
        assert_eq!(idle, WaitUntil::NetworkIdle0);
    }

    // ------------------------------------------------------------------
    // URL validation
    // ------------------------------------------------------------------

    #[test]
    fn test_url_validation_valid_http() {
        assert_eq!(UrlValidator::validate("http://example.com"), Ok(()));
    }

    #[test]
    fn test_url_validation_valid_https() {
        assert_eq!(UrlValidator::validate("https://example.com"), Ok(()));
    }

    #[test]
    fn test_url_validation_valid_file() {
        assert_eq!(UrlValidator::validate("file:///path/to/file.html"), Ok(()));
    }

    #[test]
    fn test_url_validation_empty() {
        let message = UrlValidator::validate("").expect_err("an empty URL must be rejected");
        assert!(message.contains("empty"));
    }

    #[test]
    fn test_url_validation_no_protocol() {
        let message = UrlValidator::validate("example.com")
            .expect_err("a URL without protocol must be rejected");
        assert!(message.contains("must start with"));
    }

    #[test]
    fn test_url_validation_invalid_protocol() {
        let outcome = UrlValidator::validate("ftp://example.com");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_url_validation_too_long() {
        let mut long_url = String::from("https://example.com/");
        long_url.push_str(&"a".repeat(3000));

        let message =
            UrlValidator::validate(&long_url).expect_err("an over-long URL must be rejected");
        assert!(message.contains("maximum length"));
    }

    // ------------------------------------------------------------------
    // Localhost detection
    // ------------------------------------------------------------------

    #[test]
    fn test_localhost_check_127001() {
        for url in ["http://127.0.0.1:8080", "https://127.0.0.1/path"] {
            assert!(UrlValidator::is_localhost(url));
        }
    }

    #[test]
    fn test_localhost_check_localhost() {
        for url in ["http://localhost:3000", "https://localhost/api"] {
            assert!(UrlValidator::is_localhost(url));
        }
    }

    #[test]
    fn test_localhost_check_ipv6_loopback() {
        let url = "http://[::1]:8080";
        assert!(UrlValidator::is_localhost(url));
    }

    #[test]
    fn test_localhost_check_zero_addr() {
        let url = "http://0.0.0.0:8080";
        assert!(UrlValidator::is_localhost(url));
    }

    #[test]
    fn test_localhost_check_external() {
        let external = [
            "https://example.com",
            "https://google.com",
            "http://192.168.1.1",
        ];
        for url in external {
            assert!(!UrlValidator::is_localhost(url));
        }
    }

    #[test]
    fn test_is_external() {
        assert!(UrlValidator::is_external("https://example.com"));
        for url in ["http://localhost:8080", "http://127.0.0.1"] {
            assert!(!UrlValidator::is_external(url));
        }
    }

    // ------------------------------------------------------------------
    // Host extraction
    // ------------------------------------------------------------------

    #[test]
    fn test_extract_host_simple() {
        let host = UrlValidator::extract_host("https://example.com/path");
        assert_eq!(host.as_deref(), Some("example.com"));
    }

    #[test]
    fn test_extract_host_with_port() {
        let host = UrlValidator::extract_host("http://localhost:8080/api");
        assert_eq!(host.as_deref(), Some("localhost"));
    }

    #[test]
    fn test_extract_host_no_path() {
        let host = UrlValidator::extract_host("https://google.com");
        assert_eq!(host.as_deref(), Some("google.com"));
    }

    #[test]
    fn test_extract_host_no_protocol() {
        let host = UrlValidator::extract_host("example.com");
        assert_eq!(host, None);
    }

    // ------------------------------------------------------------------
    // Rate limiter
    // ------------------------------------------------------------------

    #[test]
    fn test_rate_limiter_allows_under_limit() {
        let mut limiter = RateLimiter::new(5, 60);

        // Every request up to the limit goes through.
        for _ in 0..5 {
            assert!(limiter.check());
        }
    }

    #[test]
    fn test_rate_limiter_blocks_over_limit() {
        let mut limiter = RateLimiter::new(3, 60);

        // The quota of three is granted...
        for _ in 0..3 {
            assert!(limiter.check());
        }

        // ...and everything beyond it is refused.
        for _ in 0..2 {
            assert!(!limiter.check());
        }
    }

    #[test]
    fn test_rate_limiter_remaining() {
        let mut limiter = RateLimiter::new(5, 60);
        assert_eq!(limiter.remaining(), 5);

        limiter.check();
        assert_eq!(limiter.remaining(), 4);

        for _ in 0..2 {
            limiter.check();
        }
        assert_eq!(limiter.remaining(), 2);
    }

    #[test]
    fn test_rate_limiter_reset() {
        let mut limiter = RateLimiter::new(3, 60);

        for _ in 0..3 {
            limiter.check();
        }
        assert_eq!(limiter.remaining(), 0);
        assert!(!limiter.check());

        limiter.reset();
        assert_eq!(limiter.remaining(), 3);
        assert!(limiter.check());
    }

    #[test]
    fn test_rate_limiter_single_request() {
        let mut limiter = RateLimiter::new(1, 60);
        let first = limiter.check();
        let second = limiter.check();
        assert!(first);
        assert!(!second);
    }

    #[test]
    fn test_rate_limiter_zero_remaining_after_exhaustion() {
        let mut limiter = RateLimiter::new(2, 60);
        for _ in 0..2 {
            limiter.check();
        }
        assert_eq!(limiter.remaining(), 0);
    }

    // ------------------------------------------------------------------
    // NavigationResult
    // ------------------------------------------------------------------

    #[test]
    fn test_navigation_result_structure() {
        let result = NavigationResult {
            final_url: String::from("https://example.com"),
            status: Some(200),
            title: Some(String::from("Example")),
            duration_ms: 150,
        };

        assert_eq!(result.final_url, "https://example.com");
        assert_eq!(result.status, Some(200));
        assert_eq!(result.title, Some("Example".to_string()));
        assert_eq!(result.duration_ms, 150);
    }

    #[test]
    fn test_navigation_result_without_status() {
        let result = NavigationResult {
            final_url: String::from("https://example.com"),
            status: None,
            title: None,
            duration_ms: 100,
        };

        assert!(result.status.is_none());
        assert!(result.title.is_none());
    }

    // ------------------------------------------------------------------
    // Edge cases
    // ------------------------------------------------------------------

    #[test]
    fn test_url_validation_with_query_params() {
        let outcome = UrlValidator::validate("https://example.com?foo=bar&baz=123");
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_url_validation_with_fragment() {
        let outcome = UrlValidator::validate("https://example.com#section");
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_url_validation_with_auth() {
        let outcome = UrlValidator::validate("https://user:pass@example.com");
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_localhost_case_insensitive() {
        for url in ["http://LOCALHOST:8080", "http://LocalHost:8080"] {
            assert!(UrlValidator::is_localhost(url));
        }
    }

    #[test]
    fn test_localhost_in_path_not_matched() {
        // A path segment named "localhost" must not count as a localhost URL.
        let url = "https://example.com/localhost/api";
        assert!(!UrlValidator::is_localhost(url));
    }
}