Skip to main content

stygian_graph/adapters/
browser.rs

//! JavaScript rendering adapter using stygian-browser
//!
//! Implements the `ScrapingService` port using a headless browser (via the
//! `stygian-browser` crate) for pages that require JavaScript execution.
//!
//! Features:
//! - Full JS execution via Chrome DevTools Protocol
//! - Configurable wait strategies (DOM ready, network idle, selector, fixed delay)
//! - Stealth mode via stygian-browser's anti-detection features
//! - Supports graceful fallback to HTTP: when the browser pool is unavailable the
//!   adapter returns `ServiceError::Unavailable` so callers can switch adapters
//! - Circuit-breaker friendly: propagates pool-exhaustion as service errors
//!
//! # Example
//!
//! ```no_run
//! use stygian_graph::adapters::browser::{BrowserAdapter, BrowserAdapterConfig};
//! use stygian_graph::ports::{ScrapingService, ServiceInput};
//! use serde_json::json;
//!
//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
//! let config = BrowserAdapterConfig::default();
//! let adapter = BrowserAdapter::with_config(config);
//! let input = ServiceInput {
//!     url: "https://example.com".to_string(),
//!     params: json!({ "wait_strategy": "dom_content_loaded", "timeout_ms": 30000 }),
//! };
//! // let result = adapter.execute(input).await.unwrap();
//! # });
//! ```
30
31use std::fmt;
32use std::time::{Duration, Instant};
33
34use async_trait::async_trait;
35use serde_json::{Value, json};
36
37use crate::domain::error::{Result, ServiceError, StygianError};
38use crate::ports::{ScrapingService, ServiceInput, ServiceOutput};
39
/// How the adapter decides that a JavaScript-rendered page is ready.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub enum WaitStrategy {
    /// Ready once the DOM content has loaded; this is the default variant.
    #[default]
    DomContentLoaded,
    /// Ready once all network requests have completed.
    NetworkIdle,
    /// Ready once an element matching the given CSS selector appears in the DOM.
    SelectorAppears(String),
    /// Ready after pausing for a fixed duration once navigation has finished.
    Fixed(Duration),
}
53
54impl WaitStrategy {
55    /// Parse from a JSON parameter value
56    fn from_params(params: &Value) -> Self {
57        match params.get("wait_strategy").and_then(Value::as_str) {
58            Some("network_idle") => Self::NetworkIdle,
59            Some("dom_content_loaded") => Self::DomContentLoaded,
60            Some(s) if s.starts_with("selector:") => {
61                Self::SelectorAppears(s.trim_start_matches("selector:").to_string())
62            }
63            _ => params
64                .get("wait_ms")
65                .and_then(Value::as_u64)
66                .map_or(Self::DomContentLoaded, |ms| {
67                    Self::Fixed(Duration::from_millis(ms))
68                }),
69        }
70    }
71}
72
73impl fmt::Display for WaitStrategy {
74    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
75        match self {
76            Self::DomContentLoaded => write!(f, "dom_content_loaded"),
77            Self::NetworkIdle => write!(f, "network_idle"),
78            Self::SelectorAppears(selector) => write!(f, "selector_appears({selector})"),
79            Self::Fixed(duration) => write!(f, "fixed_{}ms", duration.as_millis()),
80        }
81    }
82}
83
/// Degree of anti-detection applied to browser automation.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum StealthLevel {
    /// Stealth disabled: fastest, but the automation is detectable.
    None,
    /// Basic stealth that hides automation signals; this is the default variant.
    #[default]
    Basic,
    /// Advanced stealth with full fingerprint spoofing.
    Advanced,
}
95
96impl StealthLevel {
97    fn from_params(params: &Value) -> Self {
98        match params.get("stealth_level").and_then(Value::as_str) {
99            Some("advanced") => Self::Advanced,
100            Some("none") => Self::None,
101            _ => Self::Basic,
102        }
103    }
104
105    /// Convert stealth level to string representation
106    #[must_use]
107    pub const fn as_str(&self) -> &'static str {
108        match self {
109            Self::None => "none",
110            Self::Basic => "basic",
111            Self::Advanced => "advanced",
112        }
113    }
114}
115
116/// Configuration for the `BrowserAdapter`
117#[derive(Debug, Clone)]
118pub struct BrowserAdapterConfig {
119    /// Default navigation timeout
120    pub timeout: Duration,
121    /// Maximum concurrent browser sessions (maps to pool size)
122    pub max_concurrent: usize,
123    /// Default wait strategy
124    pub default_wait: WaitStrategy,
125    /// Default stealth level
126    pub default_stealth: StealthLevel,
127    /// Whether to block common tracking/ad resources (improves speed)
128    pub block_resources: bool,
129    /// Whether to run in headless mode
130    pub headless: bool,
131    /// Custom User-Agent string (None = default)
132    pub user_agent: Option<String>,
133    /// Viewport width in pixels
134    pub viewport_width: u32,
135    /// Viewport height in pixels
136    pub viewport_height: u32,
137}
138
139impl Default for BrowserAdapterConfig {
140    fn default() -> Self {
141        Self {
142            timeout: Duration::from_secs(30),
143            max_concurrent: 5,
144            default_wait: WaitStrategy::DomContentLoaded,
145            default_stealth: StealthLevel::Basic,
146            block_resources: true,
147            headless: true,
148            user_agent: None,
149            viewport_width: 1920,
150            viewport_height: 1080,
151        }
152    }
153}
154
155/// Browser-based scraping adapter
156///
157/// Wraps stygian-browser's `BrowserPool` to implement the `ScrapingService` port.
158/// Falls back to an error indicating unavailability when the browser pool
159/// cannot be used (headless Chrome not available, pool exhausted, etc.).
160///
161/// The adapter accepts per-request parameters via `ServiceInput.params`:
162/// - `wait_strategy`: `"dom_content_loaded"` | `"network_idle"` | `"selector:<css>"` | `"fixed_ms:<n>"`
163/// - `stealth_level`: `"none"` | `"basic"` | `"advanced"`
164/// - `timeout_ms`: override default timeout in milliseconds
165/// - `wait_ms`: milliseconds to wait when strategy is "fixed"
166#[derive(Clone)]
167pub struct BrowserAdapter {
168    config: BrowserAdapterConfig,
169}
170
171impl BrowserAdapter {
172    /// Create a new `BrowserAdapter` with default configuration
173    ///
174    /// # Example
175    ///
176    /// ```
177    /// use stygian_graph::adapters::browser::BrowserAdapter;
178    /// use stygian_graph::ports::ScrapingService;
179    ///
180    /// let adapter = BrowserAdapter::new();
181    /// assert_eq!(adapter.name(), "browser");
182    /// ```
183    #[must_use]
184    pub fn new() -> Self {
185        Self {
186            config: BrowserAdapterConfig::default(),
187        }
188    }
189
190    /// Create a new `BrowserAdapter` with custom configuration
191    ///
192    /// # Example
193    ///
194    /// ```
195    /// use stygian_graph::adapters::browser::{BrowserAdapter, BrowserAdapterConfig};
196    /// use std::time::Duration;
197    ///
198    /// let config = BrowserAdapterConfig {
199    ///     timeout: Duration::from_secs(60),
200    ///     block_resources: false,
201    ///     ..BrowserAdapterConfig::default()
202    /// };
203    /// let adapter = BrowserAdapter::with_config(config);
204    /// ```
205    #[must_use]
206    pub const fn with_config(config: BrowserAdapterConfig) -> Self {
207        Self { config }
208    }
209
210    /// Extract per-request timeout from params, falling back to config default
211    fn resolve_timeout(&self, params: &Value) -> Duration {
212        params
213            .get("timeout_ms")
214            .and_then(Value::as_u64)
215            .map_or(self.config.timeout, Duration::from_millis)
216    }
217
218    /// Performs the browser navigation using stygian-browser's `BrowserPool`.
219    ///
220    /// Returns rendered HTML and timing metadata. When headless Chrome is
221    /// unavailable this returns a `ServiceError` so callers can react
222    /// (e.g. fall back to `HttpAdapter` via circuit-breaker logic).
223    #[allow(clippy::option_if_let_else)]
224    #[cfg(feature = "browser")]
225    async fn navigate_with_browser(
226        &self,
227        url: &str,
228        wait: &WaitStrategy,
229        timeout: Duration,
230    ) -> Result<(String, Value)> {
231        use stygian_browser::page::WaitUntil;
232        use stygian_browser::{BrowserConfig, BrowserPool};
233
234        let start = Instant::now();
235
236        // Step 1: Build browser config from adapter config
237        let browser_config = BrowserConfig {
238            headless: self.config.headless,
239            ..BrowserConfig::default()
240        };
241
242        // Step 2: Create pool (in production this would be cached at adapter level)
243        let pool = BrowserPool::new(browser_config)
244            .await
245            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
246
247        // Step 3: Acquire a browser handle with timeout
248        let handle = match tokio::time::timeout(timeout, pool.acquire()).await {
249            Ok(Ok(h)) => h,
250            Ok(Err(e)) => {
251                return Err(StygianError::Service(ServiceError::Unavailable(format!(
252                    "Browser pool exhausted or unavailable: {e}"
253                ))));
254            }
255            Err(_) => {
256                return Err(StygianError::Service(ServiceError::Unavailable(format!(
257                    "Browser acquisition timeout after {timeout:?}"
258                ))));
259            }
260        };
261
262        // Step 4: Get browser instance and create new page
263        let Some(instance) = handle.browser() else {
264            return Err(StygianError::Service(ServiceError::Unavailable(
265                "Failed to get browser instance after acquisition".to_string(),
266            )));
267        };
268
269        let mut page = instance
270            .new_page()
271            .await
272            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
273
274        // Step 5: Convert WaitStrategy to browser's WaitUntil
275        let wait_condition = match wait {
276            WaitStrategy::DomContentLoaded => WaitUntil::DomContentLoaded,
277            WaitStrategy::NetworkIdle => WaitUntil::NetworkIdle,
278            WaitStrategy::SelectorAppears(selector) => WaitUntil::Selector(selector.clone()),
279            WaitStrategy::Fixed(_duration) => WaitUntil::DomContentLoaded, // Fixed uses timeout, not condition
280        };
281
282        // Step 6: Navigate with specified wait strategy
283        if let Err(e) = page.navigate(url, wait_condition, timeout).await {
284            return Err(StygianError::Service(ServiceError::Unavailable(format!(
285                "Browser navigation failed: {e}"
286            ))));
287        }
288
289        // Step 7: Wait for fixed duration if specified
290        if let WaitStrategy::Fixed(duration) = wait {
291            tokio::time::sleep(*duration).await;
292        }
293
294        // Step 8: Get rendered HTML content
295        let html = page
296            .content()
297            .await
298            .map_err(|e| StygianError::Service(ServiceError::Unavailable(e.to_string())))?;
299
300        let elapsed = start.elapsed();
301
302        // Step 9: Return HTML and metadata
303        // BrowserHandle is automatically returned to pool when dropped
304        Ok((
305            html,
306            json!({
307                "url": url,
308                "navigation_time_ms": elapsed.as_millis(),
309                "wait_strategy": wait.to_string(),
310                "stealth_level": self.config.default_stealth.as_str(),
311                "viewport": {
312                    "width": self.config.viewport_width,
313                    "height": self.config.viewport_height
314                },
315                "rendered": true,
316            }),
317        ))
318    }
319
320    /// Fallback path when the `browser` feature is disabled
321    #[cfg(not(feature = "browser"))]
322    async fn navigate_with_browser(
323        &self,
324        url: &str,
325        _wait: &WaitStrategy,
326        _timeout: Duration,
327    ) -> Result<(String, Value)> {
328        Err(StygianError::Service(ServiceError::Unavailable(format!(
329            "stygian-graph was compiled without the 'browser' feature; \
330             cannot render JavaScript for URL: {url}"
331        ))))
332    }
333}
334
335impl Default for BrowserAdapter {
336    fn default() -> Self {
337        Self::new()
338    }
339}
340
341#[async_trait]
342impl ScrapingService for BrowserAdapter {
343    /// Execute a JavaScript-rendered scrape
344    ///
345    /// Accepts the following `params` keys:
346    /// - `wait_strategy` — how to determine page readiness
347    /// - `stealth_level` — anti-detection level  
348    /// - `timeout_ms` — per-request timeout override
349    ///
350    /// # Example
351    ///
352    /// ```no_run
353    /// use stygian_graph::adapters::browser::BrowserAdapter;
354    /// use stygian_graph::ports::{ScrapingService, ServiceInput};
355    /// use serde_json::json;
356    ///
357    /// # tokio::runtime::Runtime::new().unwrap().block_on(async {
358    /// let adapter = BrowserAdapter::new();
359    /// let input = ServiceInput {
360    ///     url: "https://example.com".to_string(),
361    ///     params: json!({ "wait_strategy": "network_idle", "stealth_level": "advanced" }),
362    /// };
363    /// // let output = adapter.execute(input).await.unwrap();
364    /// # });
365    /// ```
366    async fn execute(&self, input: ServiceInput) -> Result<ServiceOutput> {
367        let wait = WaitStrategy::from_params(&input.params);
368        let _stealth = StealthLevel::from_params(&input.params);
369        let timeout = self.resolve_timeout(&input.params);
370
371        let (html, metadata) = tokio::time::timeout(
372            timeout + Duration::from_secs(5), // outer hard deadline
373            self.navigate_with_browser(&input.url, &wait, timeout),
374        )
375        .await
376        .map_err(|_| {
377            StygianError::Service(ServiceError::Timeout(
378                u64::try_from(timeout.as_millis()).unwrap_or(u64::MAX),
379            ))
380        })??;
381
382        Ok(ServiceOutput {
383            data: html,
384            metadata,
385        })
386    }
387
388    fn name(&self) -> &'static str {
389        "browser"
390    }
391}
392
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::redundant_closure_for_method_calls
)]
mod tests {
    use super::*;

    /// Builds a request for `url` carrying the given JSON `params`.
    fn request(url: &str, params: Value) -> ServiceInput {
        ServiceInput {
            url: url.to_string(),
            params,
        }
    }

    #[test]
    fn test_adapter_default_name() {
        assert_eq!(BrowserAdapter::new().name(), "browser");
    }

    #[test]
    fn test_wait_strategy_from_params_dom() {
        let parsed = WaitStrategy::from_params(&json!({ "wait_strategy": "dom_content_loaded" }));
        assert_eq!(parsed, WaitStrategy::DomContentLoaded);
    }

    #[test]
    fn test_wait_strategy_from_params_network_idle() {
        let parsed = WaitStrategy::from_params(&json!({ "wait_strategy": "network_idle" }));
        assert_eq!(parsed, WaitStrategy::NetworkIdle);
    }

    #[test]
    fn test_wait_strategy_from_params_selector() {
        let parsed = WaitStrategy::from_params(&json!({ "wait_strategy": "selector:#main-content" }));
        let expected = WaitStrategy::SelectorAppears("#main-content".to_string());
        assert_eq!(parsed, expected);
    }

    #[test]
    fn test_wait_strategy_from_params_fixed_ms() {
        let parsed = WaitStrategy::from_params(&json!({ "wait_ms": 500u64 }));
        let expected = WaitStrategy::Fixed(Duration::from_millis(500));
        assert_eq!(parsed, expected);
    }

    #[test]
    fn test_stealth_level_from_params() {
        let cases = [
            (json!({ "stealth_level": "advanced" }), StealthLevel::Advanced),
            (json!({ "stealth_level": "none" }), StealthLevel::None),
            (json!({}), StealthLevel::Basic),
        ];
        for (params, expected) in cases {
            assert_eq!(StealthLevel::from_params(&params), expected);
        }
    }

    #[test]
    fn test_resolve_timeout_override() {
        let resolved = BrowserAdapter::new().resolve_timeout(&json!({ "timeout_ms": 5000u64 }));
        assert_eq!(resolved, Duration::from_secs(5));
    }

    #[test]
    fn test_resolve_timeout_default() {
        let resolved = BrowserAdapter::new().resolve_timeout(&json!({}));
        assert_eq!(resolved, Duration::from_secs(30));
    }

    #[test]
    fn test_config_builder() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig {
            timeout: Duration::from_mins(1),
            max_concurrent: 3,
            block_resources: false,
            ..BrowserAdapterConfig::default()
        });
        assert_eq!(adapter.config.timeout, Duration::from_mins(1));
        assert_eq!(adapter.config.max_concurrent, 3);
    }

    #[allow(clippy::panic)]
    #[tokio::test]
    #[ignore = "requires real Chrome binary"]
    async fn test_execute_returns_service_output_or_unavailable() {
        let adapter = BrowserAdapter::new();
        let outcome = adapter
            .execute(request(
                "https://example.com",
                json!({ "wait_strategy": "dom_content_loaded" }),
            ))
            .await;

        // A successful render and an `Unavailable` error are both acceptable
        match outcome {
            Ok(output) => {
                assert!(!output.data.is_empty(), "output data should not be empty");
                assert!(output.metadata.is_object());
            }
            // expected when headless Chrome is not available
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {}
            Err(e) => panic!("unexpected error: {e}"),
        }
    }

    // Integration tests from T00 Task Requirements

    #[tokio::test]
    #[ignore = "requires real Chrome binary and external network access"]
    async fn browser_adapter_navigates_url() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig::default());
        let params = json!({
            "wait_strategy": "dom_content_loaded",
            "timeout_ms": 30000
        });

        // Should succeed or return graceful unavailable (browser not installed)
        match adapter.execute(request("https://example.com", params)).await {
            Ok(output) => {
                let meta = &output.metadata;
                assert!(!output.data.is_empty());

                let rendered = meta
                    .get("rendered")
                    .and_then(|v| v.as_bool())
                    .unwrap_or(false);
                assert!(rendered);

                assert!(meta.get("navigation_time_ms").is_some());
                assert_eq!(
                    meta.get("url").and_then(|v| v.as_str()),
                    Some("https://example.com")
                );
            }
            // Expected if Chrome not installed
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {}
            Err(e) => panic!("Unexpected error: {e}"),
        }
    }

    #[tokio::test]
    #[ignore = "Requires Chrome installed and network access; may panic if browser unavailable"]
    async fn browser_adapter_respects_timeout() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig {
            timeout: Duration::from_secs(2),
            ..Default::default()
        });

        // The endpoint stalls for 10 seconds, well past the 2 second limit
        let outcome = adapter
            .execute(request(
                "https://httpbin.org/delay/10",
                json!({"timeout_ms": 2000}),
            ))
            .await;

        // Should timeout gracefully or be unavailable (Chrome not installed)
        match outcome {
            // Should not succeed with 2s timeout on 10s delay
            Ok(_) => panic!("Expected timeout or unavailable, got success"),
            // Expected if Chrome not installed or timeout occurred
            Err(StygianError::Service(ServiceError::Unavailable(msg))) => {
                let recognised = ["timeout", "unavailable", "Chrome", "exhausted"]
                    .iter()
                    .any(|needle| msg.contains(*needle));
                assert!(recognised);
            }
            // Also acceptable - explicit timeout
            Err(StygianError::Service(ServiceError::Timeout(_))) => {}
            // Any other error is acceptable (network, browser init, etc)
            Err(e) => eprintln!("Got acceptable error: {e}"),
        }
    }

    #[tokio::test]
    #[ignore = "requires real Chrome binary"]
    async fn browser_adapter_invalid_url() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig::default());

        let outcome = adapter.execute(request("not-a-valid-url", json!({}))).await;

        // Should surface browser error gracefully
        assert!(outcome.is_err());
    }

    #[tokio::test]
    #[ignore = "requires real Chrome binary and external network access"]
    async fn browser_adapter_wait_strategy_selector() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig::default());
        let params = json!({
            "wait_strategy": "selector:body"
        });

        match adapter.execute(request("https://example.com", params)).await {
            Ok(output) => {
                let reported = output
                    .metadata
                    .get("wait_strategy")
                    .and_then(|v| v.as_str());
                assert_eq!(reported, Some("selector_appears(body)"));
            }
            // Expected if Chrome not installed
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {}
            Err(e) => panic!("Unexpected error: {e}"),
        }
    }

    #[tokio::test]
    #[ignore = "requires real Chrome binary and external network access"]
    async fn browser_adapter_metadata_complete() {
        let adapter = BrowserAdapter::with_config(BrowserAdapterConfig {
            default_stealth: StealthLevel::Advanced,
            user_agent: Some("Mozilla/5.0".to_string()),
            viewport_width: 1440,
            viewport_height: 900,
            ..Default::default()
        });

        match adapter.execute(request("https://example.com", json!({}))).await {
            Ok(output) => {
                let meta = &output.metadata;
                assert_eq!(
                    meta.get("url").and_then(|v| v.as_str()),
                    Some("https://example.com")
                );
                assert_eq!(
                    meta.get("stealth_level").and_then(|v| v.as_str()),
                    Some("advanced")
                );
                assert!(meta.get("viewport").is_some());
                assert!(meta.get("navigation_time_ms").is_some());

                let viewport = meta.get("viewport").expect("viewport exists");
                assert_eq!(viewport.get("width").and_then(|v| v.as_u64()), Some(1440));
                assert_eq!(viewport.get("height").and_then(|v| v.as_u64()), Some(900));
            }
            // Expected if Chrome not installed
            Err(StygianError::Service(ServiceError::Unavailable(_))) => {}
            Err(e) => panic!("Unexpected error: {e}"),
        }
    }
}