omnivore_core/
browser.rs

1use anyhow::{Context, Result};
2use thirtyfour::prelude::*;
3use thirtyfour::{ChromeCapabilities, FirefoxCapabilities};
4use std::time::Duration;
5use tokio::time::sleep;
6use serde::{Deserialize, Serialize};
7use tracing::{info, warn, debug};
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct BrowserConfig {
11    pub browser_type: BrowserType,
12    pub headless: bool,
13    pub page_load_timeout: u64,
14    pub script_timeout: u64,
15    pub implicit_wait: u64,
16    pub driver_url: Option<String>,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
20#[serde(rename_all = "lowercase")]
21pub enum BrowserType {
22    Chrome,
23    Firefox,
24}
25
26impl Default for BrowserConfig {
27    fn default() -> Self {
28        Self {
29            browser_type: BrowserType::Chrome,
30            headless: true,
31            page_load_timeout: 30,
32            script_timeout: 30,
33            implicit_wait: 10,
34            driver_url: None,
35        }
36    }
37}
38
39pub struct BrowserEngine {
40    driver: WebDriver,
41    #[allow(dead_code)]
42    config: BrowserConfig,
43}
44
45impl BrowserEngine {
46    pub async fn new(config: BrowserConfig) -> Result<Self> {
47        let driver_url = config.driver_url.clone()
48            .unwrap_or_else(|| "http://localhost:4444".to_string());
49        
50        let driver = match config.browser_type {
51            BrowserType::Chrome => {
52                let mut caps = ChromeCapabilities::new();
53                if config.headless {
54                    caps.add_arg("--headless")?;
55                }
56                caps.add_arg("--no-sandbox")?;
57                caps.add_arg("--disable-dev-shm-usage")?;
58                caps.add_arg("--disable-gpu")?;
59                caps.add_arg("--window-size=1920,1080")?;
60                caps.add_arg("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")?;
61                
62                WebDriver::new(&driver_url, caps).await
63                    .context("Failed to create Chrome WebDriver")?
64            }
65            BrowserType::Firefox => {
66                let mut caps = FirefoxCapabilities::new();
67                if config.headless {
68                    caps.add_arg("-headless")?;
69                }
70                caps.add_arg("-width=1920")?;
71                caps.add_arg("-height=1080")?;
72                
73                WebDriver::new(&driver_url, caps).await
74                    .context("Failed to create Firefox WebDriver")?
75            }
76        };
77        
78        // Set timeouts
79        driver.set_page_load_timeout(Duration::from_secs(config.page_load_timeout)).await?;
80        driver.set_script_timeout(Duration::from_secs(config.script_timeout)).await?;
81        driver.set_implicit_wait_timeout(Duration::from_secs(config.implicit_wait)).await?;
82        
83        Ok(Self { driver, config })
84    }
85    
86    pub async fn navigate(&self, url: &str) -> Result<()> {
87        self.driver.goto(url).await
88            .context(format!("Failed to navigate to {}", url))?;
89        
90        // Wait for page to be ready
91        self.wait_for_page_ready().await?;
92        
93        Ok(())
94    }
95    
96    pub async fn wait_for_page_ready(&self) -> Result<()> {
97        let script = r#"
98            return document.readyState === 'complete' && 
99                   (typeof jQuery === 'undefined' || jQuery.active === 0) &&
100                   (typeof angular === 'undefined' || !angular.element(document).injector() || 
101                    angular.element(document).injector().get('$http').pendingRequests.length === 0);
102        "#;
103        
104        let max_wait = Duration::from_secs(30);
105        let start = std::time::Instant::now();
106        
107        loop {
108            if start.elapsed() > max_wait {
109                warn!("Page load timeout exceeded");
110                break;
111            }
112            
113            match self.driver.execute(script, vec![]).await {
114                Ok(ret) => {
115                    if let Some(ready) = ret.json().as_bool() {
116                        if ready {
117                            debug!("Page is ready");
118                            break;
119                        }
120                    }
121                }
122                Err(_) => {
123                    // Script execution failed, page might not have jQuery/Angular
124                    // Check basic readyState
125                    let basic_script = "return document.readyState === 'complete';";
126                    if let Ok(ret) = self.driver.execute(basic_script, vec![]).await {
127                        if let Some(ready) = ret.json().as_bool() {
128                            if ready {
129                                break;
130                            }
131                        }
132                    }
133                }
134            }
135            
136            sleep(Duration::from_millis(500)).await;
137        }
138        
139        Ok(())
140    }
141    
142    pub async fn find_dropdowns(&self) -> Result<Vec<WebElement>> {
143        let selectors = vec![
144            "select",
145            "div[role='combobox']",
146            "div[role='listbox']",
147            ".dropdown",
148            ".select-wrapper",
149            "[data-toggle='dropdown']",
150        ];
151        
152        let mut dropdowns = Vec::new();
153        
154        for selector in selectors {
155            match self.driver.find_all(By::Css(selector)).await {
156                Ok(elements) => dropdowns.extend(elements),
157                Err(_) => continue,
158            }
159        }
160        
161        Ok(dropdowns)
162    }
163    
164    pub async fn find_filters(&self) -> Result<Vec<WebElement>> {
165        let selectors = vec![
166            "input[type='checkbox']",
167            "input[type='radio']",
168            ".filter",
169            ".filter-option",
170            "[data-filter]",
171            "[role='checkbox']",
172            "[role='radio']",
173        ];
174        
175        let mut filters = Vec::new();
176        
177        for selector in selectors {
178            match self.driver.find_all(By::Css(selector)).await {
179                Ok(elements) => filters.extend(elements),
180                Err(_) => continue,
181            }
182        }
183        
184        Ok(filters)
185    }
186    
187    pub async fn interact_with_dropdown(&self, dropdown: &WebElement) -> Result<Vec<String>> {
188        let mut contents = Vec::new();
189        
190        // Check if it's a select element
191        if dropdown.tag_name().await?.to_lowercase() == "select" {
192            let options = dropdown.find_all(By::Css("option")).await?;
193            
194            for option in options {
195                // Click the option
196                if let Ok(_) = option.click().await {
197                    sleep(Duration::from_millis(1000)).await;
198                    self.wait_for_page_ready().await?;
199                    
200                    // Get page content
201                    let content = self.get_page_content().await?;
202                    contents.push(content);
203                }
204            }
205        } else {
206            // Handle custom dropdowns
207            if let Ok(_) = dropdown.click().await {
208                sleep(Duration::from_millis(500)).await;
209                
210                // Look for dropdown items
211                let item_selectors = vec![
212                    "li",
213                    ".dropdown-item",
214                    "[role='option']",
215                    ".option",
216                ];
217                
218                for selector in item_selectors {
219                    if let Ok(items) = self.driver.find_all(By::Css(selector)).await {
220                        for item in items {
221                            if let Ok(_) = item.click().await {
222                                sleep(Duration::from_millis(1000)).await;
223                                self.wait_for_page_ready().await?;
224                                
225                                let content = self.get_page_content().await?;
226                                contents.push(content);
227                                
228                                // Re-open dropdown for next item
229                                dropdown.click().await.ok();
230                                sleep(Duration::from_millis(500)).await;
231                            }
232                        }
233                        break;
234                    }
235                }
236            }
237        }
238        
239        Ok(contents)
240    }
241    
242    pub async fn interact_with_filter(&self, filter: &WebElement) -> Result<String> {
243        filter.click().await?;
244        sleep(Duration::from_millis(1000)).await;
245        self.wait_for_page_ready().await?;
246        
247        self.get_page_content().await
248    }
249    
250    pub async fn get_page_content(&self) -> Result<String> {
251        let html = self.driver.source().await
252            .context("Failed to get page source")?;
253        Ok(html)
254    }
255    
256    pub async fn scroll_to_bottom(&self) -> Result<()> {
257        let script = "window.scrollTo(0, document.body.scrollHeight);";
258        self.driver.execute(script, vec![]).await?;
259        sleep(Duration::from_millis(1000)).await;
260        Ok(())
261    }
262    
263    pub async fn infinite_scroll(&self, max_scrolls: u32) -> Result<()> {
264        let mut last_height: i64 = 0;
265        
266        for _ in 0..max_scrolls {
267            // Get current scroll height
268            let script = "return document.body.scrollHeight;";
269            let height_result = self.driver.execute(script, vec![]).await?;
270            let current_height = height_result.json().as_i64().unwrap_or(0);
271            
272            if current_height == last_height {
273                // No more content to load
274                break;
275            }
276            
277            last_height = current_height;
278            
279            // Scroll to bottom
280            self.scroll_to_bottom().await?;
281            
282            // Wait for new content to load
283            sleep(Duration::from_secs(2)).await;
284        }
285        
286        Ok(())
287    }
288    
289    pub async fn extract_dynamic_content(&self, url: &str) -> Result<DynamicContent> {
290        info!("Extracting dynamic content from: {}", url);
291        
292        self.navigate(url).await?;
293        
294        // Check for infinite scroll
295        let initial_height = self.driver
296            .execute("return document.body.scrollHeight;", vec![])
297            .await?
298            .json()
299            .as_i64()
300            .unwrap_or(0);
301        
302        self.scroll_to_bottom().await?;
303        sleep(Duration::from_secs(2)).await;
304        
305        let new_height = self.driver
306            .execute("return document.body.scrollHeight;", vec![])
307            .await?
308            .json()
309            .as_i64()
310            .unwrap_or(0);
311        
312        let has_infinite_scroll = new_height > initial_height;
313        
314        if has_infinite_scroll {
315            info!("Detected infinite scroll, loading all content...");
316            self.infinite_scroll(10).await?;
317        }
318        
319        // Get main content
320        let main_content = self.get_page_content().await?;
321        
322        // Find interactive elements
323        let dropdowns = self.find_dropdowns().await?;
324        let filters = self.find_filters().await?;
325        
326        info!("Found {} dropdowns and {} filters", dropdowns.len(), filters.len());
327        
328        let mut dropdown_contents = Vec::new();
329        let mut filter_contents = Vec::new();
330        
331        // Interact with dropdowns
332        for (idx, dropdown) in dropdowns.iter().enumerate() {
333            info!("Processing dropdown {}/{}", idx + 1, dropdowns.len());
334            match self.interact_with_dropdown(dropdown).await {
335                Ok(contents) => {
336                    for content in contents {
337                        dropdown_contents.push(DropdownContent {
338                            index: idx,
339                            content,
340                        });
341                    }
342                }
343                Err(e) => warn!("Failed to interact with dropdown {}: {}", idx, e),
344            }
345        }
346        
347        // Interact with filters
348        for (idx, filter) in filters.iter().enumerate() {
349            info!("Processing filter {}/{}", idx + 1, filters.len());
350            match self.interact_with_filter(filter).await {
351                Ok(content) => {
352                    filter_contents.push(FilterContent {
353                        index: idx,
354                        content,
355                    });
356                }
357                Err(e) => warn!("Failed to interact with filter {}: {}", idx, e),
358            }
359        }
360        
361        Ok(DynamicContent {
362            url: url.to_string(),
363            main_content,
364            dropdown_contents,
365            filter_contents,
366            has_infinite_scroll,
367        })
368    }
369    
370    pub async fn quit(self) -> Result<()> {
371        self.driver.quit().await
372            .context("Failed to quit WebDriver")?;
373        Ok(())
374    }
375}
376
377#[derive(Debug, Serialize, Deserialize)]
378pub struct DynamicContent {
379    pub url: String,
380    pub main_content: String,
381    pub dropdown_contents: Vec<DropdownContent>,
382    pub filter_contents: Vec<FilterContent>,
383    pub has_infinite_scroll: bool,
384}
385
386#[derive(Debug, Serialize, Deserialize)]
387pub struct DropdownContent {
388    pub index: usize,
389    pub content: String,
390}
391
392#[derive(Debug, Serialize, Deserialize)]
393pub struct FilterContent {
394    pub index: usize,
395    pub content: String,
396}
397
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins every field of `BrowserConfig::default()`.
    ///
    /// Building the default is synchronous, so a plain `#[test]` is used and
    /// no async runtime is started.
    #[test]
    fn test_browser_config() {
        let config = BrowserConfig::default();

        assert!(matches!(config.browser_type, BrowserType::Chrome));
        assert!(config.headless);
        assert_eq!(config.page_load_timeout, 30);
        assert_eq!(config.script_timeout, 30);
        assert_eq!(config.implicit_wait, 10);
        assert!(config.driver_url.is_none());
    }
}
408}