omnivore_core/crawler/
browser.rs

1use crate::{CrawlResult, Error, Result};
2use crate::extractor::ContentExtractor;
3use thirtyfour::prelude::*;
4use url::Url;
5use tokio::time::sleep;
6use std::time::Duration;
7use tracing::{info, warn, debug};
8use serde::{Serialize, Deserialize};
9
10pub struct BrowserEngine {
11    driver: Option<WebDriver>,
12    headless: bool,
13}
14
15#[derive(Debug, Serialize, Deserialize)]
16pub struct DynamicContent {
17    pub url: String,
18    pub main_content: String,
19    pub dropdown_contents: Vec<DropdownContent>,
20    pub filter_contents: Vec<FilterContent>,
21    pub has_infinite_scroll: bool,
22}
23
24#[derive(Debug, Serialize, Deserialize)]
25pub struct DropdownContent {
26    pub index: usize,
27    pub label: Option<String>,
28    pub content: String,
29}
30
31#[derive(Debug, Serialize, Deserialize)]
32pub struct FilterContent {
33    pub index: usize,
34    pub label: Option<String>,
35    pub content: String,
36}
37
38impl BrowserEngine {
39    pub async fn new() -> Result<Self> {
40        Ok(Self { 
41            driver: None,
42            headless: true,
43        })
44    }
45    
46    pub async fn new_with_options(headless: bool) -> Result<Self> {
47        Ok(Self {
48            driver: None,
49            headless,
50        })
51    }
52
53    pub async fn connect(&mut self) -> Result<()> {
54        let mut caps = DesiredCapabilities::chrome();
55        
56        // Add Chrome options
57        let chrome_args = if self.headless {
58            vec![
59                "--headless",
60                "--no-sandbox",
61                "--disable-dev-shm-usage",
62                "--disable-gpu",
63                "--window-size=1920,1080",
64                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
65            ]
66        } else {
67            vec![
68                "--no-sandbox",
69                "--disable-dev-shm-usage",
70                "--window-size=1920,1080",
71                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
72            ]
73        };
74        
75        for arg in chrome_args {
76            caps.add_arg(arg).map_err(|e| Error::Browser(format!("Failed to add Chrome arg: {e}")))?;
77        }
78
79        match WebDriver::new("http://localhost:9515", caps).await {
80            Ok(driver) => {
81                self.driver = Some(driver);
82                Ok(())
83            }
84            Err(e) => Err(Error::Browser(format!("Failed to connect to browser: {e}"))),
85        }
86    }
87
88    pub async fn crawl_dynamic(&self, url: Url) -> Result<CrawlResult> {
89        let driver = self
90            .driver
91            .as_ref()
92            .ok_or_else(|| Error::Browser("Browser not connected".to_string()))?;
93
94        driver
95            .goto(url.as_str())
96            .await
97            .map_err(|e| Error::Browser(format!("Navigation failed: {e}")))?;
98
99        // Wait for page to be ready
100        self.wait_for_page_ready(driver).await?;
101
102        let content = driver
103            .source()
104            .await
105            .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
106
107        let links = self.extract_links_js(driver, &url).await?;
108
109        // Extract clean content
110        let extractor = ContentExtractor::new();
111        let cleaned_content = Some(extractor.extract_clean_content(&content));
112
113        Ok(CrawlResult {
114            url: url.to_string(),
115            status_code: 200,
116            content,
117            cleaned_content,
118            headers: std::collections::HashMap::new(),
119            extracted_data: serde_json::json!({}),
120            links: links.into_iter().map(|u| u.to_string()).collect(),
121            crawled_at: chrono::Utc::now(),
122        })
123    }
124    
125    pub async fn crawl_with_interactions(&self, url: Url) -> Result<DynamicContent> {
126        let driver = self
127            .driver
128            .as_ref()
129            .ok_or_else(|| Error::Browser("Browser not connected".to_string()))?;
130
131        info!("Crawling with interactions: {}", url);
132        
133        driver
134            .goto(url.as_str())
135            .await
136            .map_err(|e| Error::Browser(format!("Navigation failed: {e}")))?;
137
138        self.wait_for_page_ready(driver).await?;
139        
140        // Check for infinite scroll
141        let has_infinite_scroll = self.detect_infinite_scroll(driver).await?;
142        if has_infinite_scroll {
143            info!("Detected infinite scroll, loading all content...");
144            self.handle_infinite_scroll(driver, 10).await?;
145        }
146        
147        // Get main content
148        let main_content = driver
149            .source()
150            .await
151            .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
152        
153        // Find and interact with dropdowns
154        let dropdown_contents = self.interact_with_dropdowns(driver).await?;
155        
156        // Find and interact with filters
157        let filter_contents = self.interact_with_filters(driver).await?;
158        
159        Ok(DynamicContent {
160            url: url.to_string(),
161            main_content,
162            dropdown_contents,
163            filter_contents,
164            has_infinite_scroll,
165        })
166    }
167    
168    async fn wait_for_page_ready(&self, driver: &WebDriver) -> Result<()> {
169        let script = r#"
170            return document.readyState === 'complete' && 
171                   (typeof jQuery === 'undefined' || jQuery.active === 0) &&
172                   (typeof angular === 'undefined' || !angular.element(document).injector() || 
173                    angular.element(document).injector().get('$http').pendingRequests.length === 0);
174        "#;
175        
176        let max_wait = Duration::from_secs(30);
177        let start = std::time::Instant::now();
178        
179        loop {
180            if start.elapsed() > max_wait {
181                warn!("Page load timeout exceeded");
182                break;
183            }
184            
185            match driver.execute(script, vec![]).await {
186                Ok(ret) => {
187                    if let Some(ready) = ret.json().as_bool() {
188                        if ready {
189                            debug!("Page is ready");
190                            break;
191                        }
192                    }
193                }
194                Err(_) => {
195                    // Try basic readyState check
196                    let basic_script = "return document.readyState === 'complete';";
197                    if let Ok(ret) = driver.execute(basic_script, vec![]).await {
198                        if let Some(ready) = ret.json().as_bool() {
199                            if ready {
200                                break;
201                            }
202                        }
203                    }
204                }
205            }
206            
207            sleep(Duration::from_millis(500)).await;
208        }
209        
210        Ok(())
211    }
212    
213    async fn detect_infinite_scroll(&self, driver: &WebDriver) -> Result<bool> {
214        let initial_height = driver
215            .execute("return document.body.scrollHeight;", vec![])
216            .await
217            .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?  
218            .json()
219            .as_i64()
220            .unwrap_or(0);
221        
222        driver.execute("window.scrollTo(0, document.body.scrollHeight);", vec![])
223            .await
224            .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?;
225        sleep(Duration::from_secs(2)).await;
226        
227        let new_height = driver
228            .execute("return document.body.scrollHeight;", vec![])
229            .await
230            .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?  
231            .json()
232            .as_i64()
233            .unwrap_or(0);
234        
235        Ok(new_height > initial_height)
236    }
237    
238    async fn handle_infinite_scroll(&self, driver: &WebDriver, max_scrolls: u32) -> Result<()> {
239        let mut last_height: i64 = 0;
240        
241        for _ in 0..max_scrolls {
242            let current_height = driver
243                .execute("return document.body.scrollHeight;", vec![])
244                .await
245                .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?
246                .json()
247                .as_i64()
248                .unwrap_or(0);
249            
250            if current_height == last_height {
251                break;
252            }
253            
254            last_height = current_height;
255            driver.execute("window.scrollTo(0, document.body.scrollHeight);", vec![])
256            .await
257            .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?;
258            sleep(Duration::from_secs(2)).await;
259        }
260        
261        Ok(())
262    }
263    
264    async fn interact_with_dropdowns(&self, driver: &WebDriver) -> Result<Vec<DropdownContent>> {
265        let mut contents = Vec::new();
266        
267        // Find select elements
268        let selects = driver.find_all(By::Css("select")).await.unwrap_or_default();
269        
270        for (idx, select) in selects.iter().enumerate() {
271            // Get label if available
272            let label = select.attr("aria-label").await.ok().flatten();
273            
274            // Get all options
275            if let Ok(options) = select.find_all(By::Css("option")).await {
276                for option in options.iter().skip(1) { // Skip first (usually default) option
277                    if let Ok(_) = option.click().await {
278                        sleep(Duration::from_millis(1000)).await;
279                        self.wait_for_page_ready(driver).await?;
280                        
281                        let content = driver.source().await
282                            .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
283                        contents.push(DropdownContent {
284                            index: idx,
285                            label: label.clone(),
286                            content,
287                        });
288                    }
289                }
290            }
291        }
292        
293        // Find custom dropdowns
294        let custom_dropdowns = driver.find_all(By::Css("[role='combobox'], .dropdown, [data-toggle='dropdown']"))
295            .await.unwrap_or_default();
296        
297        for (idx, dropdown) in custom_dropdowns.iter().enumerate() {
298            let label = dropdown.attr("aria-label").await.ok().flatten();
299            
300            if let Ok(_) = dropdown.click().await {
301                sleep(Duration::from_millis(500)).await;
302                
303                // Look for dropdown items
304                if let Ok(items) = driver.find_all(By::Css(".dropdown-item, [role='option'], li")).await {
305                    for item in items.iter().take(5) { // Limit to first 5 items
306                        if let Ok(_) = item.click().await {
307                            sleep(Duration::from_millis(1000)).await;
308                            self.wait_for_page_ready(driver).await?;
309                            
310                            let content = driver.source().await
311                            .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
312                            contents.push(DropdownContent {
313                                index: selects.len() + idx,
314                                label: label.clone(),
315                                content,
316                            });
317                            
318                            // Re-open dropdown
319                            dropdown.click().await.ok();
320                            sleep(Duration::from_millis(500)).await;
321                        }
322                    }
323                }
324            }
325        }
326        
327        info!("Extracted {} dropdown variations", contents.len());
328        Ok(contents)
329    }
330    
331    async fn interact_with_filters(&self, driver: &WebDriver) -> Result<Vec<FilterContent>> {
332        let mut contents = Vec::new();
333        
334        // Find checkboxes and radio buttons
335        let filters = driver.find_all(By::Css("input[type='checkbox'], input[type='radio'], [role='checkbox'], [role='radio']"))
336            .await.unwrap_or_default();
337        
338        for (idx, filter) in filters.iter().enumerate().take(10) { // Limit to first 10 filters
339            let label = filter.attr("aria-label").await.ok().flatten();
340            
341            if let Ok(_) = filter.click().await {
342                sleep(Duration::from_millis(1000)).await;
343                self.wait_for_page_ready(driver).await?;
344                
345                let content = driver.source().await
346                            .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
347                contents.push(FilterContent {
348                    index: idx,
349                    label,
350                    content,
351                });
352            }
353        }
354        
355        info!("Extracted {} filter variations", contents.len());
356        Ok(contents)
357    }
358
359    async fn extract_links_js(&self, driver: &WebDriver, _base_url: &Url) -> Result<Vec<Url>> {
360        let script = r#"
361            return Array.from(document.querySelectorAll('a[href]'))
362                .map(a => a.href)
363                .filter(href => href.startsWith('http'));
364        "#;
365
366        let links_value = driver
367            .execute(script, vec![])
368            .await
369            .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?;
370
371        let mut links = Vec::new();
372
373        // Convert the ScriptRet value to JSON
374        let json_value = links_value.json();
375        if let Some(array) = json_value.as_array() {
376            for value in array {
377                if let Some(href) = value.as_str() {
378                    if let Ok(url) = Url::parse(href) {
379                        links.push(url);
380                    }
381                }
382            }
383        }
384
385        Ok(links)
386    }
387
388    pub async fn disconnect(&mut self) -> Result<()> {
389        if let Some(driver) = self.driver.take() {
390            driver
391                .quit()
392                .await
393                .map_err(|e| Error::Browser(format!("Failed to quit browser: {e}")))?;
394        }
395        Ok(())
396    }
397}
398
399impl Drop for BrowserEngine {
400    fn drop(&mut self) {
401        if let Some(driver) = self.driver.take() {
402            tokio::task::spawn(async move {
403                let _ = driver.quit().await;
404            });
405        }
406    }
407}