1use anyhow::{Context, Result};
2use thirtyfour::prelude::*;
3use thirtyfour::{ChromeCapabilities, FirefoxCapabilities};
4use std::time::Duration;
5use tokio::time::sleep;
6use serde::{Deserialize, Serialize};
7use tracing::{info, warn, debug};
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct BrowserConfig {
11 pub browser_type: BrowserType,
12 pub headless: bool,
13 pub page_load_timeout: u64,
14 pub script_timeout: u64,
15 pub implicit_wait: u64,
16 pub driver_url: Option<String>,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
20#[serde(rename_all = "lowercase")]
21pub enum BrowserType {
22 Chrome,
23 Firefox,
24}
25
26impl Default for BrowserConfig {
27 fn default() -> Self {
28 Self {
29 browser_type: BrowserType::Chrome,
30 headless: true,
31 page_load_timeout: 30,
32 script_timeout: 30,
33 implicit_wait: 10,
34 driver_url: None,
35 }
36 }
37}
38
39pub struct BrowserEngine {
40 driver: WebDriver,
41 #[allow(dead_code)]
42 config: BrowserConfig,
43}
44
45impl BrowserEngine {
46 pub async fn new(config: BrowserConfig) -> Result<Self> {
47 let driver_url = config.driver_url.clone()
48 .unwrap_or_else(|| "http://localhost:4444".to_string());
49
50 let driver = match config.browser_type {
51 BrowserType::Chrome => {
52 let mut caps = ChromeCapabilities::new();
53 if config.headless {
54 caps.add_arg("--headless")?;
55 }
56 caps.add_arg("--no-sandbox")?;
57 caps.add_arg("--disable-dev-shm-usage")?;
58 caps.add_arg("--disable-gpu")?;
59 caps.add_arg("--window-size=1920,1080")?;
60 caps.add_arg("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")?;
61
62 WebDriver::new(&driver_url, caps).await
63 .context("Failed to create Chrome WebDriver")?
64 }
65 BrowserType::Firefox => {
66 let mut caps = FirefoxCapabilities::new();
67 if config.headless {
68 caps.add_arg("-headless")?;
69 }
70 caps.add_arg("-width=1920")?;
71 caps.add_arg("-height=1080")?;
72
73 WebDriver::new(&driver_url, caps).await
74 .context("Failed to create Firefox WebDriver")?
75 }
76 };
77
78 driver.set_page_load_timeout(Duration::from_secs(config.page_load_timeout)).await?;
80 driver.set_script_timeout(Duration::from_secs(config.script_timeout)).await?;
81 driver.set_implicit_wait_timeout(Duration::from_secs(config.implicit_wait)).await?;
82
83 Ok(Self { driver, config })
84 }
85
86 pub async fn navigate(&self, url: &str) -> Result<()> {
87 self.driver.goto(url).await
88 .context(format!("Failed to navigate to {}", url))?;
89
90 self.wait_for_page_ready().await?;
92
93 Ok(())
94 }
95
96 pub async fn wait_for_page_ready(&self) -> Result<()> {
97 let script = r#"
98 return document.readyState === 'complete' &&
99 (typeof jQuery === 'undefined' || jQuery.active === 0) &&
100 (typeof angular === 'undefined' || !angular.element(document).injector() ||
101 angular.element(document).injector().get('$http').pendingRequests.length === 0);
102 "#;
103
104 let max_wait = Duration::from_secs(30);
105 let start = std::time::Instant::now();
106
107 loop {
108 if start.elapsed() > max_wait {
109 warn!("Page load timeout exceeded");
110 break;
111 }
112
113 match self.driver.execute(script, vec![]).await {
114 Ok(ret) => {
115 if let Some(ready) = ret.json().as_bool() {
116 if ready {
117 debug!("Page is ready");
118 break;
119 }
120 }
121 }
122 Err(_) => {
123 let basic_script = "return document.readyState === 'complete';";
126 if let Ok(ret) = self.driver.execute(basic_script, vec![]).await {
127 if let Some(ready) = ret.json().as_bool() {
128 if ready {
129 break;
130 }
131 }
132 }
133 }
134 }
135
136 sleep(Duration::from_millis(500)).await;
137 }
138
139 Ok(())
140 }
141
142 pub async fn find_dropdowns(&self) -> Result<Vec<WebElement>> {
143 let selectors = vec![
144 "select",
145 "div[role='combobox']",
146 "div[role='listbox']",
147 ".dropdown",
148 ".select-wrapper",
149 "[data-toggle='dropdown']",
150 ];
151
152 let mut dropdowns = Vec::new();
153
154 for selector in selectors {
155 match self.driver.find_all(By::Css(selector)).await {
156 Ok(elements) => dropdowns.extend(elements),
157 Err(_) => continue,
158 }
159 }
160
161 Ok(dropdowns)
162 }
163
164 pub async fn find_filters(&self) -> Result<Vec<WebElement>> {
165 let selectors = vec![
166 "input[type='checkbox']",
167 "input[type='radio']",
168 ".filter",
169 ".filter-option",
170 "[data-filter]",
171 "[role='checkbox']",
172 "[role='radio']",
173 ];
174
175 let mut filters = Vec::new();
176
177 for selector in selectors {
178 match self.driver.find_all(By::Css(selector)).await {
179 Ok(elements) => filters.extend(elements),
180 Err(_) => continue,
181 }
182 }
183
184 Ok(filters)
185 }
186
187 pub async fn interact_with_dropdown(&self, dropdown: &WebElement) -> Result<Vec<String>> {
188 let mut contents = Vec::new();
189
190 if dropdown.tag_name().await?.to_lowercase() == "select" {
192 let options = dropdown.find_all(By::Css("option")).await?;
193
194 for option in options {
195 if let Ok(_) = option.click().await {
197 sleep(Duration::from_millis(1000)).await;
198 self.wait_for_page_ready().await?;
199
200 let content = self.get_page_content().await?;
202 contents.push(content);
203 }
204 }
205 } else {
206 if let Ok(_) = dropdown.click().await {
208 sleep(Duration::from_millis(500)).await;
209
210 let item_selectors = vec![
212 "li",
213 ".dropdown-item",
214 "[role='option']",
215 ".option",
216 ];
217
218 for selector in item_selectors {
219 if let Ok(items) = self.driver.find_all(By::Css(selector)).await {
220 for item in items {
221 if let Ok(_) = item.click().await {
222 sleep(Duration::from_millis(1000)).await;
223 self.wait_for_page_ready().await?;
224
225 let content = self.get_page_content().await?;
226 contents.push(content);
227
228 dropdown.click().await.ok();
230 sleep(Duration::from_millis(500)).await;
231 }
232 }
233 break;
234 }
235 }
236 }
237 }
238
239 Ok(contents)
240 }
241
242 pub async fn interact_with_filter(&self, filter: &WebElement) -> Result<String> {
243 filter.click().await?;
244 sleep(Duration::from_millis(1000)).await;
245 self.wait_for_page_ready().await?;
246
247 self.get_page_content().await
248 }
249
250 pub async fn get_page_content(&self) -> Result<String> {
251 let html = self.driver.source().await
252 .context("Failed to get page source")?;
253 Ok(html)
254 }
255
256 pub async fn scroll_to_bottom(&self) -> Result<()> {
257 let script = "window.scrollTo(0, document.body.scrollHeight);";
258 self.driver.execute(script, vec![]).await?;
259 sleep(Duration::from_millis(1000)).await;
260 Ok(())
261 }
262
263 pub async fn infinite_scroll(&self, max_scrolls: u32) -> Result<()> {
264 let mut last_height: i64 = 0;
265
266 for _ in 0..max_scrolls {
267 let script = "return document.body.scrollHeight;";
269 let height_result = self.driver.execute(script, vec![]).await?;
270 let current_height = height_result.json().as_i64().unwrap_or(0);
271
272 if current_height == last_height {
273 break;
275 }
276
277 last_height = current_height;
278
279 self.scroll_to_bottom().await?;
281
282 sleep(Duration::from_secs(2)).await;
284 }
285
286 Ok(())
287 }
288
289 pub async fn extract_dynamic_content(&self, url: &str) -> Result<DynamicContent> {
290 info!("Extracting dynamic content from: {}", url);
291
292 self.navigate(url).await?;
293
294 let initial_height = self.driver
296 .execute("return document.body.scrollHeight;", vec![])
297 .await?
298 .json()
299 .as_i64()
300 .unwrap_or(0);
301
302 self.scroll_to_bottom().await?;
303 sleep(Duration::from_secs(2)).await;
304
305 let new_height = self.driver
306 .execute("return document.body.scrollHeight;", vec![])
307 .await?
308 .json()
309 .as_i64()
310 .unwrap_or(0);
311
312 let has_infinite_scroll = new_height > initial_height;
313
314 if has_infinite_scroll {
315 info!("Detected infinite scroll, loading all content...");
316 self.infinite_scroll(10).await?;
317 }
318
319 let main_content = self.get_page_content().await?;
321
322 let dropdowns = self.find_dropdowns().await?;
324 let filters = self.find_filters().await?;
325
326 info!("Found {} dropdowns and {} filters", dropdowns.len(), filters.len());
327
328 let mut dropdown_contents = Vec::new();
329 let mut filter_contents = Vec::new();
330
331 for (idx, dropdown) in dropdowns.iter().enumerate() {
333 info!("Processing dropdown {}/{}", idx + 1, dropdowns.len());
334 match self.interact_with_dropdown(dropdown).await {
335 Ok(contents) => {
336 for content in contents {
337 dropdown_contents.push(DropdownContent {
338 index: idx,
339 content,
340 });
341 }
342 }
343 Err(e) => warn!("Failed to interact with dropdown {}: {}", idx, e),
344 }
345 }
346
347 for (idx, filter) in filters.iter().enumerate() {
349 info!("Processing filter {}/{}", idx + 1, filters.len());
350 match self.interact_with_filter(filter).await {
351 Ok(content) => {
352 filter_contents.push(FilterContent {
353 index: idx,
354 content,
355 });
356 }
357 Err(e) => warn!("Failed to interact with filter {}: {}", idx, e),
358 }
359 }
360
361 Ok(DynamicContent {
362 url: url.to_string(),
363 main_content,
364 dropdown_contents,
365 filter_contents,
366 has_infinite_scroll,
367 })
368 }
369
370 pub async fn quit(self) -> Result<()> {
371 self.driver.quit().await
372 .context("Failed to quit WebDriver")?;
373 Ok(())
374 }
375}
376
377#[derive(Debug, Serialize, Deserialize)]
378pub struct DynamicContent {
379 pub url: String,
380 pub main_content: String,
381 pub dropdown_contents: Vec<DropdownContent>,
382 pub filter_contents: Vec<FilterContent>,
383 pub has_infinite_scroll: bool,
384}
385
386#[derive(Debug, Serialize, Deserialize)]
387pub struct DropdownContent {
388 pub index: usize,
389 pub content: String,
390}
391
392#[derive(Debug, Serialize, Deserialize)]
393pub struct FilterContent {
394 pub index: usize,
395 pub content: String,
396}
397
#[cfg(test)]
mod tests {
    use super::*;

    /// `BrowserConfig::default()` yields headless Chrome with the documented
    /// timeouts and no explicit driver URL.
    // A plain synchronous test: nothing here awaits, so no async runtime is needed.
    #[test]
    fn test_browser_config() {
        let config = BrowserConfig::default();

        assert!(matches!(config.browser_type, BrowserType::Chrome));
        assert!(config.headless);
        assert_eq!(config.page_load_timeout, 30);
        assert_eq!(config.script_timeout, 30);
        assert_eq!(config.implicit_wait, 10);
        assert!(config.driver_url.is_none());
    }
}