1use crate::{CrawlResult, Error, Result};
2use crate::extractor::ContentExtractor;
3use thirtyfour::prelude::*;
4use url::Url;
5use tokio::time::sleep;
6use std::time::Duration;
7use tracing::{info, warn, debug};
8use serde::{Serialize, Deserialize};
9
10pub struct BrowserEngine {
11 driver: Option<WebDriver>,
12 headless: bool,
13}
14
15#[derive(Debug, Serialize, Deserialize)]
16pub struct DynamicContent {
17 pub url: String,
18 pub main_content: String,
19 pub dropdown_contents: Vec<DropdownContent>,
20 pub filter_contents: Vec<FilterContent>,
21 pub has_infinite_scroll: bool,
22}
23
24#[derive(Debug, Serialize, Deserialize)]
25pub struct DropdownContent {
26 pub index: usize,
27 pub label: Option<String>,
28 pub content: String,
29}
30
31#[derive(Debug, Serialize, Deserialize)]
32pub struct FilterContent {
33 pub index: usize,
34 pub label: Option<String>,
35 pub content: String,
36}
37
38impl BrowserEngine {
39 pub async fn new() -> Result<Self> {
40 Ok(Self {
41 driver: None,
42 headless: true,
43 })
44 }
45
46 pub async fn new_with_options(headless: bool) -> Result<Self> {
47 Ok(Self {
48 driver: None,
49 headless,
50 })
51 }
52
53 pub async fn connect(&mut self) -> Result<()> {
54 let mut caps = DesiredCapabilities::chrome();
55
56 let chrome_args = if self.headless {
58 vec![
59 "--headless",
60 "--no-sandbox",
61 "--disable-dev-shm-usage",
62 "--disable-gpu",
63 "--window-size=1920,1080",
64 "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
65 ]
66 } else {
67 vec![
68 "--no-sandbox",
69 "--disable-dev-shm-usage",
70 "--window-size=1920,1080",
71 "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
72 ]
73 };
74
75 for arg in chrome_args {
76 caps.add_arg(arg).map_err(|e| Error::Browser(format!("Failed to add Chrome arg: {e}")))?;
77 }
78
79 match WebDriver::new("http://localhost:9515", caps).await {
80 Ok(driver) => {
81 self.driver = Some(driver);
82 Ok(())
83 }
84 Err(e) => Err(Error::Browser(format!("Failed to connect to browser: {e}"))),
85 }
86 }
87
88 pub async fn crawl_dynamic(&self, url: Url) -> Result<CrawlResult> {
89 let driver = self
90 .driver
91 .as_ref()
92 .ok_or_else(|| Error::Browser("Browser not connected".to_string()))?;
93
94 driver
95 .goto(url.as_str())
96 .await
97 .map_err(|e| Error::Browser(format!("Navigation failed: {e}")))?;
98
99 self.wait_for_page_ready(driver).await?;
101
102 let content = driver
103 .source()
104 .await
105 .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
106
107 let links = self.extract_links_js(driver, &url).await?;
108
109 let extractor = ContentExtractor::new();
111 let cleaned_content = Some(extractor.extract_clean_content(&content));
112
113 Ok(CrawlResult {
114 url: url.to_string(),
115 status_code: 200,
116 content,
117 cleaned_content,
118 headers: std::collections::HashMap::new(),
119 extracted_data: serde_json::json!({}),
120 links: links.into_iter().map(|u| u.to_string()).collect(),
121 crawled_at: chrono::Utc::now(),
122 })
123 }
124
125 pub async fn crawl_with_interactions(&self, url: Url) -> Result<DynamicContent> {
126 let driver = self
127 .driver
128 .as_ref()
129 .ok_or_else(|| Error::Browser("Browser not connected".to_string()))?;
130
131 info!("Crawling with interactions: {}", url);
132
133 driver
134 .goto(url.as_str())
135 .await
136 .map_err(|e| Error::Browser(format!("Navigation failed: {e}")))?;
137
138 self.wait_for_page_ready(driver).await?;
139
140 let has_infinite_scroll = self.detect_infinite_scroll(driver).await?;
142 if has_infinite_scroll {
143 info!("Detected infinite scroll, loading all content...");
144 self.handle_infinite_scroll(driver, 10).await?;
145 }
146
147 let main_content = driver
149 .source()
150 .await
151 .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
152
153 let dropdown_contents = self.interact_with_dropdowns(driver).await?;
155
156 let filter_contents = self.interact_with_filters(driver).await?;
158
159 Ok(DynamicContent {
160 url: url.to_string(),
161 main_content,
162 dropdown_contents,
163 filter_contents,
164 has_infinite_scroll,
165 })
166 }
167
168 async fn wait_for_page_ready(&self, driver: &WebDriver) -> Result<()> {
169 let script = r#"
170 return document.readyState === 'complete' &&
171 (typeof jQuery === 'undefined' || jQuery.active === 0) &&
172 (typeof angular === 'undefined' || !angular.element(document).injector() ||
173 angular.element(document).injector().get('$http').pendingRequests.length === 0);
174 "#;
175
176 let max_wait = Duration::from_secs(30);
177 let start = std::time::Instant::now();
178
179 loop {
180 if start.elapsed() > max_wait {
181 warn!("Page load timeout exceeded");
182 break;
183 }
184
185 match driver.execute(script, vec![]).await {
186 Ok(ret) => {
187 if let Some(ready) = ret.json().as_bool() {
188 if ready {
189 debug!("Page is ready");
190 break;
191 }
192 }
193 }
194 Err(_) => {
195 let basic_script = "return document.readyState === 'complete';";
197 if let Ok(ret) = driver.execute(basic_script, vec![]).await {
198 if let Some(ready) = ret.json().as_bool() {
199 if ready {
200 break;
201 }
202 }
203 }
204 }
205 }
206
207 sleep(Duration::from_millis(500)).await;
208 }
209
210 Ok(())
211 }
212
213 async fn detect_infinite_scroll(&self, driver: &WebDriver) -> Result<bool> {
214 let initial_height = driver
215 .execute("return document.body.scrollHeight;", vec![])
216 .await
217 .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?
218 .json()
219 .as_i64()
220 .unwrap_or(0);
221
222 driver.execute("window.scrollTo(0, document.body.scrollHeight);", vec![])
223 .await
224 .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?;
225 sleep(Duration::from_secs(2)).await;
226
227 let new_height = driver
228 .execute("return document.body.scrollHeight;", vec![])
229 .await
230 .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?
231 .json()
232 .as_i64()
233 .unwrap_or(0);
234
235 Ok(new_height > initial_height)
236 }
237
238 async fn handle_infinite_scroll(&self, driver: &WebDriver, max_scrolls: u32) -> Result<()> {
239 let mut last_height: i64 = 0;
240
241 for _ in 0..max_scrolls {
242 let current_height = driver
243 .execute("return document.body.scrollHeight;", vec![])
244 .await
245 .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?
246 .json()
247 .as_i64()
248 .unwrap_or(0);
249
250 if current_height == last_height {
251 break;
252 }
253
254 last_height = current_height;
255 driver.execute("window.scrollTo(0, document.body.scrollHeight);", vec![])
256 .await
257 .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?;
258 sleep(Duration::from_secs(2)).await;
259 }
260
261 Ok(())
262 }
263
264 async fn interact_with_dropdowns(&self, driver: &WebDriver) -> Result<Vec<DropdownContent>> {
265 let mut contents = Vec::new();
266
267 let selects = driver.find_all(By::Css("select")).await.unwrap_or_default();
269
270 for (idx, select) in selects.iter().enumerate() {
271 let label = select.attr("aria-label").await.ok().flatten();
273
274 if let Ok(options) = select.find_all(By::Css("option")).await {
276 for option in options.iter().skip(1) { if let Ok(_) = option.click().await {
278 sleep(Duration::from_millis(1000)).await;
279 self.wait_for_page_ready(driver).await?;
280
281 let content = driver.source().await
282 .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
283 contents.push(DropdownContent {
284 index: idx,
285 label: label.clone(),
286 content,
287 });
288 }
289 }
290 }
291 }
292
293 let custom_dropdowns = driver.find_all(By::Css("[role='combobox'], .dropdown, [data-toggle='dropdown']"))
295 .await.unwrap_or_default();
296
297 for (idx, dropdown) in custom_dropdowns.iter().enumerate() {
298 let label = dropdown.attr("aria-label").await.ok().flatten();
299
300 if let Ok(_) = dropdown.click().await {
301 sleep(Duration::from_millis(500)).await;
302
303 if let Ok(items) = driver.find_all(By::Css(".dropdown-item, [role='option'], li")).await {
305 for item in items.iter().take(5) { if let Ok(_) = item.click().await {
307 sleep(Duration::from_millis(1000)).await;
308 self.wait_for_page_ready(driver).await?;
309
310 let content = driver.source().await
311 .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
312 contents.push(DropdownContent {
313 index: selects.len() + idx,
314 label: label.clone(),
315 content,
316 });
317
318 dropdown.click().await.ok();
320 sleep(Duration::from_millis(500)).await;
321 }
322 }
323 }
324 }
325 }
326
327 info!("Extracted {} dropdown variations", contents.len());
328 Ok(contents)
329 }
330
331 async fn interact_with_filters(&self, driver: &WebDriver) -> Result<Vec<FilterContent>> {
332 let mut contents = Vec::new();
333
334 let filters = driver.find_all(By::Css("input[type='checkbox'], input[type='radio'], [role='checkbox'], [role='radio']"))
336 .await.unwrap_or_default();
337
338 for (idx, filter) in filters.iter().enumerate().take(10) { let label = filter.attr("aria-label").await.ok().flatten();
340
341 if let Ok(_) = filter.click().await {
342 sleep(Duration::from_millis(1000)).await;
343 self.wait_for_page_ready(driver).await?;
344
345 let content = driver.source().await
346 .map_err(|e| Error::Browser(format!("Failed to get page source: {e}")))?;
347 contents.push(FilterContent {
348 index: idx,
349 label,
350 content,
351 });
352 }
353 }
354
355 info!("Extracted {} filter variations", contents.len());
356 Ok(contents)
357 }
358
359 async fn extract_links_js(&self, driver: &WebDriver, _base_url: &Url) -> Result<Vec<Url>> {
360 let script = r#"
361 return Array.from(document.querySelectorAll('a[href]'))
362 .map(a => a.href)
363 .filter(href => href.startsWith('http'));
364 "#;
365
366 let links_value = driver
367 .execute(script, vec![])
368 .await
369 .map_err(|e| Error::Browser(format!("Script execution failed: {e}")))?;
370
371 let mut links = Vec::new();
372
373 let json_value = links_value.json();
375 if let Some(array) = json_value.as_array() {
376 for value in array {
377 if let Some(href) = value.as_str() {
378 if let Ok(url) = Url::parse(href) {
379 links.push(url);
380 }
381 }
382 }
383 }
384
385 Ok(links)
386 }
387
388 pub async fn disconnect(&mut self) -> Result<()> {
389 if let Some(driver) = self.driver.take() {
390 driver
391 .quit()
392 .await
393 .map_err(|e| Error::Browser(format!("Failed to quit browser: {e}")))?;
394 }
395 Ok(())
396 }
397}
398
399impl Drop for BrowserEngine {
400 fn drop(&mut self) {
401 if let Some(driver) = self.driver.take() {
402 tokio::task::spawn(async move {
403 let _ = driver.quit().await;
404 });
405 }
406 }
407}