#[cfg(not(target_arch = "wasm32"))]
mod batch;
mod builder;
#[cfg(not(target_arch = "wasm32"))]
mod crawl_loop;
use std::sync::Arc;
use crate::error::CrawlError;
#[cfg(not(target_arch = "wasm32"))]
use crate::tower::CrawlRequest;
use crate::traits::*;
use crate::types::*;
pub use builder::CrawlEngineBuilder;
/// Engine type bundling the crawl configuration with its pluggable collaborators.
///
/// `Clone` is derived; every trait-object collaborator is held behind an `Arc`, so a clone
/// shares those instances with the original rather than duplicating them.
///
/// Several fields are not referenced by the methods in this file; presumably they are consumed
/// by the `batch` / `crawl_loop` modules — confirm there.
#[derive(Clone)]
// On wasm32 the methods in this file only read `config`, which is presumably why dead-code
// warnings are silenced for that target.
#[cfg_attr(target_arch = "wasm32", allow(dead_code))]
pub struct CrawlEngine {
/// Crawl configuration; validated at the start of `scrape` and `map` and passed to the fetchers.
pub(crate) config: CrawlConfig,
/// Shared `Frontier` implementation (not used in this file).
pub(crate) frontier: Arc<dyn Frontier>,
/// Shared `RateLimiter`; handed to `PerDomainRateLimitLayer` when the fetch service is built.
pub(crate) rate_limiter: Arc<dyn RateLimiter>,
/// Shared `CrawlStore` implementation (not used in this file).
pub(crate) store: Arc<dyn CrawlStore>,
/// Shared `EventEmitter` implementation (not used in this file).
pub(crate) event_emitter: Arc<dyn EventEmitter>,
/// Shared `CrawlStrategy` implementation (not used in this file).
pub(crate) strategy: Arc<dyn CrawlStrategy>,
/// Shared `ContentFilter` implementation (not used in this file).
pub(crate) content_filter: Arc<dyn ContentFilter>,
/// Shared `CrawlCache`; handed to `CrawlCacheLayer` when the fetch service is built.
pub(crate) cache: Arc<dyn CrawlCache>,
/// User-agent rotation layer cloned into every fetch service; only present on native targets.
#[cfg(not(target_arch = "wasm32"))]
ua_rotation: crate::tower::UaRotationLayer,
}
impl CrawlEngine {
/// Creates a fresh [`CrawlEngineBuilder`] via `CrawlEngineBuilder::new()`.
///
/// Marked `#[must_use]` because calling this and discarding the builder accomplishes nothing.
#[must_use = "the returned builder has no effect unless it is used"]
pub fn builder() -> CrawlEngineBuilder {
    CrawlEngineBuilder::new()
}
#[cfg(not(target_arch = "wasm32"))]
fn build_service(
&self,
client: &reqwest::Client,
) -> tower::util::BoxCloneService<CrawlRequest, crate::tower::CrawlResponse, CrawlError> {
use tower::ServiceBuilder;
let service = ServiceBuilder::new()
.layer(crate::tower::PerDomainRateLimitLayer::new(self.rate_limiter.clone()))
.layer(crate::tower::CrawlCacheLayer::new(self.cache.clone()))
.layer(self.ua_rotation.clone())
.service(crate::tower::HttpFetchService::new(client.clone(), self.config.clone()));
#[cfg(feature = "tracing")]
let service = tower::ServiceBuilder::new()
.layer(crate::tower::CrawlTracingLayer::new())
.service(service);
tower::util::BoxCloneService::new(service)
}
/// Fetches `url` and returns the response together with a flag telling whether it was obtained
/// through `crate::browser::browser_fetch` (`true`) or through the Tower HTTP stack (`false`).
///
/// Behaviour:
/// - With the `browser` feature and `BrowserMode::Always`, the browser is used directly.
/// - Otherwise a client is built from the configuration and the request is sent through the
///   service produced by `build_service`.
/// - A `CrawlError::NotFound` is turned into an empty response with status 404 when
///   `respect_robots_txt` is enabled.
/// - With the `browser` feature and `BrowserMode::Auto`, a `CrawlError::WafBlocked` triggers a
///   retry through the browser.
///
/// Responses converted from a browser fetch carry an empty header map.
///
/// # Errors
/// Returns a [`CrawlError`] when the client cannot be built, when the service reports an error
/// that is not handled above, or when a browser fetch fails.
#[cfg(not(target_arch = "wasm32"))]
async fn fetch_response(&self, url: &str) -> Result<(crate::tower::CrawlResponse, bool), CrawlError> {
    use crate::tower::CrawlResponse;
    // `oneshot` waits for `poll_ready` before invoking `call`, as the Tower `Service` contract
    // requires; calling `call` on a service that was never polled for readiness is allowed to
    // panic and bypasses any readiness-based back-pressure in the layers.
    use tower::ServiceExt;

    /// Converts a browser fetch result into the response type used by the Tower stack.
    #[cfg(feature = "browser")]
    fn browser_http_to_crawl(r: crate::http::HttpResponse) -> CrawlResponse {
        CrawlResponse {
            status: r.status,
            content_type: r.content_type,
            body: r.body,
            body_bytes: r.body_bytes,
            headers: std::collections::HashMap::new(),
        }
    }

    #[cfg(feature = "browser")]
    if self.config.browser.mode == crate::types::BrowserMode::Always {
        let pool = self.config.browser_pool.as_deref();
        let http_resp = crate::browser::browser_fetch(url, &self.config, None, pool).await?;
        return Ok((browser_http_to_crawl(http_resp), true));
    }

    let client = crate::http::build_client(&self.config)?;
    let service = self.build_service(&client);
    match service.oneshot(CrawlRequest::new(url)).await {
        Ok(resp) => Ok((resp, false)),
        // Surface "not found" as a regular 404 response so the caller can short-circuit on it.
        Err(CrawlError::NotFound(_)) if self.config.respect_robots_txt => Ok((
            CrawlResponse {
                status: 404,
                content_type: String::new(),
                body: String::new(),
                body_bytes: Vec::new(),
                headers: std::collections::HashMap::new(),
            },
            false,
        )),
        #[cfg(feature = "browser")]
        Err(CrawlError::WafBlocked(_)) if self.config.browser.mode == crate::types::BrowserMode::Auto => {
            let pool = self.config.browser_pool.as_deref();
            let http_resp = crate::browser::browser_fetch(url, &self.config, None, pool).await?;
            Ok((browser_http_to_crawl(http_resp), true))
        }
        Err(e) => Err(e),
    }
}
/// Fetches a single page and extracts it into a [`ScrapeResult`].
///
/// Steps:
/// 1. The configuration is validated.
/// 2. On native targets the page is obtained through `fetch_response`; when the status is 404
///    and `respect_robots_txt` is enabled, an empty result with `status_code: 404` is returned
///    without running extraction. On `wasm32` the page is fetched directly with
///    `crate::http::fetch_with_retry` and an empty header map.
/// 3. The response is passed to `crate::scrape::scrape_from_crawl_response`.
/// 4. On native targets with the `browser` feature and `BrowserMode::Auto`, a result that sets
///    `js_render_hint` and was not fetched via the browser is fetched again through the browser
///    and re-extracted.
///
/// `browser_used` on the returned value reflects whether the browser produced the response
/// that the result is based on.
///
/// # Errors
/// Returns a [`CrawlError`] when validation, any fetch, or extraction fails.
pub async fn scrape(&self, url: &str) -> Result<ScrapeResult, CrawlError> {
    self.config.validate()?;

    #[cfg(not(target_arch = "wasm32"))]
    let (response, browser_used_for_fetch) = {
        let (resp, used_browser) = self.fetch_response(url).await?;
        if resp.status == 404 && self.config.respect_robots_txt {
            return Ok(ScrapeResult {
                status_code: 404,
                content_type: String::new(),
                html: String::new(),
                body_size: 0,
                metadata: PageMetadata::default(),
                links: Vec::new(),
                images: Vec::new(),
                feeds: Vec::new(),
                json_ld: Vec::new(),
                is_allowed: true,
                crawl_delay: None,
                noindex_detected: false,
                nofollow_detected: false,
                x_robots_tag: None,
                is_pdf: false,
                was_skipped: false,
                detected_charset: None,
                main_content_only: self.config.main_content_only,
                auth_header_sent: self.config.auth.is_some(),
                response_meta: None,
                assets: Vec::new(),
                js_render_hint: false,
                // Report the real fetch path: a 404 that came back from the browser must not
                // claim that no browser was involved.
                browser_used: used_browser,
                markdown: None,
                extracted_data: None,
                extraction_meta: None,
                screenshot: None,
                downloaded_document: None,
            });
        }
        (resp, used_browser)
    };

    #[cfg(target_arch = "wasm32")]
    let (response, browser_used_for_fetch) = {
        let client = crate::http::build_client(&self.config)?;
        let resp =
            crate::http::fetch_with_retry(url, &self.config, &std::collections::HashMap::new(), &client).await?;
        let headers = std::collections::HashMap::new();
        let crawl_resp = crate::tower::CrawlResponse {
            status: resp.status,
            content_type: resp.content_type,
            body: resp.body,
            body_bytes: resp.body_bytes,
            headers,
        };
        (crawl_resp, false)
    };

    let mut result = crate::scrape::scrape_from_crawl_response(url, &response, &self.config).await?;
    result.browser_used = browser_used_for_fetch;

    // Second chance for pages flagged as needing JavaScript rendering: fetch again through the
    // browser and redo the extraction on that response.
    #[cfg(all(not(target_arch = "wasm32"), feature = "browser"))]
    if result.js_render_hint && !result.browser_used && self.config.browser.mode == crate::types::BrowserMode::Auto
    {
        let pool = self.config.browser_pool.as_deref();
        let http_resp = crate::browser::browser_fetch(url, &self.config, None, pool).await?;
        let crawl_resp = crate::tower::CrawlResponse {
            status: http_resp.status,
            content_type: http_resp.content_type,
            body: http_resp.body,
            body_bytes: http_resp.body_bytes,
            headers: std::collections::HashMap::new(),
        };
        result = crate::scrape::scrape_from_crawl_response(url, &crawl_resp, &self.config).await?;
        result.browser_used = true;
    }

    Ok(result)
}
/// Validates the configuration, then delegates to `crate::map::map` for `url`.
///
/// # Errors
/// Returns a [`CrawlError`] when validation fails or when `crate::map::map` reports an error.
pub async fn map(&self, url: &str) -> Result<MapResult, CrawlError> {
    let config = &self.config;
    config.validate()?;
    crate::map::map(url, config).await
}
}
#[cfg(target_arch = "wasm32")]
impl CrawlEngine {
/// wasm32 variant of `crawl`: scrapes `url` once and wraps the outcome in a [`CrawlResult`]
/// holding exactly one page at depth 0.
///
/// # Errors
/// Propagates any [`CrawlError`] returned by `scrape`.
pub async fn crawl(&self, url: &str) -> Result<CrawlResult, CrawlError> {
    let scraped = self.scrape(url).await?;

    let only_page = CrawlPageResult {
        // Values derived from the requested URL.
        url: url.to_owned(),
        normalized_url: crate::normalize::normalize_url(url),
        // Fixed values for this single-page result.
        depth: 0,
        stayed_on_domain: true,
        // Everything else is moved over from the scrape result unchanged.
        status_code: scraped.status_code,
        content_type: scraped.content_type,
        html: scraped.html,
        body_size: scraped.body_size,
        metadata: scraped.metadata,
        links: scraped.links,
        images: scraped.images,
        feeds: scraped.feeds,
        json_ld: scraped.json_ld,
        was_skipped: scraped.was_skipped,
        is_pdf: scraped.is_pdf,
        detected_charset: scraped.detected_charset,
        markdown: scraped.markdown,
        extracted_data: scraped.extracted_data,
        extraction_meta: scraped.extraction_meta,
        downloaded_document: scraped.downloaded_document,
    };

    let pages = vec![only_page];
    let crawl_result = CrawlResult::new(
        pages,
        url.to_owned(),
        0,
        false,
        None,
        Vec::new(),
        vec![crate::normalize::normalize_url(url)],
    );
    Ok(crawl_result)
}
/// wasm32 variant of `batch_scrape`: scrapes each URL one after another, in input order.
///
/// Returns one `(url, outcome)` pair per input; a failure for one URL is recorded in its pair
/// and does not stop the remaining URLs from being processed.
pub async fn batch_scrape(&self, urls: &[&str]) -> Vec<(String, Result<ScrapeResult, CrawlError>)> {
    let mut outcomes = Vec::with_capacity(urls.len());
    for &target in urls {
        let outcome = self.scrape(target).await;
        outcomes.push((target.to_owned(), outcome));
    }
    outcomes
}
/// wasm32 variant of `batch_crawl`: crawls each URL one after another, in input order.
///
/// Returns one `(url, outcome)` pair per input; a failure for one URL is recorded in its pair
/// and does not stop the remaining URLs from being processed.
pub async fn batch_crawl(&self, urls: &[&str]) -> Vec<(String, Result<CrawlResult, CrawlError>)> {
    let mut outcomes = Vec::with_capacity(urls.len());
    for &target in urls {
        let outcome = self.crawl(target).await;
        outcomes.push((target.to_owned(), outcome));
    }
    outcomes
}
}