#[allow(unused_imports)]
pub use crate::telemetry::{current_traceparent, with_traceparent};
use crate::engine::CrawlEngine;
use crate::error::CrawlError;
use crate::interact::PageAction;
#[cfg(not(target_arch = "wasm32"))]
use crate::types::{BatchCrawlStreamRequest, CrawlEvent, CrawlStreamRequest};
use crate::types::{CrawlConfig, CrawlResult, InteractionResult, MapResult, ScrapeResult};
#[cfg(not(target_arch = "wasm32"))]
use futures::future::BoxFuture;
#[cfg(not(target_arch = "wasm32"))]
use futures::stream::{BoxStream, StreamExt};
use serde::{Deserialize, Serialize};
/// Cloneable handle around a [`CrawlEngine`].
///
/// The free functions in this file (`scrape`, `crawl`, `batch_scrape`, ...)
/// take a reference to this handle and forward the call to the wrapped engine.
#[derive(Clone)]
pub struct CrawlEngineHandle {
// The wrapped engine; private, so handles are obtained through
// `from_engine` or `create_engine`.
inner: CrawlEngine,
}
impl CrawlEngineHandle {
#[cfg_attr(alef, alef(skip))]
pub fn from_engine(engine: CrawlEngine) -> Self {
Self { inner: engine }
}
}
#[cfg(not(target_arch = "wasm32"))]
impl CrawlEngineHandle {
    /// Starts a streaming crawl of `req.url`.
    ///
    /// Returns a boxed `'static` future that resolves to a boxed stream of
    /// crawl events. The engine is cloned up front so the future owns
    /// everything it needs and does not borrow `self`.
    ///
    /// This wrapper itself never produces an `Err`: the future resolves to
    /// `Ok(stream)` and every event emitted by the engine is wrapped in `Ok`.
    pub fn crawl_stream(
        &self,
        req: CrawlStreamRequest,
    ) -> BoxFuture<'static, Result<BoxStream<'static, Result<CrawlEvent, CrawlError>>, CrawlError>> {
        let engine = self.inner.clone();
        Box::pin(async move {
            let events = engine.crawl_stream(&req.url);
            // Lift each plain event into the `Result` item type of the public stream.
            let fallible = events.map(Result::<CrawlEvent, CrawlError>::Ok);
            Ok(fallible.boxed())
        })
    }

    /// Starts a streaming crawl over every URL in `req.urls`.
    ///
    /// Same shape as [`CrawlEngineHandle::crawl_stream`]: the returned future
    /// owns a clone of the engine and resolves to a boxed stream whose items
    /// are the engine's events, each wrapped in `Ok`.
    pub fn batch_crawl_stream(
        &self,
        req: BatchCrawlStreamRequest,
    ) -> BoxFuture<'static, Result<BoxStream<'static, Result<CrawlEvent, CrawlError>>, CrawlError>> {
        let engine = self.inner.clone();
        Box::pin(async move {
            // The engine is handed borrowed string slices, so view the owned URLs as `&str`.
            let targets = req.urls.iter().map(String::as_str).collect::<Vec<&str>>();
            let events = engine.batch_crawl_stream(&targets);
            let fallible = events.map(Result::<CrawlEvent, CrawlError>::Ok);
            Ok(fallible.boxed())
        })
    }
}
/// Outcome of scraping a single URL as part of a batch.
///
/// As populated by `batch_scrape`, exactly one of `result` / `error` is set.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct BatchScrapeResult {
/// URL this entry belongs to, as reported by the engine.
pub url: String,
/// Scrape output; `batch_scrape` sets this when the scrape succeeded.
pub result: Option<ScrapeResult>,
/// Error text; `batch_scrape` sets this (from the error's `to_string()`) when the scrape failed.
pub error: Option<String>,
}
/// Outcome of crawling a single URL as part of a batch.
///
/// As populated by `batch_crawl`, exactly one of `result` / `error` is set.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct BatchCrawlResult {
/// URL this entry belongs to, as reported by the engine.
pub url: String,
/// Crawl output; `batch_crawl` sets this only when the crawl succeeded and
/// the crawl result itself carries no error.
pub result: Option<CrawlResult>,
/// Error text; `batch_crawl` sets this when the crawl failed or when the
/// returned crawl result carries an error.
pub error: Option<String>,
}
/// Per-URL scrape outcomes for a batch, together with summary counters.
///
/// The counters described below are what the `From<Vec<BatchScrapeResult>>`
/// conversion computes; the fields are public, so values built by other means
/// are not guaranteed to be consistent.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct BatchScrapeResults {
/// One entry per scraped URL.
pub results: Vec<BatchScrapeResult>,
/// Number of entries in `results`.
pub total_count: usize,
/// Number of entries without an error (`total_count - failed_count`).
pub completed_count: usize,
/// Number of entries whose `error` is set.
pub failed_count: usize,
}
impl From<Vec<BatchScrapeResult>> for BatchScrapeResults {
    /// Builds the batch summary from per-URL entries.
    ///
    /// An entry counts as failed when its `error` is `Some` and as completed
    /// otherwise; `total_count` is the number of entries.
    fn from(results: Vec<BatchScrapeResult>) -> Self {
        let total_count = results.len();
        // Count the successes directly; every remaining entry carries an error.
        let completed_count = results.iter().filter(|entry| entry.error.is_none()).count();
        Self {
            failed_count: total_count - completed_count,
            completed_count,
            total_count,
            results,
        }
    }
}
/// Per-URL crawl outcomes for a batch, together with summary counters.
///
/// The counters described below are what the `From<Vec<BatchCrawlResult>>`
/// conversion computes; the fields are public, so values built by other means
/// are not guaranteed to be consistent.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct BatchCrawlResults {
/// One entry per crawled URL.
pub results: Vec<BatchCrawlResult>,
/// Number of entries in `results`.
pub total_count: usize,
/// Number of entries without an error (`total_count - failed_count`).
pub completed_count: usize,
/// Number of entries whose `error` is set.
pub failed_count: usize,
}
impl From<Vec<BatchCrawlResult>> for BatchCrawlResults {
    /// Builds the batch summary from per-URL entries.
    ///
    /// An entry counts as failed when its `error` is `Some` and as completed
    /// otherwise; `total_count` is the number of entries.
    fn from(results: Vec<BatchCrawlResult>) -> Self {
        let total_count = results.len();
        // Count the successes directly; every remaining entry carries an error.
        let completed_count = results.iter().filter(|entry| entry.error.is_none()).count();
        Self {
            failed_count: total_count - completed_count,
            completed_count,
            total_count,
            results,
        }
    }
}
pub fn create_engine(config: Option<CrawlConfig>) -> Result<CrawlEngineHandle, CrawlError> {
let mut builder = CrawlEngine::builder();
if let Some(config) = config {
config.validate()?;
builder = builder.config(config);
}
let engine = builder.build()?;
Ok(CrawlEngineHandle { inner: engine })
}
pub async fn scrape(engine: &CrawlEngineHandle, url: &str) -> Result<ScrapeResult, CrawlError> {
engine.inner.scrape(url).await
}
pub async fn crawl(engine: &CrawlEngineHandle, url: &str) -> Result<CrawlResult, CrawlError> {
engine.inner.crawl(url).await
}
pub async fn map_urls(engine: &CrawlEngineHandle, url: &str) -> Result<MapResult, CrawlError> {
engine.inner.map(url).await
}
pub async fn interact(
engine: &CrawlEngineHandle,
url: &str,
actions: Vec<PageAction>,
) -> Result<InteractionResult, CrawlError> {
engine.inner.interact(url, &actions).await
}
pub async fn batch_scrape(engine: &CrawlEngineHandle, urls: Vec<String>) -> Result<BatchScrapeResults, CrawlError> {
if urls.is_empty() {
return Err(CrawlError::InvalidConfig("batch_urls must not be empty".into()));
}
let url_refs: Vec<&str> = urls.iter().map(String::as_str).collect();
let results = engine.inner.batch_scrape(&url_refs).await;
let per_url: Vec<BatchScrapeResult> = results
.into_iter()
.map(|(url, result)| match result {
Ok(r) => BatchScrapeResult {
url,
result: Some(r),
error: None,
},
Err(e) => BatchScrapeResult {
url,
result: None,
error: Some(e.to_string()),
},
})
.collect();
Ok(BatchScrapeResults::from(per_url))
}
/// Free-function form of [`CrawlEngineHandle::crawl_stream`].
///
/// Copies `url` into a [`CrawlStreamRequest`] and awaits the handle's
/// streaming method, returning its boxed event stream.
#[cfg(not(target_arch = "wasm32"))]
pub async fn crawl_stream(
    engine: &CrawlEngineHandle,
    url: &str,
) -> Result<BoxStream<'static, Result<CrawlEvent, CrawlError>>, CrawlError> {
    let request = CrawlStreamRequest { url: url.to_owned() };
    engine.crawl_stream(request).await
}
/// Free-function form of [`CrawlEngineHandle::batch_crawl_stream`].
///
/// Moves `urls` into a [`BatchCrawlStreamRequest`] and awaits the handle's
/// streaming method, returning its boxed event stream.
#[cfg(not(target_arch = "wasm32"))]
pub async fn batch_crawl_stream(
    engine: &CrawlEngineHandle,
    urls: Vec<String>,
) -> Result<BoxStream<'static, Result<CrawlEvent, CrawlError>>, CrawlError> {
    let request = BatchCrawlStreamRequest { urls };
    engine.batch_crawl_stream(request).await
}
pub async fn batch_crawl(engine: &CrawlEngineHandle, urls: Vec<String>) -> Result<BatchCrawlResults, CrawlError> {
if urls.is_empty() {
return Err(CrawlError::InvalidConfig("batch_urls must not be empty".into()));
}
let url_refs: Vec<&str> = urls.iter().map(String::as_str).collect();
let results = engine.inner.batch_crawl(&url_refs).await;
let per_url: Vec<BatchCrawlResult> = results
.into_iter()
.map(|(url, result)| match result {
Ok(r) => {
if let Some(ref err) = r.error {
BatchCrawlResult {
url,
result: None,
error: Some(err.clone()),
}
} else {
BatchCrawlResult {
url,
result: Some(r),
error: None,
}
}
}
Err(e) => BatchCrawlResult {
url,
result: None,
error: Some(e.to_string()),
},
})
.collect();
Ok(BatchCrawlResults::from(per_url))
}