use crate::{
engine::CrawlStats,
error::{ErrorPolicy, KumoError},
extract::Response,
request::CrawlRequest,
};
/// What a spider hands back after parsing a response: the scraped items
/// together with any follow-up requests.
///
/// Values are assembled fluently, e.g. `Output::new().item(x).follow(url)`.
pub struct Output<T>
where
    T: serde::Serialize,
{
    /// Scraped items, filled by the `item` / `items` builder methods.
    pub(crate) items: Vec<T>,
    /// Follow-up requests, filled by `follow`, `follow_many`, `request`
    /// and `requests`.
    pub follow: Vec<CrawlRequest>,
}
impl<T: serde::Serialize> Output<T> {
    /// Creates an empty output with no items and no follow-up requests.
    #[must_use]
    pub fn new() -> Self {
        Self {
            items: Vec::new(),
            follow: Vec::new(),
        }
    }

    /// Appends a single scraped item and returns the updated output.
    ///
    /// The method consumes `self`; ignoring the return value discards the
    /// whole output, hence `#[must_use]`.
    #[must_use]
    pub fn item(mut self, item: T) -> Self {
        self.items.push(item);
        self
    }

    /// Appends every element of `items`, preserving their order, and returns
    /// the updated output.
    #[must_use]
    pub fn items(mut self, items: Vec<T>) -> Self {
        self.items.extend(items);
        self
    }

    /// Appends a follow-up request built from `url` via `CrawlRequest::get`
    /// and returns the updated output.
    #[must_use]
    pub fn follow(mut self, url: impl Into<String>) -> Self {
        self.follow.push(CrawlRequest::get(url));
        self
    }

    /// Appends one `CrawlRequest::get` request per URL in `urls`, in order,
    /// and returns the updated output.
    #[must_use]
    pub fn follow_many(mut self, urls: Vec<String>) -> Self {
        self.follow.extend(urls.into_iter().map(CrawlRequest::get));
        self
    }

    /// Appends an already-constructed request and returns the updated output.
    #[must_use]
    pub fn request(mut self, request: CrawlRequest) -> Self {
        self.follow.push(request);
        self
    }

    /// Appends every already-constructed request in `requests`, in order, and
    /// returns the updated output.
    #[must_use]
    pub fn requests(mut self, requests: Vec<CrawlRequest>) -> Self {
        self.follow.extend(requests);
        self
    }
}
impl<T: serde::Serialize> Default for Output<T> {
fn default() -> Self {
Self::new()
}
}
/// A crawl definition: where to start, how to turn a response into items and
/// follow-up requests, and optional hooks with default implementations.
///
/// Only `name`, `start_urls` and `parse` must be provided by implementors.
#[async_trait::async_trait]
pub trait Spider: Send + Sync {
    /// Type of the items this spider yields from `parse`.
    type Item: serde::Serialize + Send;

    /// Name identifying this spider.
    fn name(&self) -> &str;

    /// URLs the crawl starts from.
    fn start_urls(&self) -> Vec<String>;

    /// Turns a response into an [`Output`] of items and follow-up requests.
    ///
    /// # Errors
    ///
    /// Returns a [`KumoError`] when the implementor fails to parse the response.
    async fn parse(&self, response: &Response) -> Result<Output<Self::Item>, KumoError>;

    /// Chooses the policy to apply for an error associated with `_url`.
    ///
    /// The default ignores both arguments and returns [`ErrorPolicy::Skip`].
    fn on_error(&self, _url: &str, _err: &KumoError) -> ErrorPolicy {
        ErrorPolicy::Skip
    }

    /// Optional depth limit; the default is `None`.
    // NOTE(review): presumably `None` means "no limit" — confirm against the engine.
    fn max_depth(&self) -> Option<usize> {
        None
    }

    /// Domains associated with this spider; the default is an empty list.
    // NOTE(review): presumably an empty list means "no domain filtering" — confirm against the engine.
    fn allowed_domains(&self) -> Vec<&str> {
        Vec::new()
    }

    /// Lifecycle hook; the default does nothing and returns `Ok(())`.
    // NOTE(review): presumably invoked before crawling starts — confirm against the engine.
    async fn open(&self) -> Result<(), KumoError> {
        Ok(())
    }

    /// Lifecycle hook receiving the crawl statistics; the default ignores
    /// them and returns `Ok(())`.
    // NOTE(review): presumably invoked once the crawl has finished — confirm against the engine.
    async fn close(&self, _stats: &CrawlStats) -> Result<(), KumoError> {
        Ok(())
    }
}