// spider_lib/spider.rs
//! Trait for defining custom web spiders in the `spider-lib` framework.
//!
//! This module provides the `Spider` trait, which serves as the blueprint
//! for creating custom web scrapers. A spider defines how a specific website
//! (or a group of websites) should be crawled and how data should be extracted.
//!
//! Implementors of the `Spider` trait must:
//! - Specify the `Item` type (the data structure for scraped data).
//! - Provide a list of `start_urls` or `start_requests` to begin the crawl.
//! - Implement the `parse` method, which takes a `Response` and returns
//!   `ParseOutput` containing new `Request`s to follow and `ScrapedItem`s.

use crate::error::SpiderError;
use crate::item::{ParseOutput, ScrapedItem};
use crate::request::Request;
use crate::response::Response;
use anyhow::Result;
use async_trait::async_trait;
use url::Url;

21/// Defines the contract for a web spider.
22#[async_trait]
23pub trait Spider: Send + Sync + 'static {
24    /// The type of item that the spider scrapes.
25    type Item: ScrapedItem;
26
27    /// Returns the initial URLs to start crawling from.
28    fn start_urls(&self) -> Vec<&'static str> {
29        Vec::new()
30    }
31
32    /// Generates the initial requests to start crawling.
33    fn start_requests(&self) -> Result<Vec<Request>, SpiderError> {
34        let urls: Result<Vec<Url>, url::ParseError> =
35            self.start_urls().into_iter().map(Url::parse).collect();
36        Ok(urls?.into_iter().map(Request::new).collect())
37    }
38
39    /// Parses a response and extracts scraped items and new requests.
40    async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError>;
41}