Skip to main content

spider_core/
spider.rs

1//! # Spider Module
2//!
3//! Defines the core `Spider` trait and related components for implementing custom web scrapers.
4//!
5//! ## Overview
6//!
7//! The `Spider` trait is the primary interface for defining custom scraping logic.
8//! It specifies how to start a crawl (via start URLs) and how to process responses
9//! to extract data and discover new URLs to follow. This trait follows the Scrapy
10//! pattern of spiders that define the crawling behavior.
11//!
12//! ## Key Components
13//!
14//! - **Spider Trait**: The main trait for implementing custom scraping logic
15//! - **ParseOutput**: Container for returning scraped items and new requests
16//! - **Associated Types**: Define the item type that the spider produces
17//!
18//! ## Implementation
19//!
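//! Implementors typically define:
//! - `start_urls`: The initial URLs to begin the crawl (optional; defaults to an empty list)
//! - `parse`: Logic for extracting data and discovering new URLs from responses
//!   (required, unless the `stream` feature is enabled, in which case `parse_stream`
//!   is the required method and `parse` has a default implementation)
//! - `Item`: The type of data structure to store scraped information (required)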
24//!
25//! ## Example
26//!
27//! ```rust,ignore
28//! use spider_core::Spider;
//! use spider_util::{error::SpiderError, item::ParseOutput, response::Response};
30//! use async_trait::async_trait;
31//!
32//! #[spider_macro::scraped_item]
33//! struct Article {
34//!     title: String,
35//!     content: String,
36//! }
37//!
38//! struct ArticleSpider;
39//!
40//! #[async_trait]
41//! impl Spider for ArticleSpider {
42//!     type Item = Article;
43//!
44//!     fn start_urls(&self) -> Vec<&'static str> {
45//!         vec!["https://example.com/articles"]
46//!     }
47//!
48//!     async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
49//!         let mut output = ParseOutput::new();
50//!
51//!         // Extract articles from the page
52//!         // ... parsing logic ...
53//!
54//!         // Add discovered articles to output
55//!         // output.add_item(Article { title, content });
56//!
57//!         // Add new URLs to follow
58//!         // output.add_request(new_request);
59//!
60//!         Ok(output)
61//!     }
62//! }
63//! ```
64
65use spider_util::error::SpiderError;
66use spider_util::item::{ParseOutput, ScrapedItem};
67use spider_util::request::Request;
68use spider_util::response::Response;
69#[cfg(feature = "stream")]
70use spider_util::stream_response::StreamResponse;
/// Placeholder for the streaming response type, used when the `stream` feature is disabled.
///
/// It carries no data. It lets signatures that mention `StreamResponse`
/// (such as `Spider::parse_stream`) name a type even though the real
/// `spider_util::stream_response::StreamResponse` is not imported in this
/// configuration.
#[cfg(not(feature = "stream"))]
#[derive(Debug)]
pub struct StreamResponse;
73
74use anyhow::Result;
75use async_trait::async_trait;
76use url::Url;
77
78/// Defines the contract for a web spider.
79#[async_trait]
80pub trait Spider: Send + Sync + 'static {
81    /// The type of item that the spider scrapes.
82    type Item: ScrapedItem;
83
84    /// Returns the initial URLs to start crawling from.
85    fn start_urls(&self) -> Vec<&'static str> {
86        Vec::new()
87    }
88
89    /// Generates the initial requests to start crawling.
90    fn start_requests(&self) -> Result<Vec<Request>, SpiderError> {
91        let urls: Result<Vec<Url>, url::ParseError> =
92            self.start_urls().into_iter().map(Url::parse).collect();
93        Ok(urls?.into_iter().map(Request::new).collect())
94    }
95
96    /// Parses a response and extracts scraped items and new requests.
97    #[cfg(feature = "stream")]
98    async fn parse(&mut self, _response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
99        Ok(ParseOutput::new())
100    }
101
102    /// Parses a stream response and extracts scraped items and new requests.
103    /// This method is optional and only available when the 'stream' feature is enabled.
104    #[cfg(feature = "stream")]
105    async fn parse_stream(&mut self, response: StreamResponse) -> Result<ParseOutput<Self::Item>, SpiderError>;
106
107    /// Parses a response and extracts scraped items and new requests.
108    #[cfg(not(feature = "stream"))]
109    async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError>;
110
111    /// Parses a stream response and extracts scraped items and new requests.
112    /// This method is optional and only available when the 'stream' feature is enabled.
113    #[cfg(not(feature = "stream"))]
114    async fn parse_stream(&mut self, _response: StreamResponse) -> Result<ParseOutput<Self::Item>, SpiderError> {
115        Ok(ParseOutput::new())
116    }
117
118}