spider_core/spider.rs
1//! # Spider Module
2//!
3//! Defines the core `Spider` trait and related components for implementing custom web scrapers.
4//!
5//! ## Overview
6//!
7//! The `Spider` trait is the primary interface for defining custom scraping logic.
8//! It specifies how to start a crawl (via start URLs) and how to process responses
9//! to extract data and discover new URLs to follow. This trait follows the Scrapy
10//! pattern of spiders that define the crawling behavior.
11//!
12//! ## Key Components
13//!
14//! - **Spider Trait**: The main trait for implementing custom scraping logic
15//! - **ParseOutput**: Container for returning scraped items and new requests
16//! - **Associated Types**: Define the item type that the spider produces
17//!
18//! ## Implementation
19//!
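//! Implementors typically define:
//! - `start_urls`: The initial URLs to begin the crawl (defaults to an empty list; override `start_requests` instead to build the initial requests directly)
//! - `parse`: Logic for extracting data and discovering new URLs from responses (required unless the `stream` feature is enabled, in which case `parse_stream` is the required method)
//! - `Item`: The type of data structure to store scraped information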
24//!
25//! ## Example
26//!
27//! ```rust,ignore
28//! use spider_core::Spider;
//! use spider_util::{response::Response, error::SpiderError, item::ParseOutput};
30//! use async_trait::async_trait;
31//!
32//! #[spider_macro::scraped_item]
33//! struct Article {
34//! title: String,
35//! content: String,
36//! }
37//!
38//! struct ArticleSpider;
39//!
40//! #[async_trait]
41//! impl Spider for ArticleSpider {
42//! type Item = Article;
43//!
44//! fn start_urls(&self) -> Vec<&'static str> {
45//! vec!["https://example.com/articles"]
46//! }
47//!
48//! async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
49//! let mut output = ParseOutput::new();
50//!
51//! // Extract articles from the page
52//! // ... parsing logic ...
53//!
54//! // Add discovered articles to output
55//! // output.add_item(Article { title, content });
56//!
57//! // Add new URLs to follow
58//! // output.add_request(new_request);
59//!
60//! Ok(output)
61//! }
62//! }
63//! ```
64
65use spider_util::error::SpiderError;
66use spider_util::item::{ParseOutput, ScrapedItem};
67use spider_util::request::Request;
68use spider_util::response::Response;
69#[cfg(feature = "stream")]
70use spider_util::stream_response::StreamResponse;
/// Stand-in for the streaming response type when the `stream` feature is disabled.
///
/// With the feature off, `spider_util::stream_response::StreamResponse` is not
/// imported, so this unit struct keeps the `Spider::parse_stream` signature
/// compilable. It is a unit struct and therefore holds no response data.
#[cfg(not(feature = "stream"))]
#[derive(Debug)]
pub struct StreamResponse;
73
74use anyhow::Result;
75use async_trait::async_trait;
76use url::Url;
77
78/// Defines the contract for a web spider.
79#[async_trait]
80pub trait Spider: Send + Sync + 'static {
81 /// The type of item that the spider scrapes.
82 type Item: ScrapedItem;
83
84 /// Returns the initial URLs to start crawling from.
85 fn start_urls(&self) -> Vec<&'static str> {
86 Vec::new()
87 }
88
89 /// Generates the initial requests to start crawling.
90 fn start_requests(&self) -> Result<Vec<Request>, SpiderError> {
91 let urls: Result<Vec<Url>, url::ParseError> =
92 self.start_urls().into_iter().map(Url::parse).collect();
93 Ok(urls?.into_iter().map(Request::new).collect())
94 }
95
96 /// Parses a response and extracts scraped items and new requests.
97 #[cfg(feature = "stream")]
98 async fn parse(&mut self, _response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
99 Ok(ParseOutput::new())
100 }
101
102 /// Parses a stream response and extracts scraped items and new requests.
103 /// This method is optional and only available when the 'stream' feature is enabled.
104 #[cfg(feature = "stream")]
105 async fn parse_stream(&mut self, response: StreamResponse) -> Result<ParseOutput<Self::Item>, SpiderError>;
106
107 /// Parses a response and extracts scraped items and new requests.
108 #[cfg(not(feature = "stream"))]
109 async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError>;
110
111 /// Parses a stream response and extracts scraped items and new requests.
112 /// This method is optional and only available when the 'stream' feature is enabled.
113 #[cfg(not(feature = "stream"))]
114 async fn parse_stream(&mut self, _response: StreamResponse) -> Result<ParseOutput<Self::Item>, SpiderError> {
115 Ok(ParseOutput::new())
116 }
117
118}