// spider_core/spider.rs
1//! The spider trait and request bootstrap types.
2//!
3//! [`Spider`] is the main contract every crawler implements. It defines how
4//! a crawl starts and how each downloaded response turns into scraped items and
5//! follow-up requests.
6//!
7//! ## Example
8//!
//! ```rust,ignore
//! use spider_core::{Spider, StartRequests};
//! use spider_util::request::Request;
//! use spider_util::{response::Response, error::SpiderError, item::ParseOutput};
//! use async_trait::async_trait;
//!
//! #[spider_macro::scraped_item]
//! struct Article {
//!     title: String,
//!     content: String,
//! }
//!
//! // State for tracking page count
//! use std::sync::Arc;
//! use std::sync::atomic::{AtomicUsize, Ordering};
//! use dashmap::DashMap;
//!
//! #[derive(Clone, Default)]
//! struct ArticleSpiderState {
//!     page_count: Arc<AtomicUsize>,
//!     visited_urls: Arc<DashMap<String, bool>>,
//! }
//!
//! impl ArticleSpiderState {
//!     fn increment_page_count(&self) {
//!         self.page_count.fetch_add(1, Ordering::SeqCst);
//!     }
//!
//!     fn mark_url_visited(&self, url: String) {
//!         self.visited_urls.insert(url, true);
//!     }
//! }
//!
//! struct ArticleSpider;
//!
//! #[async_trait]
//! impl Spider for ArticleSpider {
//!     type Item = Article;
//!     type State = ArticleSpiderState;
//!
//!     fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
//!         let req = Request::new("https://example.com/articles".parse()?);
//!         Ok(StartRequests::Iter(Box::new(std::iter::once(Ok(req)))))
//!     }
//!
//!     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
//!         // Update state - can be done concurrently without blocking the spider
//!         state.increment_page_count();
//!         state.mark_url_visited(response.url.to_string());
//!
//!         let mut output = ParseOutput::new();
//!
//!         // Extract articles from the page
//!         // ... parsing logic ...
//!
//!         // Add discovered articles to output
//!         // output.add_item(Article { title, content });
//!
//!         // Add new URLs to follow
//!         // output.add_request(new_request);
//!
//!         Ok(output)
//!     }
//! }
//! ```
73
74use spider_util::error::SpiderError;
75use spider_util::item::{ParseOutput, ScrapedItem};
76use spider_util::request::Request;
77use spider_util::response::Response;
78
79use anyhow::Result;
80use async_trait::async_trait;
81use std::fs::File;
82use std::io::{BufRead, BufReader};
83use std::path::Path;
84use url::Url;
85
/// A boxed iterator of start requests.
///
/// Each item is either a ready-to-schedule [`Request`] or the [`SpiderError`]
/// produced while constructing it (for example, an invalid seed URL).
pub type StartRequestIter<'a> = Box<dyn Iterator<Item = Result<Request, SpiderError>> + Send + 'a>;
88
/// Initial request source returned by [`Spider::start_requests`].
///
/// Use [`StartRequests::Urls`] for simple static seeds, [`StartRequests::Iter`]
/// when you need to construct full [`Request`] values or generate seeds
/// lazily, and [`StartRequests::File`] when you want to keep large seed lists
/// outside compiled code.
pub enum StartRequests<'a> {
    /// Fixed list of seed URLs.
    ///
    /// Each string is parsed into a URL lazily as the resolved iterator is
    /// consumed; invalid strings surface as error items rather than aborting
    /// resolution.
    Urls(Vec<&'a str>),
    /// Direct request iterator supplied by the spider.
    Iter(StartRequestIter<'a>),
    /// Path to a plain-text seed file (one URL per line).
    ///
    /// Empty lines and lines starting with `#` are skipped when the file is
    /// read.
    File(&'a str),
}
103
104impl<'a> StartRequests<'a> {
105 /// Creates a file-based source from a path string.
106 ///
107 /// The file is expected to contain one URL per line. Empty lines and lines
108 /// starting with `#` are ignored.
109 pub fn file(path: &'a str) -> Self {
110 StartRequests::File(path)
111 }
112
113 /// Resolves this source into a concrete request iterator.
114 #[allow(clippy::should_implement_trait)]
115 ///
116 /// URL strings are parsed eagerly as the iterator is consumed. Invalid file
117 /// entries become `SpiderError::ConfigurationError` items that preserve the
118 /// original line number.
119 pub fn into_iter(self) -> Result<StartRequestIter<'a>, SpiderError> {
120 match self {
121 StartRequests::Urls(urls) => {
122 let requests = urls
123 .into_iter()
124 .map(|u| Url::parse(u).map(Request::new).map_err(SpiderError::from));
125 Ok(Box::new(requests))
126 }
127 StartRequests::Iter(iter) => Ok(iter),
128 StartRequests::File(path) => start_requests_from_file(path),
129 }
130 }
131}
132
133fn start_requests_from_file<P: AsRef<Path>>(
134 path: P,
135) -> Result<StartRequestIter<'static>, SpiderError> {
136 let path = path.as_ref();
137 let file = File::open(path)?;
138 let path_display = path.display().to_string();
139 let mut lines = BufReader::new(file).lines().enumerate();
140
141 let iter = std::iter::from_fn(move || {
142 loop {
143 let (line_idx, line_res) = lines.next()?;
144 let line_number = line_idx + 1;
145 match line_res {
146 Ok(line) => {
147 let trimmed = line.trim();
148 if trimmed.is_empty() || trimmed.starts_with('#') {
149 continue;
150 }
151
152 return Some(match Url::parse(trimmed) {
153 Ok(url) => Ok(Request::new(url)),
154 Err(e) => Err(SpiderError::ConfigurationError(format!(
155 "Invalid start URL in {} at line {}: {}",
156 path_display, line_number, e
157 ))),
158 });
159 }
160 Err(e) => {
161 return Some(Err(SpiderError::IoError(format!(
162 "Failed reading {} at line {}: {}",
163 path_display, line_number, e
164 ))));
165 }
166 }
167 }
168 });
169
170 Ok(Box::new(iter))
171}
172
/// Defines the contract for a spider.
///
/// ## Type Parameters
///
/// - `Item`: The type of scraped data structure (must implement [`ScrapedItem`])
/// - `State`: The type of shared state (must implement `Default`)
///
/// ## Design Notes
///
/// The trait uses `&self` (immutable reference) instead of `&mut self` for the
/// [`parse`](Spider::parse) method. This design enables efficient concurrent crawling
/// by eliminating the need for mutex locks when accessing the spider from multiple
/// async tasks. State that needs mutation should be stored in the associated
/// `State` type using thread-safe primitives like `Arc<AtomicUsize>` or `DashMap`.
///
/// A typical crawl lifecycle looks like this:
///
/// 1. [`start_requests`](Spider::start_requests) produces the initial requests
/// 2. the runtime schedules and downloads them
/// 3. [`parse`](Spider::parse) turns each [`Response`] into a [`ParseOutput`]
/// 4. emitted items go to pipelines and emitted requests go back to the scheduler
#[async_trait]
pub trait Spider: Send + Sync + 'static {
    /// The type of item that the spider scrapes.
    ///
    /// This associated type must implement the [`ScrapedItem`] trait, which
    /// provides methods for type erasure, cloning, and JSON serialization.
    /// Use the `#[scraped_item]` procedural macro to automatically implement
    /// all required traits for your data structures.
    type Item: ScrapedItem;

    /// The type of state that the spider uses.
    ///
    /// The state type must implement `Default` so it can be instantiated
    /// automatically by the crawler. It should also be `Send + Sync` to
    /// enable safe concurrent access from multiple async tasks.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// use std::sync::Arc;
    /// use std::sync::atomic::{AtomicUsize, Ordering};
    /// use dashmap::DashMap;
    ///
    /// #[derive(Clone, Default)]
    /// struct MySpiderState {
    ///     page_count: Arc<AtomicUsize>,
    ///     visited_urls: Arc<DashMap<String, bool>>,
    /// }
    /// ```
    type State: Default + Send + Sync;

    /// Returns static seed URLs.
    ///
    /// This method is optional and useful for simple spiders. The default
    /// [`start_requests`](Spider::start_requests) implementation converts these
    /// URLs into a request iterator.
    ///
    /// Prefer this method when plain URL strings are enough. Override
    /// [`start_requests`](Spider::start_requests) instead when you need custom
    /// headers, methods, request metadata, seed-file loading, or dynamic seed
    /// generation.
    fn start_urls(&self) -> Vec<&'static str> {
        Vec::new()
    }

    /// Returns the initial request source used to start crawling.
    ///
    /// The default implementation converts [`start_urls`](Spider::start_urls)
    /// into an iterator.
    ///
    /// To load from seed file, return `StartRequests::file(path)`.
    /// To use a fixed list of URL strings, return `StartRequests::Urls(...)`.
    /// To use custom generation logic, return `StartRequests::Iter(...)`.
    ///
    /// This method is the better override point whenever initial requests need
    /// more than a URL string, such as per-request metadata, POST bodies, or
    /// custom headers.
    ///
    /// # Errors
    ///
    /// The default implementation never fails; overrides may return a
    /// [`SpiderError`] when constructing the initial request source fails.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # struct MySpider;
    /// # #[async_trait::async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = ();
    /// fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    ///     Ok(StartRequests::file("seeds/start_urls.txt"))
    /// }
    /// #     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    /// #         todo!()
    /// #     }
    /// # }
    /// ```
    fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
        Ok(StartRequests::Urls(self.start_urls()))
    }

    /// Parses a response and extracts scraped items and new requests.
    ///
    /// This is the primary method where scraping logic is implemented. It receives
    /// a [`Response`] object and should extract structured data (items) and/or
    /// discover new URLs to crawl (requests).
    ///
    /// ## Parameters
    ///
    /// - `response`: The HTTP response to parse, containing the body, headers, and URL
    /// - `state`: A shared reference to the spider's state, which can be used to
    ///   track information across multiple parse calls
    ///
    /// ## Returns
    ///
    /// Returns a [`ParseOutput`] containing:
    /// - Scraped items of type `Self::Item`
    /// - New [`Request`] objects to be enqueued
    ///
    /// The usual pattern is:
    /// - call [`ParseOutput::new`]
    /// - add zero or more items with [`ParseOutput::add_item`] or `add_items`
    /// - add zero or more follow-up requests with [`ParseOutput::add_request`]
    ///   or `add_requests`
    /// - return the accumulated output
    ///
    /// ## Design Notes
    ///
    /// This method takes an immutable reference to `self` (`&self`) instead of
    /// mutable (`&mut self`), eliminating the need for mutex locks when accessing
    /// the spider in concurrent environments. State that needs to be modified
    /// should be stored in the `State` type using thread-safe primitives.
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError`] if parsing fails or if an unrecoverable error
    /// occurs during processing.
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # use async_trait::async_trait;
    /// # struct MySpider;
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # #[derive(Default)]
    /// # struct MySpiderState;
    /// # #[async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = MySpiderState;
    /// #     fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    /// #         Ok(StartRequests::Iter(Box::new(std::iter::empty())))
    /// #     }
    /// async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    ///     let mut output = ParseOutput::new();
    ///
    ///     // Parse HTML and extract data
    ///     if let Ok(html) = response.to_html() {
    ///         // ... extraction logic ...
    ///     }
    ///
    ///     Ok(output)
    /// }
    /// # }
    /// ```
    async fn parse(
        &self,
        response: Response,
        state: &Self::State,
    ) -> Result<ParseOutput<Self::Item>, SpiderError>;
}
354}