// spider_core/spider.rs
1//! The spider trait and request bootstrap types.
2//!
3//! [`Spider`] is the main contract every crawler implements. It defines how
4//! a crawl starts and how each downloaded response turns into scraped items and
5//! follow-up requests.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
//! use spider_core::{Spider, StartRequests};
//! use spider_util::request::Request;
//! use spider_util::{response::Response, error::SpiderError, item::ParseOutput};
12//! use async_trait::async_trait;
13//!
14//! #[spider_macro::scraped_item]
15//! struct Article {
16//! title: String,
17//! content: String,
18//! }
19//!
20//! // State for tracking page count
21//! use std::sync::Arc;
22//! use std::sync::atomic::{AtomicUsize, Ordering};
23//! use dashmap::DashMap;
24//!
25//! #[derive(Clone, Default)]
26//! struct ArticleSpiderState {
27//! page_count: Arc<AtomicUsize>,
28//! visited_urls: Arc<DashMap<String, bool>>,
29//! }
30//!
31//! impl ArticleSpiderState {
32//! fn increment_page_count(&self) {
33//! self.page_count.fetch_add(1, Ordering::SeqCst);
34//! }
35//!
36//! fn mark_url_visited(&self, url: String) {
37//! self.visited_urls.insert(url, true);
38//! }
39//! }
40//!
41//! struct ArticleSpider;
42//!
43//! #[async_trait]
44//! impl Spider for ArticleSpider {
45//! type Item = Article;
46//! type State = ArticleSpiderState;
47//!
48//! fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
49//! let req = Request::new("https://example.com/articles".parse()?);
50//! Ok(StartRequests::iter(std::iter::once(Ok(req))))
51//! }
52//!
53//! async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
54//! // Update state - can be done concurrently without blocking the spider
55//! state.increment_page_count();
56//! state.mark_url_visited(response.url.to_string());
57//!
58//! let mut output = ParseOutput::new();
59//!
60//! // Extract articles from the page
61//! // ... parsing logic ...
62//!
63//! // Add discovered articles to output
64//! // output.add_item(Article { title, content });
65//!
66//! // Add new URLs to follow
67//! // output.add_request(new_request);
68//!
69//! Ok(output)
70//! }
71//! }
72//! ```
73
74use spider_util::error::SpiderError;
75use spider_util::item::{ParseOutput, ScrapedItem};
76use spider_util::request::Request;
77use spider_util::response::Response;
78
79use anyhow::Result;
80use async_trait::async_trait;
81use std::fs::File;
82use std::io::{BufRead, BufReader};
83use std::path::Path;
84use url::Url;
85
/// A boxed iterator of start requests.
///
/// Each item is either a ready-to-schedule [`Request`] or the [`SpiderError`]
/// produced while building it (for example, an invalid seed URL). The `Send`
/// bound allows the iterator to be moved into async tasks.
pub type StartRequestIter<'a> = Box<dyn Iterator<Item = Result<Request, SpiderError>> + Send + 'a>;
88
/// Initial request source returned by [`Spider::start_requests`].
///
/// Use [`StartRequests::Urls`] for simple static seeds, [`StartRequests::Iter`]
/// when you need to construct full [`Request`] values or generate seeds
/// lazily, and [`StartRequests::File`] when you want to keep large seed lists
/// outside compiled code.
pub enum StartRequests<'a> {
    /// Fixed list of seed URLs.
    ///
    /// Each string is parsed into a [`Request`] when the source is resolved
    /// via [`StartRequests::into_iter`].
    Urls(Vec<&'a str>),
    /// Direct request iterator supplied by the spider.
    Iter(StartRequestIter<'a>),
    /// Path to a plain-text seed file (one URL per line).
    ///
    /// Empty lines and lines starting with `#` are skipped when the file is
    /// read.
    File(&'a str),
}
103
104impl<'a> StartRequests<'a> {
105 /// Creates an iterator-based source from any compatible request iterator.
106 pub fn iter<I>(iter: I) -> Self
107 where
108 I: Iterator<Item = Result<Request, SpiderError>> + Send + 'a,
109 {
110 StartRequests::Iter(Box::new(iter))
111 }
112
113 /// Creates a file-based source from a path string.
114 ///
115 /// The file is expected to contain one URL per line. Empty lines and lines
116 /// starting with `#` are ignored.
117 pub fn file(path: &'a str) -> Self {
118 StartRequests::File(path)
119 }
120
121 /// Resolves this source into a concrete request iterator.
122 #[allow(clippy::should_implement_trait)]
123 ///
124 /// URL strings are parsed eagerly as the iterator is consumed. Invalid file
125 /// entries become `SpiderError::ConfigurationError` items that preserve the
126 /// original line number.
127 pub fn into_iter(self) -> Result<StartRequestIter<'a>, SpiderError> {
128 match self {
129 StartRequests::Urls(urls) => {
130 let requests = urls
131 .into_iter()
132 .map(|u| Url::parse(u).map(Request::new).map_err(SpiderError::from));
133 Ok(Box::new(requests))
134 }
135 StartRequests::Iter(iter) => Ok(iter),
136 StartRequests::File(path) => start_requests_from_file(path),
137 }
138 }
139}
140
141impl<'a, I> From<I> for StartRequests<'a>
142where
143 I: Iterator<Item = Result<Request, SpiderError>> + Send + 'a,
144{
145 fn from(iter: I) -> Self {
146 StartRequests::iter(iter)
147 }
148}
149
150fn start_requests_from_file<P: AsRef<Path>>(
151 path: P,
152) -> Result<StartRequestIter<'static>, SpiderError> {
153 let path = path.as_ref();
154 let file = File::open(path)?;
155 let path_display = path.display().to_string();
156 let mut lines = BufReader::new(file).lines().enumerate();
157
158 let iter = std::iter::from_fn(move || {
159 loop {
160 let (line_idx, line_res) = lines.next()?;
161 let line_number = line_idx + 1;
162 match line_res {
163 Ok(line) => {
164 let trimmed = line.trim();
165 if trimmed.is_empty() || trimmed.starts_with('#') {
166 continue;
167 }
168
169 return Some(match Url::parse(trimmed) {
170 Ok(url) => Ok(Request::new(url)),
171 Err(e) => Err(SpiderError::ConfigurationError(format!(
172 "Invalid start URL in {} at line {}: {}",
173 path_display, line_number, e
174 ))),
175 });
176 }
177 Err(e) => {
178 return Some(Err(SpiderError::IoError(format!(
179 "Failed reading {} at line {}: {}",
180 path_display, line_number, e
181 ))));
182 }
183 }
184 }
185 });
186
187 Ok(Box::new(iter))
188}
189
/// Defines the contract for a spider.
///
/// ## Type Parameters
///
/// - `Item`: The type of scraped data structure (must implement [`ScrapedItem`])
/// - `State`: The type of shared state (must implement `Default`)
///
/// ## Design Notes
///
/// The trait uses `&self` (immutable reference) instead of `&mut self` for the
/// [`parse`](Spider::parse) method. This design enables efficient concurrent crawling
/// by eliminating the need for mutex locks when accessing the spider from multiple
/// async tasks. State that needs mutation should be stored in the associated
/// `State` type using thread-safe primitives like `Arc<AtomicUsize>` or `DashMap`.
///
/// A typical crawl lifecycle looks like this:
///
/// 1. [`start_requests`](Spider::start_requests) produces the initial requests
/// 2. the runtime schedules and downloads them
/// 3. [`parse`](Spider::parse) turns each [`Response`] into a [`ParseOutput`]
/// 4. emitted items go to pipelines and emitted requests go back to the scheduler
#[async_trait]
pub trait Spider: Send + Sync + 'static {
    /// The type of item that the spider scrapes.
    ///
    /// This associated type must implement the [`ScrapedItem`] trait, which
    /// provides methods for type erasure, cloning, and JSON serialization.
    /// Use the `#[scraped_item]` procedural macro to automatically implement
    /// all required traits for your data structures.
    type Item: ScrapedItem;

    /// The type of state that the spider uses.
    ///
    /// The state type must implement `Default` so it can be instantiated
    /// automatically by the crawler. It must also be `Send + Sync` to
    /// enable safe concurrent access from multiple async tasks.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// use std::sync::Arc;
    /// use std::sync::atomic::{AtomicUsize, Ordering};
    /// use dashmap::DashMap;
    ///
    /// #[derive(Clone, Default)]
    /// struct MySpiderState {
    ///     page_count: Arc<AtomicUsize>,
    ///     visited_urls: Arc<DashMap<String, bool>>,
    /// }
    /// ```
    type State: Default + Send + Sync;

    /// Returns static seed URLs.
    ///
    /// This method is optional and useful for simple spiders; the default
    /// implementation returns an empty list. The default
    /// [`start_requests`](Spider::start_requests) implementation converts these
    /// URLs into a request iterator.
    ///
    /// Prefer this method when plain URL strings are enough. Override
    /// [`start_requests`](Spider::start_requests) instead when you need custom
    /// headers, methods, request metadata, seed-file loading, or dynamic seed
    /// generation.
    fn start_urls(&self) -> Vec<&'static str> {
        Vec::new()
    }

    /// Returns the initial request source used to start crawling.
    ///
    /// The default implementation converts [`start_urls`](Spider::start_urls)
    /// into an iterator.
    ///
    /// To load from a seed file, return `StartRequests::file(path)`.
    /// To use a fixed list of URL strings, return `StartRequests::Urls(...)`.
    /// To use custom generation logic, return `StartRequests::iter(...)`.
    ///
    /// This method is the better override point whenever initial requests need
    /// more than a URL string, such as per-request metadata, POST bodies, or
    /// custom headers.
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError`] if the request source cannot be constructed.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # struct MySpider;
    /// # #[async_trait::async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = ();
    /// fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    ///     Ok(StartRequests::file("seeds/start_urls.txt"))
    /// }
    /// #     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    /// #         todo!()
    /// #     }
    /// # }
    /// ```
    fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
        Ok(StartRequests::Urls(self.start_urls()))
    }

    /// Parses a response and extracts scraped items and new requests.
    ///
    /// This is the primary method where scraping logic is implemented. It receives
    /// a [`Response`] object and should extract structured data (items) and/or
    /// discover new URLs to crawl (requests).
    ///
    /// ## Parameters
    ///
    /// - `response`: The HTTP response to parse, containing the body, headers, and URL
    /// - `state`: A shared reference to the spider's state, which can be used to
    ///   track information across multiple parse calls
    ///
    /// ## Returns
    ///
    /// Returns a [`ParseOutput`] containing:
    /// - Scraped items of type `Self::Item`
    /// - New [`Request`] objects to be enqueued
    ///
    /// The usual pattern is:
    /// - call [`ParseOutput::new`]
    /// - add zero or more items with [`ParseOutput::add_item`] or `add_items`
    /// - add zero or more follow-up requests with [`ParseOutput::add_request`]
    ///   or `add_requests`
    /// - return the accumulated output
    ///
    /// ## Design Notes
    ///
    /// This method takes an immutable reference to `self` (`&self`) instead of
    /// mutable (`&mut self`), eliminating the need for mutex locks when accessing
    /// the spider in concurrent environments. State that needs to be modified
    /// should be stored in the `State` type using thread-safe primitives.
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError`] if parsing fails or if an unrecoverable error
    /// occurs during processing.
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # use async_trait::async_trait;
    /// # struct MySpider;
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # #[derive(Default)]
    /// # struct MySpiderState;
    /// # #[async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = MySpiderState;
    /// #     fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    /// #         Ok(StartRequests::iter(std::iter::empty()))
    /// #     }
    /// async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    ///     let mut output = ParseOutput::new();
    ///
    ///     // Parse HTML and extract data
    ///     if let Ok(html) = response.to_html() {
    ///         // ... extraction logic ...
    ///     }
    ///
    ///     Ok(output)
    /// }
    /// # }
    /// ```
    async fn parse(
        &self,
        response: Response,
        state: &Self::State,
    ) -> Result<ParseOutput<Self::Item>, SpiderError>;