// spider_core/spider.rs
1//! The spider trait and request bootstrap types.
2//!
3//! [`Spider`] is the main contract every crawler implements. It defines how
4//! a crawl starts and how each downloaded response turns into scraped items and
5//! follow-up requests.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
//! use spider_core::{Spider, StartRequests};
//! use spider_util::{response::Response, error::SpiderError, item::ParseOutput, request::Request};
12//! use async_trait::async_trait;
13//!
14//! #[spider_macro::scraped_item]
15//! struct Article {
16//!     title: String,
17//!     content: String,
18//! }
19//!
20//! // State for tracking page count
21//! use std::sync::Arc;
22//! use std::sync::atomic::{AtomicUsize, Ordering};
23//! use dashmap::DashMap;
24//!
25//! #[derive(Clone, Default)]
26//! struct ArticleSpiderState {
27//!     page_count: Arc<AtomicUsize>,
28//!     visited_urls: Arc<DashMap<String, bool>>,
29//! }
30//!
31//! impl ArticleSpiderState {
32//!     fn increment_page_count(&self) {
33//!         self.page_count.fetch_add(1, Ordering::SeqCst);
34//!     }
35//!
36//!     fn mark_url_visited(&self, url: String) {
37//!         self.visited_urls.insert(url, true);
38//!     }
39//! }
40//!
41//! struct ArticleSpider;
42//!
43//! #[async_trait]
44//! impl Spider for ArticleSpider {
45//!     type Item = Article;
46//!     type State = ArticleSpiderState;
47//!
48//!     fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
49//!         let req = Request::new("https://example.com/articles".parse()?);
50//!         Ok(StartRequests::iter(std::iter::once(Ok(req))))
51//!     }
52//!
53//!     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
54//!         // Update state - can be done concurrently without blocking the spider
55//!         state.increment_page_count();
56//!         state.mark_url_visited(response.url.to_string());
57//!
58//!         let mut output = ParseOutput::new();
59//!
60//!         // Extract articles from the page
61//!         // ... parsing logic ...
62//!
63//!         // Add discovered articles to output
64//!         // output.add_item(Article { title, content });
65//!
66//!         // Add new URLs to follow
67//!         // output.add_request(new_request);
68//!
69//!         Ok(output)
70//!     }
71//! }
72//! ```
73
74use spider_util::error::SpiderError;
75use spider_util::item::{ParseOutput, ScrapedItem};
76use spider_util::request::Request;
77use spider_util::response::Response;
78
79use anyhow::Result;
80use async_trait::async_trait;
81use std::fs::File;
82use std::io::{BufRead, BufReader};
83use std::path::Path;
84use url::Url;
85
/// A boxed iterator of start requests.
///
/// Each item is either a ready-to-schedule [`Request`] or a [`SpiderError`]
/// explaining why that particular seed could not be produced. The `Send`
/// bound allows the iterator to be driven from another async task.
pub type StartRequestIter<'a> = Box<dyn Iterator<Item = Result<Request, SpiderError>> + Send + 'a>;
88
/// Initial request source returned by [`Spider::start_requests`].
///
/// Use [`StartRequests::Urls`] for simple static seeds, [`StartRequests::Iter`]
/// when you need to construct full [`Request`] values or generate seeds
/// lazily, and [`StartRequests::File`] when you want to keep large seed lists
/// outside compiled code.
pub enum StartRequests<'a> {
    /// Fixed list of seed URLs. Each string is parsed into a [`Request`]
    /// lazily when the source is resolved via `into_iter`.
    Urls(Vec<&'a str>),
    /// Direct request iterator supplied by the spider.
    Iter(StartRequestIter<'a>),
    /// Path to a plain-text seed file (one URL per line). Blank lines and
    /// lines starting with `#` are skipped when the file is read.
    File(&'a str),
}
103
104impl<'a> StartRequests<'a> {
105    /// Creates an iterator-based source from any compatible request iterator.
106    pub fn iter<I>(iter: I) -> Self
107    where
108        I: Iterator<Item = Result<Request, SpiderError>> + Send + 'a,
109    {
110        StartRequests::Iter(Box::new(iter))
111    }
112
113    /// Creates a file-based source from a path string.
114    ///
115    /// The file is expected to contain one URL per line. Empty lines and lines
116    /// starting with `#` are ignored.
117    pub fn file(path: &'a str) -> Self {
118        StartRequests::File(path)
119    }
120
121    /// Resolves this source into a concrete request iterator.
122    #[allow(clippy::should_implement_trait)]
123    ///
124    /// URL strings are parsed eagerly as the iterator is consumed. Invalid file
125    /// entries become `SpiderError::ConfigurationError` items that preserve the
126    /// original line number.
127    pub fn into_iter(self) -> Result<StartRequestIter<'a>, SpiderError> {
128        match self {
129            StartRequests::Urls(urls) => {
130                let requests = urls
131                    .into_iter()
132                    .map(|u| Url::parse(u).map(Request::new).map_err(SpiderError::from));
133                Ok(Box::new(requests))
134            }
135            StartRequests::Iter(iter) => Ok(iter),
136            StartRequests::File(path) => start_requests_from_file(path),
137        }
138    }
139}
140
141impl<'a, I> From<I> for StartRequests<'a>
142where
143    I: Iterator<Item = Result<Request, SpiderError>> + Send + 'a,
144{
145    fn from(iter: I) -> Self {
146        StartRequests::iter(iter)
147    }
148}
149
150fn start_requests_from_file<P: AsRef<Path>>(
151    path: P,
152) -> Result<StartRequestIter<'static>, SpiderError> {
153    let path = path.as_ref();
154    let file = File::open(path)?;
155    let path_display = path.display().to_string();
156    let mut lines = BufReader::new(file).lines().enumerate();
157
158    let iter = std::iter::from_fn(move || {
159        loop {
160            let (line_idx, line_res) = lines.next()?;
161            let line_number = line_idx + 1;
162            match line_res {
163                Ok(line) => {
164                    let trimmed = line.trim();
165                    if trimmed.is_empty() || trimmed.starts_with('#') {
166                        continue;
167                    }
168
169                    return Some(match Url::parse(trimmed) {
170                        Ok(url) => Ok(Request::new(url)),
171                        Err(e) => Err(SpiderError::ConfigurationError(format!(
172                            "Invalid start URL in {} at line {}: {}",
173                            path_display, line_number, e
174                        ))),
175                    });
176                }
177                Err(e) => {
178                    return Some(Err(SpiderError::IoError(format!(
179                        "Failed reading {} at line {}: {}",
180                        path_display, line_number, e
181                    ))));
182                }
183            }
184        }
185    });
186
187    Ok(Box::new(iter))
188}
189
/// Defines the contract for a spider.
///
/// ## Type Parameters
///
/// - `Item`: The type of scraped data structure (must implement [`ScrapedItem`])
/// - `State`: The type of shared state (must implement `Default`)
///
/// ## Design Notes
///
/// The trait uses `&self` (immutable reference) instead of `&mut self` for the
/// [`parse`](Spider::parse) method. This design enables efficient concurrent crawling
/// by eliminating the need for mutex locks when accessing the spider from multiple
/// async tasks. State that needs mutation should be stored in the associated
/// `State` type using thread-safe primitives like `Arc<AtomicUsize>` or `DashMap`.
///
/// A typical crawl lifecycle looks like this:
///
/// 1. [`start_requests`](Spider::start_requests) produces the initial requests
/// 2. the runtime schedules and downloads them
/// 3. [`parse`](Spider::parse) turns each [`Response`] into a [`ParseOutput`]
/// 4. emitted items go to pipelines and emitted requests go back to the scheduler
#[async_trait]
pub trait Spider: Send + Sync + 'static {
    /// The type of item that the spider scrapes.
    ///
    /// This associated type must implement the [`ScrapedItem`] trait, which
    /// provides methods for type erasure, cloning, and JSON serialization.
    /// Use the `#[scraped_item]` procedural macro to automatically implement
    /// all required traits for your data structures.
    type Item: ScrapedItem;

    /// The type of state that the spider uses.
    ///
    /// The state type must implement `Default` so it can be instantiated
    /// automatically by the crawler. It should also be `Send + Sync` to
    /// enable safe concurrent access from multiple async tasks.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// use std::sync::Arc;
    /// use std::sync::atomic::{AtomicUsize, Ordering};
    /// use dashmap::DashMap;
    ///
    /// #[derive(Clone, Default)]
    /// struct MySpiderState {
    ///     page_count: Arc<AtomicUsize>,
    ///     visited_urls: Arc<DashMap<String, bool>>,
    /// }
    /// ```
    type State: Default + Send + Sync;

    /// Returns static seed URLs.
    ///
    /// This method is optional and useful for simple spiders. The default
    /// [`start_requests`](Spider::start_requests) implementation converts these
    /// URLs into a request iterator.
    ///
    /// Prefer this method when plain URL strings are enough. Override
    /// [`start_requests`](Spider::start_requests) instead when you need custom
    /// headers, methods, request metadata, seed-file loading, or dynamic seed
    /// generation.
    fn start_urls(&self) -> Vec<&'static str> {
        Vec::new()
    }

    /// Returns the initial request source used to start crawling.
    ///
    /// The default implementation converts [`start_urls`](Spider::start_urls)
    /// into an iterator.
    ///
    /// To load from a seed file, return `StartRequests::file(path)`.
    /// To use a fixed list of URL strings, return `StartRequests::Urls(...)`.
    /// To use custom generation logic, return `StartRequests::iter(...)`.
    ///
    /// This method is the better override point whenever initial requests need
    /// more than a URL string, such as per-request metadata, POST bodies, or
    /// custom headers.
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError`] if the request source cannot be constructed.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # struct MySpider;
    /// # #[async_trait::async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = ();
    /// fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    ///     Ok(StartRequests::file("seeds/start_urls.txt"))
    /// }
    /// # async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    /// #     todo!()
    /// # }
    /// # }
    /// ```
    fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
        Ok(StartRequests::Urls(self.start_urls()))
    }

    /// Parses a response and extracts scraped items and new requests.
    ///
    /// This is the primary method where scraping logic is implemented. It receives
    /// a [`Response`] object and should extract structured data (items) and/or
    /// discover new URLs to crawl (requests).
    ///
    /// ## Parameters
    ///
    /// - `response`: The HTTP response to parse, containing the body, headers, and URL
    /// - `state`: A shared reference to the spider's state, which can be used to
    ///   track information across multiple parse calls
    ///
    /// ## Returns
    ///
    /// Returns a [`ParseOutput`] containing:
    /// - Scraped items of type `Self::Item`
    /// - New [`Request`] objects to be enqueued
    ///
    /// The usual pattern is:
    /// - call [`ParseOutput::new`]
    /// - add zero or more items with [`ParseOutput::add_item`] or `add_items`
    /// - add zero or more follow-up requests with [`ParseOutput::add_request`]
    ///   or `add_requests`
    /// - return the accumulated output
    ///
    /// ## Design Notes
    ///
    /// This method takes an immutable reference to `self` (`&self`) instead of
    /// mutable (`&mut self`), eliminating the need for mutex locks when accessing
    /// the spider in concurrent environments. State that needs to be modified
    /// should be stored in the `State` type using thread-safe primitives.
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError`] if parsing fails or if an unrecoverable error
    /// occurs during processing.
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # use async_trait::async_trait;
    /// # struct MySpider;
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # #[derive(Default)]
    /// # struct MySpiderState;
    /// # #[async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = MySpiderState;
    /// #     fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    /// #         Ok(StartRequests::iter(std::iter::empty()))
    /// #     }
    /// async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    ///     let mut output = ParseOutput::new();
    ///
    ///     // Parse HTML and extract data
    ///     if let Ok(html) = response.to_html() {
    ///         // ... extraction logic ...
    ///     }
    ///
    ///     Ok(output)
    /// }
    /// # }
    /// ```
    async fn parse(
        &self,
        response: Response,
        state: &Self::State,
    ) -> Result<ParseOutput<Self::Item>, SpiderError>;
371}