// spider_core/spider.rs
1//! The spider trait and request bootstrap types.
2//!
3//! [`Spider`] is the main contract every crawler implements. It defines how
4//! a crawl starts and how each downloaded response turns into scraped items and
5//! follow-up requests.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
//! use spider_core::{Spider, StartRequests};
//! use spider_util::{error::SpiderError, item::ParseOutput, request::Request, response::Response};
12//! use async_trait::async_trait;
13//!
14//! #[spider_macro::scraped_item]
15//! struct Article {
16//!     title: String,
17//!     content: String,
18//! }
19//!
20//! // State for tracking page count
21//! use std::sync::Arc;
22//! use std::sync::atomic::{AtomicUsize, Ordering};
23//! use dashmap::DashMap;
24//!
25//! #[derive(Clone, Default)]
26//! struct ArticleSpiderState {
27//!     page_count: Arc<AtomicUsize>,
28//!     visited_urls: Arc<DashMap<String, bool>>,
29//! }
30//!
31//! impl ArticleSpiderState {
32//!     fn increment_page_count(&self) {
33//!         self.page_count.fetch_add(1, Ordering::SeqCst);
34//!     }
35//!
36//!     fn mark_url_visited(&self, url: String) {
37//!         self.visited_urls.insert(url, true);
38//!     }
39//! }
40//!
41//! struct ArticleSpider;
42//!
43//! #[async_trait]
44//! impl Spider for ArticleSpider {
45//!     type Item = Article;
46//!     type State = ArticleSpiderState;
47//!
48//!     fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
49//!         let req = Request::new("https://example.com/articles".parse()?);
50//!         Ok(StartRequests::Iter(Box::new(std::iter::once(Ok(req)))))
51//!     }
52//!
53//!     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
54//!         // Update state - can be done concurrently without blocking the spider
55//!         state.increment_page_count();
56//!         state.mark_url_visited(response.url.to_string());
57//!
58//!         let mut output = ParseOutput::new();
59//!
60//!         // Extract articles from the page
61//!         // ... parsing logic ...
62//!
63//!         // Add discovered articles to output
64//!         // output.add_item(Article { title, content });
65//!
66//!         // Add new URLs to follow
67//!         // output.add_request(new_request);
68//!
69//!         Ok(output)
70//!     }
71//! }
72//! ```
73
74use spider_util::error::SpiderError;
75use spider_util::item::{ParseOutput, ScrapedItem};
76use spider_util::request::Request;
77use spider_util::response::Response;
78
79use anyhow::Result;
80use async_trait::async_trait;
81use std::fs::File;
82use std::io::{BufRead, BufReader};
83use std::path::Path;
84use url::Url;
85
/// A boxed iterator of start requests.
///
/// Each item is either a ready-to-schedule [`Request`] or a [`SpiderError`]
/// explaining why that particular seed could not be turned into a request.
/// The iterator is `Send` so it can be driven from any runtime task.
pub type StartRequestIter<'a> = Box<dyn Iterator<Item = Result<Request, SpiderError>> + Send + 'a>;
88
/// Initial request source returned by [`Spider::start_requests`].
///
/// Use [`StartRequests::Urls`] for simple static seeds, [`StartRequests::Iter`]
/// when you need to construct full [`Request`] values or generate seeds
/// lazily, and [`StartRequests::File`] when you want to keep large seed lists
/// outside compiled code.
pub enum StartRequests<'a> {
    /// Fixed list of seed URLs; each string is parsed lazily as the resolved
    /// iterator is consumed.
    Urls(Vec<&'a str>),
    /// Direct request iterator supplied by the spider.
    Iter(StartRequestIter<'a>),
    /// Path to a plain-text seed file (one URL per line; empty lines and
    /// `#`-prefixed lines are skipped).
    File(&'a str),
}
103
104impl<'a> StartRequests<'a> {
105    /// Creates a file-based source from a path string.
106    ///
107    /// The file is expected to contain one URL per line. Empty lines and lines
108    /// starting with `#` are ignored.
109    pub fn file(path: &'a str) -> Self {
110        StartRequests::File(path)
111    }
112
113    /// Resolves this source into a concrete request iterator.
114    #[allow(clippy::should_implement_trait)]
115    ///
116    /// URL strings are parsed eagerly as the iterator is consumed. Invalid file
117    /// entries become `SpiderError::ConfigurationError` items that preserve the
118    /// original line number.
119    pub fn into_iter(self) -> Result<StartRequestIter<'a>, SpiderError> {
120        match self {
121            StartRequests::Urls(urls) => {
122                let requests = urls
123                    .into_iter()
124                    .map(|u| Url::parse(u).map(Request::new).map_err(SpiderError::from));
125                Ok(Box::new(requests))
126            }
127            StartRequests::Iter(iter) => Ok(iter),
128            StartRequests::File(path) => start_requests_from_file(path),
129        }
130    }
131}
132
133fn start_requests_from_file<P: AsRef<Path>>(
134    path: P,
135) -> Result<StartRequestIter<'static>, SpiderError> {
136    let path = path.as_ref();
137    let file = File::open(path)?;
138    let path_display = path.display().to_string();
139    let mut lines = BufReader::new(file).lines().enumerate();
140
141    let iter = std::iter::from_fn(move || {
142        loop {
143            let (line_idx, line_res) = lines.next()?;
144            let line_number = line_idx + 1;
145            match line_res {
146                Ok(line) => {
147                    let trimmed = line.trim();
148                    if trimmed.is_empty() || trimmed.starts_with('#') {
149                        continue;
150                    }
151
152                    return Some(match Url::parse(trimmed) {
153                        Ok(url) => Ok(Request::new(url)),
154                        Err(e) => Err(SpiderError::ConfigurationError(format!(
155                            "Invalid start URL in {} at line {}: {}",
156                            path_display, line_number, e
157                        ))),
158                    });
159                }
160                Err(e) => {
161                    return Some(Err(SpiderError::IoError(format!(
162                        "Failed reading {} at line {}: {}",
163                        path_display, line_number, e
164                    ))));
165                }
166            }
167        }
168    });
169
170    Ok(Box::new(iter))
171}
172
/// Defines the contract for a spider.
///
/// ## Type Parameters
///
/// - `Item`: The type of scraped data structure (must implement [`ScrapedItem`])
/// - `State`: The type of shared state (must implement `Default`)
///
/// ## Design Notes
///
/// The trait uses `&self` (immutable reference) instead of `&mut self` for the
/// [`parse`](Spider::parse) method. This design enables efficient concurrent crawling
/// by eliminating the need for mutex locks when accessing the spider from multiple
/// async tasks. State that needs mutation should be stored in the associated
/// `State` type using thread-safe primitives like `Arc<AtomicUsize>` or `DashMap`.
///
/// A typical crawl lifecycle looks like this:
///
/// 1. [`start_requests`](Spider::start_requests) produces the initial requests
/// 2. the runtime schedules and downloads them
/// 3. [`parse`](Spider::parse) turns each [`Response`] into a [`ParseOutput`]
/// 4. emitted items go to pipelines and emitted requests go back to the scheduler
#[async_trait]
pub trait Spider: Send + Sync + 'static {
    /// The type of item that the spider scrapes.
    ///
    /// This associated type must implement the [`ScrapedItem`] trait, which
    /// provides methods for type erasure, cloning, and JSON serialization.
    /// Use the `#[scraped_item]` procedural macro to automatically implement
    /// all required traits for your data structures.
    type Item: ScrapedItem;

    /// The type of state that the spider uses.
    ///
    /// The state type must implement `Default` so it can be instantiated
    /// automatically by the crawler. It should also be `Send + Sync` to
    /// enable safe concurrent access from multiple async tasks.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// use std::sync::Arc;
    /// use std::sync::atomic::{AtomicUsize, Ordering};
    /// use dashmap::DashMap;
    ///
    /// #[derive(Clone, Default)]
    /// struct MySpiderState {
    ///     page_count: Arc<AtomicUsize>,
    ///     visited_urls: Arc<DashMap<String, bool>>,
    /// }
    /// ```
    type State: Default + Send + Sync;

    /// Returns static seed URLs.
    ///
    /// This method is optional and useful for simple spiders. The default
    /// [`start_requests`](Spider::start_requests) implementation converts these
    /// URLs into a request iterator.
    ///
    /// Prefer this method when plain URL strings are enough. Override
    /// [`start_requests`](Spider::start_requests) instead when you need custom
    /// headers, methods, request metadata, seed-file loading, or dynamic seed
    /// generation.
    fn start_urls(&self) -> Vec<&'static str> {
        Vec::new()
    }

    /// Returns the initial request source used to start crawling.
    ///
    /// The default implementation converts [`start_urls`](Spider::start_urls)
    /// into an iterator.
    ///
    /// To load from seed file, return `StartRequests::file(path)`.
    /// To use a fixed list of URL strings, return `StartRequests::Urls(...)`.
    /// To use custom generation logic, return `StartRequests::Iter(...)`.
    ///
    /// This method is the better override point whenever initial requests need
    /// more than a URL string, such as per-request metadata, POST bodies, or
    /// custom headers.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # struct MySpider;
    /// # #[async_trait::async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = ();
    /// fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    ///     Ok(StartRequests::file("seeds/start_urls.txt"))
    /// }
    /// # async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    /// #     todo!()
    /// # }
    /// # }
    /// ```
    fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
        Ok(StartRequests::Urls(self.start_urls()))
    }

    /// Parses a response and extracts scraped items and new requests.
    ///
    /// This is the primary method where scraping logic is implemented. It receives
    /// a [`Response`] object and should extract structured data (items) and/or
    /// discover new URLs to crawl (requests).
    ///
    /// ## Parameters
    ///
    /// - `response`: The HTTP response to parse, containing the body, headers, and URL
    /// - `state`: A shared reference to the spider's state, which can be used to
    ///   track information across multiple parse calls
    ///
    /// ## Returns
    ///
    /// Returns a [`ParseOutput`] containing:
    /// - Scraped items of type `Self::Item`
    /// - New [`Request`] objects to be enqueued
    ///
    /// The usual pattern is:
    /// - call [`ParseOutput::new`]
    /// - add zero or more items with [`ParseOutput::add_item`] or `add_items`
    /// - add zero or more follow-up requests with [`ParseOutput::add_request`]
    ///   or `add_requests`
    /// - return the accumulated output
    ///
    /// ## Design Notes
    ///
    /// This method takes an immutable reference to `self` (`&self`) instead of
    /// mutable (`&mut self`), eliminating the need for mutex locks when accessing
    /// the spider in concurrent environments. State that needs to be modified
    /// should be stored in the `State` type using thread-safe primitives.
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError`] if parsing fails or if an unrecoverable error
    /// occurs during processing.
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// # use spider_core::{scraped_item, Spider, StartRequests};
    /// # use spider_util::{response::Response, error::SpiderError, item::{ParseOutput, ScrapedItem}};
    /// # use async_trait::async_trait;
    /// # struct MySpider;
    /// # #[scraped_item]
    /// # struct ExampleItem {
    /// #     value: String,
    /// # }
    /// # #[derive(Default)]
    /// # struct MySpiderState;
    /// # #[async_trait]
    /// # impl Spider for MySpider {
    /// #     type Item = ExampleItem;
    /// #     type State = MySpiderState;
    /// #     fn start_requests(&self) -> Result<StartRequests<'_>, SpiderError> {
    /// #         Ok(StartRequests::Iter(Box::new(std::iter::empty())))
    /// #     }
    /// async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
    ///     let mut output = ParseOutput::new();
    ///
    ///     // Parse HTML and extract data
    ///     if let Ok(html) = response.to_html() {
    ///         // ... extraction logic ...
    ///     }
    ///
    ///     Ok(output)
    /// }
    /// # }
    /// ```
    async fn parse(
        &self,
        response: Response,
        state: &Self::State,
    ) -> Result<ParseOutput<Self::Item>, SpiderError>;
}
354}