// spider_lib/item.rs

//! Data structures for scraped items and spider output in `spider-lib`.
//!
//! This module defines the `ScrapedItem` trait, which is the core abstraction
//! for any data extracted by a web spider. Implementors of this trait define
//! the shape of the data they wish to collect.
//!
//! Additionally, the `ParseOutput` struct is provided as the standard return type
//! for a spider's `parse` method. It encapsulates both the `ScrapedItem`s
//! found on a page and any new `Request`s that should be scheduled for crawling.
//! This allows spiders to not only extract data but also to discover and
//! follow new links within the same processing step.
12
13use crate::request::Request;
14use serde_json::Value;
15use std::any::Any;
16use std::fmt::Debug;
17
18/// The output of a spider's `parse` method.
19#[derive(Debug, Clone)]
20pub struct ParseOutput<I> {
21    items: Vec<I>,
22    requests: Vec<Request>,
23}
24
25impl<I> ParseOutput<I> {
26    /// Creates a new, empty `ParseOutput`.
27    pub fn new() -> Self {
28        Self {
29            items: Vec::new(),
30            requests: Vec::new(),
31        }
32    }
33
34    /// Consumes the `ParseOutput` and returns its inner items and requests.
35    pub fn into_parts(self) -> (Vec<I>, Vec<Request>) {
36        (self.items, self.requests)
37    }
38
39    /// Adds a scraped item to the output.
40    pub fn add_item(&mut self, item: I) {
41        self.items.push(item);
42    }
43
44    /// Adds a new request to be crawled.
45    pub fn add_request(&mut self, request: Request) {
46        self.requests.push(request);
47    }
48
49    /// Adds multiple scraped items to the output.
50    pub fn add_items(&mut self, items: impl IntoIterator<Item = I>) {
51        self.items.extend(items);
52    }
53
54    /// Adds multiple new requests to be crawled.
55    pub fn add_requests(&mut self, requests: impl IntoIterator<Item = Request>) {
56        self.requests.extend(requests);
57    }
58}
59
60impl<I> Default for ParseOutput<I> {
61    fn default() -> Self {
62        Self::new()
63    }
64}
65
66/// A trait representing a scraped item.
67pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
68    /// Returns the item as a `dyn Any` for downcasting.
69    fn as_any(&self) -> &dyn Any;
70    /// Clones the item into a `Box<dyn ScrapedItem>`.
71    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
72    /// Converts the item to a `serde_json::Value`.
73    fn to_json_value(&self) -> Value;
74}
75
76impl Clone for Box<dyn ScrapedItem + Send + Sync> {
77    fn clone(&self) -> Self {
78        self.box_clone()
79    }
80}