Skip to main content

spider_util/
item.rs

1//! Item traits and parse results.
2//!
3//! [`ParseOutput`] is what a spider returns after parsing a page. It carries the
4//! emitted items plus any follow-up requests discovered on that page.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
9//! use spider_util::item::{ScrapedItem, ParseOutput};
10//!
11//! #[spider_macro::scraped_item]
12//! struct Article {
13//!     title: String,
14//!     content: String,
15//! }
16//!
17//! // In your spider's parse method:
18//! // let mut output = ParseOutput::new();
//! // output.add_item(Article { title: "...".into(), content: "...".into() });
20//! // Ok(output)
21//! ```
22//!
23//! `ParseOutput` is intentionally small: it is just the handoff object between
24//! parsing and the rest of the runtime. Use it to emit items, schedule new
25//! requests, or both from the same page.
26
27use crate::request::Request;
28use serde_json::Value;
29use std::any::Any;
30use std::fmt::Debug;
31
32/// Stable field kinds used by typed item schema metadata.
33#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
34pub enum FieldValueType {
35    Bool,
36    Integer,
37    Float,
38    String,
39    Json,
40    Sequence,
41    Map,
42    Unknown,
43}
44
45/// Static schema metadata for a single item field.
46#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
47pub struct ItemFieldSchema {
48    pub name: String,
49    pub rust_type: String,
50    pub value_type: FieldValueType,
51    pub nullable: bool,
52}
53
54/// Static schema metadata for a scraped item type.
55#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
56pub struct ItemSchema {
57    pub item_name: String,
58    pub version: u32,
59    pub fields: Vec<ItemFieldSchema>,
60}
61
62impl ItemSchema {
63    /// Returns the fields in their declared order.
64    pub fn fields(&self) -> &[ItemFieldSchema] {
65        &self.fields
66    }
67}
68
69/// Trait for typed item definitions that can expose static schema metadata.
70pub trait TypedItemSchema {
71    /// Returns the typed schema for the item.
72    fn schema() -> ItemSchema;
73
74    /// Returns the schema version used by the item.
75    fn schema_version() -> u32 {
76        1
77    }
78}
79
80/// The output returned by a spider's `parse` method.
81#[derive(Debug, Clone)]
82pub struct ParseOutput<I> {
83    items: Vec<I>,
84    requests: Vec<Request>,
85}
86
87impl<I> ParseOutput<I> {
88    /// Creates a new, empty `ParseOutput`.
89    ///
90    /// Most spiders start each parse call with this and then append items and
91    /// follow-up requests as they discover them.
92    pub fn new() -> Self {
93        Self {
94            items: Vec::new(),
95            requests: Vec::new(),
96        }
97    }
98
99    /// Consumes the `ParseOutput` and returns its inner items and requests.
100    ///
101    /// This is mainly used by the runtime, but can also be handy in isolated
102    /// parsing helpers.
103    pub fn into_parts(self) -> (Vec<I>, Vec<Request>) {
104        (self.items, self.requests)
105    }
106
107    /// Adds a scraped item to the output.
108    ///
109    /// Use this when the current page produced one structured result that
110    /// should continue through the configured pipeline chain.
111    pub fn add_item(&mut self, item: I) {
112        self.items.push(item);
113    }
114
115    /// Adds a new request to be crawled.
116    ///
117    /// Requests emitted here are handed back to the scheduler after the parse
118    /// step completes.
119    pub fn add_request(&mut self, request: Request) {
120        self.requests.push(request);
121    }
122
123    /// Adds multiple scraped items to the output.
124    pub fn add_items(&mut self, items: impl IntoIterator<Item = I>) {
125        self.items.extend(items);
126    }
127
128    /// Adds multiple new requests to be crawled.
129    pub fn add_requests(&mut self, requests: impl IntoIterator<Item = Request>) {
130        self.requests.extend(requests);
131    }
132}
133
134impl<I> Default for ParseOutput<I> {
135    fn default() -> Self {
136        Self::new()
137    }
138}
139
140/// Trait implemented by item types emitted from spiders.
141///
142/// In normal application code you usually do not implement this trait by hand.
143/// Prefer annotating the item struct with `#[scraped_item]`, which wires up the
144/// required serialization and cloning behavior automatically.
145pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
146    /// Returns the item as a `dyn Any` for downcasting.
147    fn as_any(&self) -> &dyn Any;
148    /// Clones the item into a `Box<dyn ScrapedItem>`.
149    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
150    /// Converts the item to a `serde_json::Value`.
151    fn to_json_value(&self) -> Value;
152    /// Returns typed schema metadata when the item type exposes it.
153    fn item_schema(&self) -> Option<ItemSchema> {
154        None
155    }
156    /// Returns the schema version used by this item.
157    fn item_schema_version(&self) -> u32 {
158        1
159    }
160}
161
162impl Clone for Box<dyn ScrapedItem + Send + Sync> {
163    fn clone(&self) -> Self {
164        self.box_clone()
165    }
166}