spider_util/item.rs
1//! Item traits and parse results.
2//!
3//! [`ParseOutput`] is what a spider returns after parsing a page. It carries the
4//! emitted items plus any follow-up requests discovered on that page.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
9//! use spider_util::item::{ScrapedItem, ParseOutput};
10//!
11//! #[spider_macro::scraped_item]
//! struct Article {
//!     title: String,
//!     content: String,
//! }
//!
//! // In your spider's parse method:
//! // let mut output = ParseOutput::new();
//! // output.add_item(Article { title: "...".into(), content: "...".into() });
20//! // Ok(output)
21//! ```
22//!
23//! `ParseOutput` is intentionally small: it is just the handoff object between
24//! parsing and the rest of the runtime. Use it to emit items, schedule new
25//! requests, or both from the same page.
26
27use crate::request::Request;
28use serde_json::Value;
29use std::any::Any;
30use std::fmt::Debug;
31
32/// Stable field kinds used by typed item schema metadata.
33#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
34pub enum FieldValueType {
35 Bool,
36 Integer,
37 Float,
38 String,
39 Json,
40 Sequence,
41 Map,
42 Unknown,
43}
44
45/// Static schema metadata for a single item field.
46#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
47pub struct ItemFieldSchema {
48 pub name: String,
49 pub rust_type: String,
50 pub value_type: FieldValueType,
51 pub nullable: bool,
52}
53
54/// Static schema metadata for a scraped item type.
55#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
56pub struct ItemSchema {
57 pub item_name: String,
58 pub version: u32,
59 pub fields: Vec<ItemFieldSchema>,
60}
61
62impl ItemSchema {
63 /// Returns the fields in their declared order.
64 pub fn fields(&self) -> &[ItemFieldSchema] {
65 &self.fields
66 }
67}
68
69/// Trait for typed item definitions that can expose static schema metadata.
70pub trait TypedItemSchema {
71 /// Returns the typed schema for the item.
72 fn schema() -> ItemSchema;
73
74 /// Returns the schema version used by the item.
75 fn schema_version() -> u32 {
76 1
77 }
78}
79
80/// The output returned by a spider's `parse` method.
81#[derive(Debug, Clone)]
82pub struct ParseOutput<I> {
83 items: Vec<I>,
84 requests: Vec<Request>,
85}
86
87impl<I> ParseOutput<I> {
88 /// Creates a new, empty `ParseOutput`.
89 ///
90 /// Most spiders start each parse call with this and then append items and
91 /// follow-up requests as they discover them.
92 pub fn new() -> Self {
93 Self {
94 items: Vec::new(),
95 requests: Vec::new(),
96 }
97 }
98
99 /// Consumes the `ParseOutput` and returns its inner items and requests.
100 ///
101 /// This is mainly used by the runtime, but can also be handy in isolated
102 /// parsing helpers.
103 pub fn into_parts(self) -> (Vec<I>, Vec<Request>) {
104 (self.items, self.requests)
105 }
106
107 /// Adds a scraped item to the output.
108 ///
109 /// Use this when the current page produced one structured result that
110 /// should continue through the configured pipeline chain.
111 pub fn add_item(&mut self, item: I) {
112 self.items.push(item);
113 }
114
115 /// Adds a new request to be crawled.
116 ///
117 /// Requests emitted here are handed back to the scheduler after the parse
118 /// step completes.
119 pub fn add_request(&mut self, request: Request) {
120 self.requests.push(request);
121 }
122
123 /// Adds multiple scraped items to the output.
124 pub fn add_items(&mut self, items: impl IntoIterator<Item = I>) {
125 self.items.extend(items);
126 }
127
128 /// Adds multiple new requests to be crawled.
129 pub fn add_requests(&mut self, requests: impl IntoIterator<Item = Request>) {
130 self.requests.extend(requests);
131 }
132}
133
134impl<I> Default for ParseOutput<I> {
135 fn default() -> Self {
136 Self::new()
137 }
138}
139
140/// Trait implemented by item types emitted from spiders.
141///
142/// In normal application code you usually do not implement this trait by hand.
143/// Prefer annotating the item struct with `#[scraped_item]`, which wires up the
144/// required serialization and cloning behavior automatically.
145pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
146 /// Returns the item as a `dyn Any` for downcasting.
147 fn as_any(&self) -> &dyn Any;
148 /// Clones the item into a `Box<dyn ScrapedItem>`.
149 fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
150 /// Converts the item to a `serde_json::Value`.
151 fn to_json_value(&self) -> Value;
152 /// Returns typed schema metadata when the item type exposes it.
153 fn item_schema(&self) -> Option<ItemSchema> {
154 None
155 }
156 /// Returns the schema version used by this item.
157 fn item_schema_version(&self) -> u32 {
158 1
159 }
160}
161
162impl Clone for Box<dyn ScrapedItem + Send + Sync> {
163 fn clone(&self) -> Self {
164 self.box_clone()
165 }
166}