Skip to main content

spider_util/
item.rs

//! Item traits and parse results.
//!
//! [`ParseOutput`] is what a spider returns after parsing a page. It carries the
//! emitted items plus any follow-up requests discovered on that page.
//!
//! ## Example
//!
//! ```rust,ignore
//! use spider_util::item::{ScrapedItem, ParseOutput};
//!
//! #[spider_macro::scraped_item]
//! struct Article {
//!     title: String,
//!     content: String,
//! }
//!
//! // In your spider's parse method:
//! // let mut output = ParseOutput::new();
//! // output.add_item(Article { title: "...".into(), content: "...".into() });
//! // Ok(output)
//! ```
//!
//! `ParseOutput` is intentionally small: it is just the handoff object between
//! parsing and the rest of the runtime. Use it to emit items, schedule new
//! requests, or both from the same page.
26
27use crate::request::Request;
28use serde_json::Value;
29use std::any::Any;
30use std::fmt::Debug;
31
32/// The output returned by a spider's `parse` method.
33#[derive(Debug, Clone)]
34pub struct ParseOutput<I> {
35    items: Vec<I>,
36    requests: Vec<Request>,
37}
38
39impl<I> ParseOutput<I> {
40    /// Creates a new, empty `ParseOutput`.
41    ///
42    /// Most spiders start each parse call with this and then append items and
43    /// follow-up requests as they discover them.
44    pub fn new() -> Self {
45        Self {
46            items: Vec::new(),
47            requests: Vec::new(),
48        }
49    }
50
51    /// Consumes the `ParseOutput` and returns its inner items and requests.
52    ///
53    /// This is mainly used by the runtime, but can also be handy in isolated
54    /// parsing helpers.
55    pub fn into_parts(self) -> (Vec<I>, Vec<Request>) {
56        (self.items, self.requests)
57    }
58
59    /// Adds a scraped item to the output.
60    ///
61    /// Use this when the current page produced one structured result that
62    /// should continue through the configured pipeline chain.
63    pub fn add_item(&mut self, item: I) {
64        self.items.push(item);
65    }
66
67    /// Adds a new request to be crawled.
68    ///
69    /// Requests emitted here are handed back to the scheduler after the parse
70    /// step completes.
71    pub fn add_request(&mut self, request: Request) {
72        self.requests.push(request);
73    }
74
75    /// Adds multiple scraped items to the output.
76    pub fn add_items(&mut self, items: impl IntoIterator<Item = I>) {
77        self.items.extend(items);
78    }
79
80    /// Adds multiple new requests to be crawled.
81    pub fn add_requests(&mut self, requests: impl IntoIterator<Item = Request>) {
82        self.requests.extend(requests);
83    }
84}
85
86impl<I> Default for ParseOutput<I> {
87    fn default() -> Self {
88        Self::new()
89    }
90}
91
92/// Trait implemented by item types emitted from spiders.
93///
94/// In normal application code you usually do not implement this trait by hand.
95/// Prefer annotating the item struct with `#[scraped_item]`, which wires up the
96/// required serialization and cloning behavior automatically.
97pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
98    /// Returns the item as a `dyn Any` for downcasting.
99    fn as_any(&self) -> &dyn Any;
100    /// Clones the item into a `Box<dyn ScrapedItem>`.
101    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
102    /// Converts the item to a `serde_json::Value`.
103    fn to_json_value(&self) -> Value;
104}
105
106impl Clone for Box<dyn ScrapedItem + Send + Sync> {
107    fn clone(&self) -> Self {
108        self.box_clone()
109    }
110}