spider_util/item.rs
//! Item traits and parse results.
//!
//! [`ParseOutput`] is what a spider returns after parsing a page. It carries the
//! emitted items plus any follow-up requests discovered on that page.
//!
//! ## Example
//!
//! ```rust,ignore
//! use spider_util::item::{ScrapedItem, ParseOutput};
//!
//! #[spider_macro::scraped_item]
//! struct Article {
//!     title: String,
//!     content: String,
//! }
//!
//! // In your spider's parse method:
//! // let mut output = ParseOutput::new();
//! // output.add_item(Article { title: "...".into(), content: "...".into() });
//! // Ok(output)
//! ```
//!
//! `ParseOutput` is intentionally small: it is just the handoff object between
//! parsing and the rest of the runtime. Use it to emit items, schedule new
//! requests, or both from the same page.
26
27use crate::request::Request;
28use serde_json::Value;
29use std::any::Any;
30use std::fmt::Debug;
31
32/// The output returned by a spider's `parse` method.
33#[derive(Debug, Clone)]
34pub struct ParseOutput<I> {
35 items: Vec<I>,
36 requests: Vec<Request>,
37}
38
39impl<I> ParseOutput<I> {
40 /// Creates a new, empty `ParseOutput`.
41 ///
42 /// Most spiders start each parse call with this and then append items and
43 /// follow-up requests as they discover them.
44 pub fn new() -> Self {
45 Self {
46 items: Vec::new(),
47 requests: Vec::new(),
48 }
49 }
50
51 /// Consumes the `ParseOutput` and returns its inner items and requests.
52 ///
53 /// This is mainly used by the runtime, but can also be handy in isolated
54 /// parsing helpers.
55 pub fn into_parts(self) -> (Vec<I>, Vec<Request>) {
56 (self.items, self.requests)
57 }
58
59 /// Adds a scraped item to the output.
60 ///
61 /// Use this when the current page produced one structured result that
62 /// should continue through the configured pipeline chain.
63 pub fn add_item(&mut self, item: I) {
64 self.items.push(item);
65 }
66
67 /// Adds a new request to be crawled.
68 ///
69 /// Requests emitted here are handed back to the scheduler after the parse
70 /// step completes.
71 pub fn add_request(&mut self, request: Request) {
72 self.requests.push(request);
73 }
74
75 /// Adds multiple scraped items to the output.
76 pub fn add_items(&mut self, items: impl IntoIterator<Item = I>) {
77 self.items.extend(items);
78 }
79
80 /// Adds multiple new requests to be crawled.
81 pub fn add_requests(&mut self, requests: impl IntoIterator<Item = Request>) {
82 self.requests.extend(requests);
83 }
84}
85
86impl<I> Default for ParseOutput<I> {
87 fn default() -> Self {
88 Self::new()
89 }
90}
91
92/// Trait implemented by item types emitted from spiders.
93///
94/// In normal application code you usually do not implement this trait by hand.
95/// Prefer annotating the item struct with `#[scraped_item]`, which wires up the
96/// required serialization and cloning behavior automatically.
97pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
98 /// Returns the item as a `dyn Any` for downcasting.
99 fn as_any(&self) -> &dyn Any;
100 /// Clones the item into a `Box<dyn ScrapedItem>`.
101 fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
102 /// Converts the item to a `serde_json::Value`.
103 fn to_json_value(&self) -> Value;
104}
105
106impl Clone for Box<dyn ScrapedItem + Send + Sync> {
107 fn clone(&self) -> Self {
108 self.box_clone()
109 }
110}