spider_lib/item.rs
1//! Data structures for scraped items and spider output in `spider-lib`.
2//!
3//! This module defines the `ScrapedItem` trait, which is the core abstraction
4//! for any data extracted by a web spider. Implementors of this trait define
5//! the shape of the data they wish to collect.
6//!
7//! Additionally, the `ParseOutput` struct is provided as the standard return type
8//! for a spider's `parse` method. It encapsulates both the `ScrapedItem`s
9//! found on a page and any new `Request`s that should be scheduled for crawling.
10//! This allows spiders to not only extract data but also to discover and
11//! follow new links within the same processing step.
12
13use crate::request::Request;
14use serde_json::Value;
15use std::any::Any;
16use std::fmt::Debug;
17
18/// The output of a spider's `parse` method.
19#[derive(Debug, Clone)]
20pub struct ParseOutput<I> {
21 items: Vec<I>,
22 requests: Vec<Request>,
23}
24
25impl<I> ParseOutput<I> {
26 /// Creates a new, empty `ParseOutput`.
27 pub fn new() -> Self {
28 Self {
29 items: Vec::new(),
30 requests: Vec::new(),
31 }
32 }
33
34 /// Consumes the `ParseOutput` and returns its inner items and requests.
35 pub fn into_parts(self) -> (Vec<I>, Vec<Request>) {
36 (self.items, self.requests)
37 }
38
39 /// Adds a scraped item to the output.
40 pub fn add_item(&mut self, item: I) {
41 self.items.push(item);
42 }
43
44 /// Adds a new request to be crawled.
45 pub fn add_request(&mut self, request: Request) {
46 self.requests.push(request);
47 }
48
49 /// Adds multiple scraped items to the output.
50 pub fn add_items(&mut self, items: impl IntoIterator<Item = I>) {
51 self.items.extend(items);
52 }
53
54 /// Adds multiple new requests to be crawled.
55 pub fn add_requests(&mut self, requests: impl IntoIterator<Item = Request>) {
56 self.requests.extend(requests);
57 }
58}
59
60impl<I> Default for ParseOutput<I> {
61 fn default() -> Self {
62 Self::new()
63 }
64}
65
66/// A trait representing a scraped item.
67pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
68 /// Returns the item as a `dyn Any` for downcasting.
69 fn as_any(&self) -> &dyn Any;
70 /// Clones the item into a `Box<dyn ScrapedItem>`.
71 fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
72 /// Converts the item to a `serde_json::Value`.
73 fn to_json_value(&self) -> Value;
74}
75
76impl Clone for Box<dyn ScrapedItem + Send + Sync> {
77 fn clone(&self) -> Self {
78 self.box_clone()
79 }
80}