Skip to main content

spider_lib/
pipeline.rs

use crate::error::PipelineError;
use crate::item::ScrapedItem;
use async_trait::async_trait;
use serde_json::Value;
5
6/// The `Pipeline` trait defines the contract for item processing pipelines.
7///
8/// Pipelines are responsible for processing scraped items, such as storing them in a database,
9/// writing them to a file, or performing data validation.
10#[async_trait]
11pub trait Pipeline<I: ScrapedItem>: Send + Sync + 'static {
12    /// Returns the name of the pipeline.
13    fn name(&self) -> &str;
14
15    /// Processes a single scraped item.
16    ///
17    /// This method can perform any processing on the item, such as storing it, validating it,
18    /// or passing it to another pipeline. It can also choose to drop the item by returning `Ok(None)`.
19    async fn process_item(&self, item: I) -> Result<Option<I>, PipelineError>;
20
21    /// Called when the spider is closing.
22    ///
23    /// This method can be used to perform any cleanup tasks, such as closing file handles or
24    /// database connections.
25    async fn close(&self) -> Result<(), PipelineError> {
26        Ok(())
27    }
28
29    /// Returns the current state of the pipeline as a JSON value.
30    ///
31    /// This method is called during checkpointing to save the pipeline's state.
32    /// The returned state should be sufficient to restore the pipeline to its current
33    /// state using `restore_state`.
34    async fn get_state(&self) -> Result<Option<Value>, PipelineError> {
35        Ok(None)
36    }
37
38    /// Restores the pipeline's state from a JSON value.
39    ///
40    /// This method is called when resuming from a checkpoint. The provided state
41    /// should be used to restore the pipeline to the state it was in when the
42    /// checkpoint was created.
43    async fn restore_state(&self, _state: Value) -> Result<(), PipelineError> {
44        Ok(())
45    }
46}