spider_lib/pipeline.rs
1use crate::error::PipelineError;
2use crate::item::ScrapedItem;
3use async_trait::async_trait;
4use serde_json::Value;
5
6/// The `Pipeline` trait defines the contract for item processing pipelines.
7///
8/// Pipelines are responsible for processing scraped items, such as storing them in a database,
9/// writing them to a file, or performing data validation.
10#[async_trait]
11pub trait Pipeline<I: ScrapedItem>: Send + Sync + 'static {
12 /// Returns the name of the pipeline.
13 fn name(&self) -> &str;
14
15 /// Processes a single scraped item.
16 ///
17 /// This method can perform any processing on the item, such as storing it, validating it,
18 /// or passing it to another pipeline. It can also choose to drop the item by returning `Ok(None)`.
19 async fn process_item(&self, item: I) -> Result<Option<I>, PipelineError>;
20
21 /// Called when the spider is closing.
22 ///
23 /// This method can be used to perform any cleanup tasks, such as closing file handles or
24 /// database connections.
25 async fn close(&self) -> Result<(), PipelineError> {
26 Ok(())
27 }
28
29 /// Returns the current state of the pipeline as a JSON value.
30 ///
31 /// This method is called during checkpointing to save the pipeline's state.
32 /// The returned state should be sufficient to restore the pipeline to its current
33 /// state using `restore_state`.
34 async fn get_state(&self) -> Result<Option<Value>, PipelineError> {
35 Ok(None)
36 }
37
38 /// Restores the pipeline's state from a JSON value.
39 ///
40 /// This method is called when resuming from a checkpoint. The provided state
41 /// should be used to restore the pipeline to the state it was in when the
42 /// checkpoint was created.
43 async fn restore_state(&self, _state: Value) -> Result<(), PipelineError> {
44 Ok(())
45 }
46}