Skip to main content

spider_core/engine/
context.rs

1//! Aggregated context shared across all crawler tasks.
2//!
3//! This module provides the `CrawlerContext` struct, which bundles together
4//! all the shared state that needs to be passed between different crawler tasks.
5//! By wrapping this context in an Arc, we can clone it cheaply (just incrementing
6//! the reference count) instead of cloning each individual component.
7//!
8//! ## Benefits
9//!
10//! - **Reduced cloning overhead**: Single Arc::clone() instead of 5+ individual clones
11//! - **Better code organization**: Related state is grouped together
12//! - **Easier refactoring**: Adding new shared state only requires one change
13
14use crate::{Scheduler, spider::Spider, stats::StatCollector};
15use spider_pipeline::pipeline::Pipeline;
16use spider_util::item::ScrapedItem;
17use std::sync::Arc;
18
19/// Inner data shared across all crawler tasks.
20///
21/// This struct contains all the Arc-wrapped components that need to be
22/// shared between the crawler's various async tasks.
23pub struct CrawlerContextInner<S, I>
24where
25    S: Spider<Item = I>,
26    I: ScrapedItem,
27{
28    pub scheduler: Arc<Scheduler>,
29    pub stats: Arc<StatCollector>,
30    pub spider: Arc<S>,
31    pub spider_state: Arc<S::State>,
32    pub pipelines: Arc<Vec<Box<dyn Pipeline<I>>>>,
33}
34
35/// Aggregated context shared across all crawler tasks.
36///
37/// This struct wraps CrawlerContextInner in a single Arc, allowing
38/// efficient cloning with just one atomic reference count operation.
39pub struct CrawlerContext<S, I>(pub Arc<CrawlerContextInner<S, I>>)
40where
41    S: Spider<Item = I>,
42    I: ScrapedItem;
43
44impl<S, I> Clone for CrawlerContext<S, I>
45where
46    S: Spider<Item = I>,
47    I: ScrapedItem,
48{
49    fn clone(&self) -> Self {
50        CrawlerContext(Arc::clone(&self.0))
51    }
52}
53
54impl<S, I> CrawlerContext<S, I>
55where
56    S: Spider<Item = I>,
57    I: ScrapedItem,
58{
59    /// Creates a new CrawlerContext with the given components.
60    pub fn new(
61        scheduler: Arc<Scheduler>,
62        stats: Arc<StatCollector>,
63        spider: Arc<S>,
64        spider_state: Arc<S::State>,
65        pipelines: Arc<Vec<Box<dyn Pipeline<I>>>>,
66    ) -> Self {
67        CrawlerContext(Arc::new(CrawlerContextInner {
68            scheduler,
69            stats,
70            spider,
71            spider_state,
72            pipelines,
73        }))
74    }
75
76    /// Creates a CrawlerContext from a Crawler instance.
77    pub fn from_crawler(
78        scheduler: Arc<Scheduler>,
79        stats: Arc<StatCollector>,
80        spider: Arc<S>,
81        spider_state: Arc<S::State>,
82        pipelines: Arc<Vec<Box<dyn Pipeline<I>>>>,
83    ) -> Self {
84        Self::new(scheduler, stats, spider, spider_state, pipelines)
85    }
86}
87
88// Implement Deref for convenient access to inner fields
89impl<S, I> std::ops::Deref for CrawlerContext<S, I>
90where
91    S: Spider<Item = I>,
92    I: ScrapedItem,
93{
94    type Target = CrawlerContextInner<S, I>;
95
96    fn deref(&self) -> &Self::Target {
97        &self.0
98    }
99}