spider_lib/
builder.rs

1use crate::downloader::Downloader;
2use crate::error::SpiderError;
3use crate::middleware::Middleware;
4use crate::middleware::UserAgentMiddleware;
5use crate::pipeline::ItemPipeline;
6use crate::scheduler::Scheduler;
7use crate::spider::Spider;
8use crate::ConsoleWriterPipeline;
9use num_cpus;
10use std::marker::PhantomData;
11use std::sync::Arc;
12use std::time::Duration;
13use tracing::info;
14
15use super::Engine;
16
/// Engine-level tuning knobs collected by [`EngineBuilder`].
///
/// Both values are handed to `Engine::new` by [`EngineBuilder::build`], which
/// rejects a value of `0` for either field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EngineConfig {
    /// Download concurrency limit forwarded to the engine. Must be greater than 0.
    pub max_concurrent_downloads: usize,
    /// Number of parser workers forwarded to the engine. Must be greater than 0.
    pub parser_workers: usize,
}
21
22impl Default for EngineConfig {
23    fn default() -> Self {
24        EngineConfig {
25            max_concurrent_downloads: 5,
26            parser_workers: num_cpus::get(),
27        }
28    }
29}
30
31pub struct DownloaderConfig {
32    pub request_timeout: Duration,
33    pub user_agent: Option<String>,
34    pub middlewares: Vec<Box<dyn Middleware>>,
35}
36
37impl Default for DownloaderConfig {
38    fn default() -> Self {
39        DownloaderConfig {
40            request_timeout: Duration::from_secs(30),
41            user_agent: Some(format!(
42                "{}/{}",
43                env!("CARGO_PKG_NAME"),
44                env!("CARGO_PKG_VERSION")
45            )),
46            middlewares: Vec::new(),
47        }
48    }
49}
50
51pub struct EngineBuilder<S: Spider> {
52    engine_config: EngineConfig,
53    downloader_config: DownloaderConfig,
54    spider: Option<S>,
55    item_pipelines: Vec<Box<dyn ItemPipeline<S::Item>>>,
56    _phantom: PhantomData<S>,
57}
58
59impl<S: Spider> Default for EngineBuilder<S> {
60    fn default() -> Self {
61        Self {
62            engine_config: EngineConfig::default(),
63            downloader_config: DownloaderConfig::default(),
64            spider: None,
65            item_pipelines: Vec::new(),
66            _phantom: PhantomData,
67        }
68    }
69}
70
71impl<S: Spider> EngineBuilder<S> {
72    pub fn new(spider: S) -> Self {
73        Self {
74            spider: Some(spider),
75            ..Default::default()
76        }
77    }
78
79    pub fn max_concurrent_downloads(mut self, limit: usize) -> Self {
80        self.engine_config.max_concurrent_downloads = limit;
81        self
82    }
83
84    pub fn parser_workers(mut self, limit: usize) -> Self {
85        self.engine_config.parser_workers = limit;
86        self
87    }
88
89    pub fn request_timeout(mut self, timeout: Duration) -> Self {
90        self.downloader_config.request_timeout = timeout;
91        self
92    }
93
94    pub fn user_agent(mut self, ua: &str) -> Self {
95        self.downloader_config.user_agent = Some(ua.to_string());
96        self
97    }
98
99    pub fn disable_user_agent(mut self) -> Self {
100        self.downloader_config.user_agent = None;
101        self
102    }
103
104    pub fn add_downloader_middleware(mut self, middleware: Box<dyn Middleware>) -> Self {
105        self.downloader_config.middlewares.push(middleware);
106        self
107    }
108
109    pub fn add_pipeline(mut self, pipeline: Box<dyn ItemPipeline<S::Item>>) -> Self {
110        info!("EngineBuilder: Adding pipeline: {}", pipeline.name());
111        self.item_pipelines.push(pipeline);
112        self
113    }
114
115    pub async fn build(mut self) -> Result<Engine<S>, SpiderError> {
116        if self.item_pipelines.is_empty() {
117            self = self.add_pipeline(Box::new(ConsoleWriterPipeline::new()));
118        }
119
120        let spider = self.spider.take().ok_or_else(|| {
121            SpiderError::Msg("Engine must have a spider.".to_string())
122        })?;
123
124        if self.engine_config.max_concurrent_downloads == 0 {
125            return Err(SpiderError::Msg(
126                "max_concurrent_downloads must be greater than 0.".to_string(),
127            ));
128        }
129
130        if self.engine_config.parser_workers == 0 {
131            return Err(SpiderError::Msg(
132                "parser_workers must be greater than 0.".to_string(),
133            ));
134        }
135
136        let (scheduler_arc, req_rx) = Scheduler::new();
137        let mut downloader = Downloader::new_with_timeout(self.downloader_config.request_timeout);
138
139        if let Some(ua_string) = self.downloader_config.user_agent {
140            let user_agent_mw = UserAgentMiddleware::new(&ua_string)?;
141            downloader = downloader.with_middleware(Box::new(user_agent_mw));
142        }
143
144        for middleware in self.downloader_config.middlewares {
145            downloader = downloader.with_middleware(middleware);
146        }
147        let downloader_arc = Arc::new(downloader);
148
149        let engine = Engine::new(
150            scheduler_arc,
151            req_rx,
152            downloader_arc,
153            spider,
154            self.item_pipelines,
155            self.engine_config.max_concurrent_downloads,
156            self.engine_config.parser_workers,
157        );
158
159        Ok(engine)
160    }
161}