1use crate::downloader::Downloader;
2use crate::error::SpiderError;
3use crate::middleware::Middleware;
4use crate::middleware::UserAgentMiddleware;
5use crate::pipeline::ItemPipeline;
6use crate::scheduler::Scheduler;
7use crate::spider::Spider;
8use crate::ConsoleWriterPipeline;
9use num_cpus;
10use std::marker::PhantomData;
11use std::sync::Arc;
12use std::time::Duration;
13use tracing::info;
14
15use super::Engine;
16
/// Numeric limits handed to the engine when it is constructed.
///
/// Both values are validated by `EngineBuilder::build`, which rejects `0`
/// for either field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EngineConfig {
    /// Download concurrency setting forwarded to the engine; must be non-zero
    /// for `EngineBuilder::build` to succeed.
    pub max_concurrent_downloads: usize,
    /// Parser worker count forwarded to the engine; must be non-zero for
    /// `EngineBuilder::build` to succeed.
    pub parser_workers: usize,
}
21
22impl Default for EngineConfig {
23 fn default() -> Self {
24 EngineConfig {
25 max_concurrent_downloads: 5,
26 parser_workers: num_cpus::get(),
27 }
28 }
29}
30
/// Settings used by `EngineBuilder::build` to assemble the downloader.
pub struct DownloaderConfig {
    /// Timeout passed to `Downloader::new_with_timeout`.
    pub request_timeout: Duration,
    /// User-agent string. When `Some`, `EngineBuilder::build` wraps it in a
    /// `UserAgentMiddleware` and installs that before any entry of
    /// `middlewares`; when `None`, no such middleware is installed.
    pub user_agent: Option<String>,
    /// Extra middlewares, handed to the downloader in insertion order.
    pub middlewares: Vec<Box<dyn Middleware>>,
}
36
37impl Default for DownloaderConfig {
38 fn default() -> Self {
39 DownloaderConfig {
40 request_timeout: Duration::from_secs(30),
41 user_agent: Some(format!(
42 "{}/{}",
43 env!("CARGO_PKG_NAME"),
44 env!("CARGO_PKG_VERSION")
45 )),
46 middlewares: Vec::new(),
47 }
48 }
49}
50
51pub struct EngineBuilder<S: Spider> {
52 engine_config: EngineConfig,
53 downloader_config: DownloaderConfig,
54 spider: Option<S>,
55 item_pipelines: Vec<Box<dyn ItemPipeline<S::Item>>>,
56 _phantom: PhantomData<S>,
57}
58
59impl<S: Spider> Default for EngineBuilder<S> {
60 fn default() -> Self {
61 Self {
62 engine_config: EngineConfig::default(),
63 downloader_config: DownloaderConfig::default(),
64 spider: None,
65 item_pipelines: Vec::new(),
66 _phantom: PhantomData,
67 }
68 }
69}
70
71impl<S: Spider> EngineBuilder<S> {
72 pub fn new(spider: S) -> Self {
73 Self {
74 spider: Some(spider),
75 ..Default::default()
76 }
77 }
78
79 pub fn max_concurrent_downloads(mut self, limit: usize) -> Self {
80 self.engine_config.max_concurrent_downloads = limit;
81 self
82 }
83
84 pub fn parser_workers(mut self, limit: usize) -> Self {
85 self.engine_config.parser_workers = limit;
86 self
87 }
88
89 pub fn request_timeout(mut self, timeout: Duration) -> Self {
90 self.downloader_config.request_timeout = timeout;
91 self
92 }
93
94 pub fn user_agent(mut self, ua: &str) -> Self {
95 self.downloader_config.user_agent = Some(ua.to_string());
96 self
97 }
98
99 pub fn disable_user_agent(mut self) -> Self {
100 self.downloader_config.user_agent = None;
101 self
102 }
103
104 pub fn add_downloader_middleware(mut self, middleware: Box<dyn Middleware>) -> Self {
105 self.downloader_config.middlewares.push(middleware);
106 self
107 }
108
109 pub fn add_pipeline(mut self, pipeline: Box<dyn ItemPipeline<S::Item>>) -> Self {
110 info!("EngineBuilder: Adding pipeline: {}", pipeline.name());
111 self.item_pipelines.push(pipeline);
112 self
113 }
114
115 pub async fn build(mut self) -> Result<Engine<S>, SpiderError> {
116 if self.item_pipelines.is_empty() {
117 self = self.add_pipeline(Box::new(ConsoleWriterPipeline::new()));
118 }
119
120 let spider = self.spider.take().ok_or_else(|| {
121 SpiderError::Msg("Engine must have a spider.".to_string())
122 })?;
123
124 if self.engine_config.max_concurrent_downloads == 0 {
125 return Err(SpiderError::Msg(
126 "max_concurrent_downloads must be greater than 0.".to_string(),
127 ));
128 }
129
130 if self.engine_config.parser_workers == 0 {
131 return Err(SpiderError::Msg(
132 "parser_workers must be greater than 0.".to_string(),
133 ));
134 }
135
136 let (scheduler_arc, req_rx) = Scheduler::new();
137 let mut downloader = Downloader::new_with_timeout(self.downloader_config.request_timeout);
138
139 if let Some(ua_string) = self.downloader_config.user_agent {
140 let user_agent_mw = UserAgentMiddleware::new(&ua_string)?;
141 downloader = downloader.with_middleware(Box::new(user_agent_mw));
142 }
143
144 for middleware in self.downloader_config.middlewares {
145 downloader = downloader.with_middleware(middleware);
146 }
147 let downloader_arc = Arc::new(downloader);
148
149 let engine = Engine::new(
150 scheduler_arc,
151 req_rx,
152 downloader_arc,
153 spider,
154 self.item_pipelines,
155 self.engine_config.max_concurrent_downloads,
156 self.engine_config.parser_workers,
157 );
158
159 Ok(engine)
160 }
161}