// spider_core/builder.rs
1//! Builder API for assembling a [`Crawler`].
2//!
3//! [`CrawlerBuilder`] is where runtime composition happens: concurrency,
4//! downloader selection, middleware, pipelines, item limits, logging, and
5//! optional checkpointing all start here.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_core::CrawlerBuilder;
11//! use spider_middleware::rate_limit::RateLimitMiddleware;
12//! use spider_pipeline::console::ConsolePipeline;
13//! use spider_util::error::SpiderError;
14//!
15//! async fn setup_crawler() -> Result<(), SpiderError> {
16//! let crawler = CrawlerBuilder::new(MySpider)
17//! .max_concurrent_downloads(10)
18//! .max_parser_workers(4)
19//! .add_middleware(RateLimitMiddleware::default())
20//! .add_pipeline(ConsolePipeline::new())
21//! .with_checkpoint_path("./crawl.checkpoint")
22//! .build()
23//! .await?;
24//!
25//! crawler.start_crawl().await
26//! }
27//! ```
28
29use crate::Downloader;
30use crate::ReqwestClientDownloader;
31use crate::config::{
32 CheckpointConfig, CrawlShapePreset, CrawlerConfig, DiscoveryConfig, DiscoveryMode,
33 DiscoveryRule,
34};
35use crate::scheduler::Scheduler;
36use crate::spider::Spider;
37use spider_middleware::middleware::Middleware;
38use spider_pipeline::pipeline::Pipeline;
39use spider_util::response::{LinkExtractOptions, LinkType};
40
#[cfg(feature = "checkpoint")]
/// Result of [`CrawlerBuilder::restore_checkpoint`]: the restored scheduler
/// frontier (if a checkpoint was found) and the serialized per-pipeline
/// states keyed by pipeline name (if any were saved).
type RestoreResult = (
    Option<crate::SchedulerCheckpoint>,
    Option<std::collections::HashMap<String, serde_json::Value>>,
);
46use spider_util::error::SpiderError;
47use std::marker::PhantomData;
48use std::path::Path;
49use std::sync::Arc;
50use std::time::Duration;
51
52use super::Crawler;
53use crate::stats::StatCollector;
54use log::LevelFilter;
55#[cfg(feature = "checkpoint")]
56use log::{info, warn};
57
58#[cfg(feature = "checkpoint")]
59use rmp_serde;
60#[cfg(feature = "checkpoint")]
61use std::fs;
62
/// A fluent builder for constructing [`Crawler`] instances.
///
/// ## Type Parameters
///
/// - `S`: The [`Spider`] implementation type
/// - `D`: The [`Downloader`] implementation type
///
/// ## Example
///
/// ```rust,ignore
/// # use spider_core::{CrawlerBuilder, Spider};
/// # use spider_util::{response::Response, error::SpiderError, item::ParseOutput};
/// # struct MySpider;
/// # #[async_trait::async_trait]
/// # impl Spider for MySpider {
/// #     type Item = String;
/// #     type State = ();
/// #     fn start_requests(&self) -> Result<spider_core::spider::StartRequests<'_>, SpiderError> {
/// #         Ok(spider_core::spider::StartRequests::iter(std::iter::empty()))
/// #     }
/// #     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> { todo!() }
/// # }
/// let builder = CrawlerBuilder::new(MySpider)
///     .max_concurrent_downloads(8)
///     .max_pending_requests(16)
///     .max_parser_workers(4);
/// ```
pub struct CrawlerBuilder<S: Spider, D>
where
    D: Downloader,
{
    /// Runtime configuration: concurrency, channels, discovery, live stats, …
    config: CrawlerConfig,
    /// Checkpoint path/interval; only consulted when the `checkpoint` feature is on.
    checkpoint_config: CheckpointConfig,
    /// Transport used to execute HTTP requests.
    downloader: D,
    /// Spider supplied via `new`; taken out (after validation) by `build`.
    spider: Option<S>,
    /// Request/response middlewares, run in insertion order.
    middlewares: Vec<Box<dyn Middleware<D::Client> + Send + Sync>>,
    /// Item pipelines, run in insertion order.
    pipelines: Vec<Box<dyn Pipeline<S::Item>>>,
    /// Log level for spider-* crates; `None` leaves logging untouched.
    log_level: Option<LevelFilter>,
    // NOTE(review): `S` already appears in `spider` and `pipelines`, so this
    // marker looks redundant — confirm before removing (kept for compatibility).
    _phantom: PhantomData<S>,
}
103
104impl<S: Spider> Default for CrawlerBuilder<S, ReqwestClientDownloader> {
105 fn default() -> Self {
106 Self {
107 config: CrawlerConfig::default(),
108 checkpoint_config: CheckpointConfig::default(),
109 downloader: ReqwestClientDownloader::default(),
110 spider: None,
111 middlewares: Vec::new(),
112 pipelines: Vec::new(),
113 log_level: None,
114 _phantom: PhantomData,
115 }
116 }
117}
118
119impl<S: Spider> CrawlerBuilder<S, ReqwestClientDownloader> {
120 /// Creates a new `CrawlerBuilder` for a given spider with the default [`ReqwestClientDownloader`].
121 ///
122 /// ## Example
123 ///
124 /// ```rust,ignore
125 /// let crawler = CrawlerBuilder::new(MySpider)
126 /// .build()
127 /// .await?;
128 /// ```
129 pub fn new(spider: S) -> Self {
130 Self {
131 spider: Some(spider),
132 ..Default::default()
133 }
134 }
135
136 /// Enables or disables balanced browser-like default headers for the built-in reqwest downloader.
137 pub fn browser_like_headers(mut self, enabled: bool) -> Self {
138 self.config.browser_like_headers = enabled;
139 self.downloader = self.downloader.with_browser_like_headers(enabled);
140 self
141 }
142}
143
144impl<S: Spider, D: Downloader> CrawlerBuilder<S, D> {
145 #[cfg(feature = "checkpoint")]
146 fn load_checkpoint_from_path(&self, path: &std::path::Path) -> Option<crate::Checkpoint> {
147 match fs::read(path) {
148 Ok(bytes) => match rmp_serde::from_slice::<crate::Checkpoint>(&bytes) {
149 Ok(checkpoint) => Some(checkpoint),
150 Err(e) => {
151 warn!("Failed to deserialize checkpoint from {:?}: {}", path, e);
152 None
153 }
154 },
155 Err(e) => {
156 warn!("Failed to read checkpoint file {:?}: {}", path, e);
157 None
158 }
159 }
160 }
161
    /// Sets the maximum number of concurrent downloads.
    ///
    /// This controls how many HTTP requests can be in-flight simultaneously.
    /// Higher values increase throughput but may overwhelm target servers.
    ///
    /// ## Default
    ///
    /// Defaults to twice the number of CPU cores, clamped between 4 and 64.
    pub fn max_concurrent_downloads(mut self, limit: usize) -> Self {
        self.config.max_concurrent_downloads = limit;
        self
    }

    /// Applies guided concurrency defaults for a common crawl shape.
    ///
    /// NOTE(review): this rebuilds `config` via `with_crawl_shape_preset` —
    /// confirm whether it overrides fields set earlier; if so, call it before
    /// the fine-grained setters you want to keep.
    pub fn crawl_shape_preset(mut self, preset: CrawlShapePreset) -> Self {
        self.config = self.config.with_crawl_shape_preset(preset);
        self
    }

    /// Sets the maximum number of outstanding requests tracked by the scheduler.
    ///
    /// This includes queued requests plus requests already handed off for download.
    /// Lower values keep the frontier tighter and reduce internal request buildup.
    pub fn max_pending_requests(mut self, limit: usize) -> Self {
        self.config.max_pending_requests = limit;
        self
    }

    /// Sets the number of worker tasks dedicated to parsing responses.
    ///
    /// Parser workers process HTTP responses concurrently, calling the
    /// spider's [`parse`](Spider::parse) method to extract items and
    /// discover new URLs.
    ///
    /// ## Default
    ///
    /// Defaults to the number of CPU cores, clamped between 4 and 16.
    pub fn max_parser_workers(mut self, limit: usize) -> Self {
        // Config field is named `parser_workers`; the setter keeps the
        // public `max_*` naming used by the other concurrency knobs.
        self.config.parser_workers = limit;
        self
    }

    /// Sets the maximum number of concurrent item processing pipelines.
    ///
    /// This controls how many items can be processed by pipelines simultaneously.
    ///
    /// ## Default
    ///
    /// Defaults to the number of CPU cores, with a maximum of 8.
    pub fn max_concurrent_pipelines(mut self, limit: usize) -> Self {
        self.config.max_concurrent_pipelines = limit;
        self
    }

    /// Sets the capacity of internal communication channels.
    ///
    /// This controls the buffer size for channels between the downloader,
    /// parser, and pipeline components. Higher values can improve throughput
    /// at the cost of increased memory usage.
    ///
    /// ## Default
    ///
    /// Defaults to 1000.
    pub fn channel_capacity(mut self, capacity: usize) -> Self {
        self.config.channel_capacity = capacity;
        self
    }

    /// Sets the parser output batch size.
    ///
    /// Larger batches can reduce coordination overhead when pages emit many
    /// items or follow-up requests, while smaller batches tend to improve
    /// latency and memory locality.
    pub fn output_batch_size(mut self, batch_size: usize) -> Self {
        self.config.output_batch_size = batch_size;
        self
    }

    /// Sets the downloader response-channel backpressure threshold.
    ///
    /// When the downloader-to-parser channel reaches this threshold, the
    /// runtime starts applying backpressure so downloaded responses do not pile
    /// up unboundedly in memory.
    pub fn response_backpressure_threshold(mut self, threshold: usize) -> Self {
        self.config.response_backpressure_threshold = threshold;
        self
    }

    /// Sets the parser item-channel backpressure threshold.
    ///
    /// This primarily matters when parsing is faster than downstream pipeline
    /// processing. Lower thresholds keep memory tighter; higher thresholds let
    /// parsers run further ahead.
    pub fn item_backpressure_threshold(mut self, threshold: usize) -> Self {
        self.config.item_backpressure_threshold = threshold;
        self
    }

    /// Controls whether retries release downloader permits before waiting.
    ///
    /// Enabling this is usually better for throughput because a sleeping retry
    /// does not occupy scarce downloader concurrency. Disabling it can be
    /// useful when you want retries to count fully against download capacity.
    pub fn retry_release_permit(mut self, enabled: bool) -> Self {
        self.config.retry_release_permit = enabled;
        self
    }

    /// Enables or disables live, in-place statistics updates on terminal stdout.
    ///
    /// When enabled, spider-* logs are forced to `LevelFilter::Off` during build
    /// to avoid interleaving with the live terminal renderer.
    pub fn live_stats(mut self, enabled: bool) -> Self {
        self.config.live_stats = enabled;
        self
    }

    /// Sets the refresh interval for live statistics updates.
    ///
    /// Shorter intervals make the terminal view feel more responsive, while
    /// longer intervals reduce redraw overhead.
    pub fn live_stats_interval(mut self, interval: Duration) -> Self {
        self.config.live_stats_interval = interval;
        self
    }

    /// Sets which scraped item fields should be shown in live stats preview.
    ///
    /// Field names support dot notation for nested JSON objects such as
    /// `title`, `source_url`, or `metadata.Japanese`.
    ///
    /// You can also set aliases with `label=path`, for example
    /// `url=source_url` or `jp=metadata.Japanese`.
    pub fn live_stats_preview_fields(
        mut self,
        fields: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.live_stats_preview_fields = Some(fields.into_iter().map(Into::into).collect());
        self
    }

    /// Sets the maximum grace period for crawler shutdown before forcing task abort.
    ///
    /// This gives pipelines, checkpoint writes, and other in-flight work time
    /// to finish cleanly after shutdown begins.
    pub fn shutdown_grace_period(mut self, grace_period: Duration) -> Self {
        self.config.shutdown_grace_period = grace_period;
        self
    }

    /// Stops the crawl after `limit` scraped items have been admitted for processing.
    ///
    /// This is especially useful for smoke runs, local previews, and
    /// documentation examples where you want predictable bounded work.
    pub fn limit(mut self, limit: usize) -> Self {
        self.config.item_limit = Some(limit);
        self
    }
320
    /// Sets the runtime-managed discovery mode.
    pub fn discovery_mode(mut self, mode: DiscoveryMode) -> Self {
        self.config.discovery.mode = mode;
        self
    }

    /// Replaces the full discovery configuration.
    ///
    /// Overwrites mode, rules, sitemap settings, and link-extraction options
    /// in one call; the more granular setters below edit individual fields.
    pub fn discovery(mut self, discovery: DiscoveryConfig) -> Self {
        self.config.discovery = discovery;
        self
    }

    /// Replaces runtime discovery rules.
    pub fn discovery_rules(mut self, rules: impl IntoIterator<Item = DiscoveryRule>) -> Self {
        self.config.discovery.rules = rules.into_iter().collect();
        self
    }

    /// Adds a runtime discovery rule (appended after any existing rules).
    pub fn add_discovery_rule(mut self, rule: DiscoveryRule) -> Self {
        self.config.discovery.rules.push(rule);
        self
    }

    /// Enables or disables sitemap parsing.
    pub fn enable_sitemaps(mut self, enabled: bool) -> Self {
        self.config.discovery.discover_sitemaps = enabled;
        self
    }

    /// Enables or disables page metadata extraction.
    pub fn extract_page_metadata(mut self, enabled: bool) -> Self {
        self.config.discovery.extract_page_metadata = enabled;
        self
    }

    /// Sets the maximum nested sitemap depth.
    pub fn max_sitemap_depth(mut self, depth: usize) -> Self {
        self.config.discovery.max_sitemap_depth = depth;
        self
    }

    /// Overrides the link extraction options used by runtime discovery.
    pub fn discovery_link_options(mut self, options: LinkExtractOptions) -> Self {
        self.config.discovery.link_extract_options = options;
        self
    }

    /// Sets whether runtime discovery should keep links on the same site only.
    pub fn discover_same_site_only(mut self, enabled: bool) -> Self {
        self.config.discovery.link_extract_options.same_site_only = enabled;
        self
    }

    /// Sets whether runtime discovery should scan text nodes for plain-text URLs.
    pub fn discover_text_links(mut self, enabled: bool) -> Self {
        self.config
            .discovery
            .link_extract_options
            .include_text_links = enabled;
        self
    }

    /// Adds glob-style allow patterns to runtime discovery.
    pub fn discover_allow_patterns(
        mut self,
        patterns: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allow_patterns(patterns);
        self
    }

    /// Adds glob-style deny patterns to runtime discovery.
    pub fn discover_deny_patterns(
        mut self,
        patterns: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_deny_patterns(patterns);
        self
    }

    /// Restricts runtime discovery to the provided domains or subdomains.
    pub fn discover_allow_domains(
        mut self,
        domains: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allow_domains(domains);
        self
    }

    /// Excludes runtime discovery for the provided domains or subdomains.
    pub fn discover_deny_domains(
        mut self,
        domains: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_deny_domains(domains);
        self
    }

    /// Restricts runtime discovery to the provided URL path prefixes.
    pub fn discover_allow_path_prefixes(
        mut self,
        prefixes: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allow_path_prefixes(prefixes);
        self
    }

    /// Excludes runtime discovery for the provided URL path prefixes.
    pub fn discover_deny_path_prefixes(
        mut self,
        prefixes: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_deny_path_prefixes(prefixes);
        self
    }

    /// Restricts runtime discovery to specific HTML tags.
    pub fn discover_allowed_tags(
        mut self,
        tags: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allowed_tags(tags);
        self
    }

    /// Restricts runtime discovery to specific HTML attributes.
    pub fn discover_allowed_attributes(
        mut self,
        attributes: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allowed_attributes(attributes);
        self
    }

    /// Restricts runtime discovery to specific link types.
    pub fn discover_allowed_link_types(
        mut self,
        link_types: impl IntoIterator<Item = LinkType>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allowed_link_types(link_types);
        self
    }

    /// Excludes specific link types from runtime discovery.
    pub fn discover_denied_link_types(
        mut self,
        link_types: impl IntoIterator<Item = LinkType>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_denied_link_types(link_types);
        self
    }
513
    /// Sets a custom downloader implementation.
    ///
    /// Use this method to provide a custom [`Downloader`] implementation
    /// instead of the default [`ReqwestClientDownloader`].
    ///
    /// Reach for this when transport behavior itself needs to change, such as
    /// request signing, alternate HTTP stacks, downloader-level tracing, or
    /// protocol-specific request execution.
    pub fn downloader(mut self, downloader: D) -> Self {
        self.downloader = downloader;
        self
    }

    /// Adds a middleware to the crawler's middleware stack.
    ///
    /// Middlewares intercept and modify requests before they are sent and
    /// responses after they are received. They are executed in the order
    /// they are added.
    ///
    /// Middleware is the right layer for cross-cutting HTTP behavior such as
    /// retry policy, rate limiting, cookies, user-agent management, cache
    /// lookup, or `robots.txt` enforcement.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .add_middleware(RateLimitMiddleware::default())
    ///     .add_middleware(RetryMiddleware::new())
    ///     .build()
    ///     .await?;
    /// ```
    pub fn add_middleware<M>(mut self, middleware: M) -> Self
    where
        M: Middleware<D::Client> + Send + Sync + 'static,
    {
        self.middlewares.push(Box::new(middleware));
        self
    }

    /// Adds a pipeline to the crawler's pipeline stack.
    ///
    /// Pipelines process scraped items after they are extracted by the spider.
    /// They can be used for validation, transformation, deduplication, or
    /// storage (e.g., writing to files or databases).
    ///
    /// Pipelines are ordered. A common pattern is transform first, validate
    /// second, deduplicate next, and write to outputs last.
    ///
    /// If no pipeline is added, `build` installs a default `ConsolePipeline`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .add_pipeline(ConsolePipeline::new())
    ///     .add_pipeline(JsonPipeline::new("output.json")?)
    ///     .build()
    ///     .await?;
    /// ```
    pub fn add_pipeline<P>(mut self, pipeline: P) -> Self
    where
        P: Pipeline<S::Item> + 'static,
    {
        self.pipelines.push(Box::new(pipeline));
        self
    }
579
    /// Sets the log level for `spider-*` library crates.
    ///
    /// This configures the logging level specifically for the spider-lib ecosystem
    /// (spider-core, spider-middleware, spider-pipeline, spider-util, spider-downloader).
    /// Logs from other dependencies (e.g., reqwest, tokio) will not be affected.
    ///
    /// Note: when live stats mode is enabled, `build` overrides this with
    /// `LevelFilter::Off` so logs do not interleave with the live renderer.
    ///
    /// ## Log Levels
    ///
    /// - `LevelFilter::Error` - Only error messages
    /// - `LevelFilter::Warn` - Warnings and errors
    /// - `LevelFilter::Info` - Informational messages, warnings, and errors
    /// - `LevelFilter::Debug` - Debug messages and above
    /// - `LevelFilter::Trace` - All messages including trace
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// use log::LevelFilter;
    ///
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .log_level(LevelFilter::Debug)
    ///     .build()
    ///     .await?;
    /// ```
    pub fn log_level(mut self, level: LevelFilter) -> Self {
        self.log_level = Some(level);
        self
    }

    /// Sets the path for saving and loading checkpoints.
    ///
    /// When enabled, the crawler periodically saves its state to this file,
    /// allowing crawls to be resumed after interruption.
    ///
    /// Requires the `checkpoint` feature to be enabled.
    ///
    /// If a checkpoint file already exists at build time, the builder will
    /// attempt to restore scheduler and pipeline state from it.
    pub fn with_checkpoint_path<P: AsRef<Path>>(mut self, path: P) -> Self {
        self.checkpoint_config.path = Some(path.as_ref().to_path_buf());
        self
    }

    /// Sets the interval between automatic checkpoint saves.
    ///
    /// When enabled, the crawler saves its state at this interval.
    /// Shorter intervals provide more frequent recovery points but may
    /// impact performance.
    ///
    /// Requires the `checkpoint` feature to be enabled.
    pub fn with_checkpoint_interval(mut self, interval: Duration) -> Self {
        self.checkpoint_config.interval = Some(interval);
        self
    }
634
    /// Builds the [`Crawler`] instance.
    ///
    /// This method finalizes the crawler configuration and initializes all
    /// components. It performs validation and sets up default values where
    /// necessary.
    ///
    /// Build time is where the runtime:
    /// - validates concurrency and channel settings
    /// - initializes logging and live-stats behavior
    /// - restores checkpoint state if configured
    /// - constructs the scheduler and runtime handles
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError::ConfigurationError`] if:
    /// - `max_concurrent_downloads` is 0
    /// - `max_pending_requests` is 0
    /// - `parser_workers` is 0
    /// - No spider was provided to the builder
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .max_concurrent_downloads(10)
    ///     .build()
    ///     .await?;
    /// ```
    pub async fn build(mut self) -> Result<Crawler<S, D::Client>, SpiderError>
    where
        D: Downloader + Send + Sync + 'static,
        D::Client: Send + Sync + Clone,
        S::Item: Send + Sync + 'static,
    {
        // Validates the zero-limit invariants and removes the spider from the
        // builder; fails fast before any component is constructed.
        let spider = self.take_spider()?;
        self.init_default_pipeline();

        // Live stats redraw on stdout and should not interleave with spider-* logs.
        // Force spider-* logs to Off when live stats mode is enabled.
        let effective_log_level = if self.config.live_stats {
            Some(LevelFilter::Off)
        } else {
            self.log_level
        };

        // Initialize logging for spider-* crates if an effective log level is configured.
        if let Some(level) = effective_log_level {
            self.init_logging(level);
        }

        // Validate config (additional checks beyond the ones in take_spider).
        self.config
            .validate()
            .map_err(SpiderError::ConfigurationError)?;

        // Restore checkpoint and get scheduler state.
        #[cfg(feature = "checkpoint")]
        let (scheduler_state, pipeline_states) = self.restore_checkpoint()?;
        // Without the feature, the scheduler starts from an empty frontier.
        #[cfg(not(feature = "checkpoint"))]
        let scheduler_state: Option<()> = None;

        // Restore pipeline states if checkpoint was loaded; states are matched
        // to pipelines by name, and unknown names only produce a warning.
        #[cfg(feature = "checkpoint")]
        {
            if let Some(states) = pipeline_states {
                for (name, state) in states {
                    if let Some(pipeline) = self.pipelines.iter().find(|p| p.name() == name) {
                        pipeline.restore_state(state).await?;
                    } else {
                        warn!("Checkpoint contains state for unknown pipeline: {}", name);
                    }
                }
            }
        }

        // Get cookie store if feature is enabled.
        #[cfg(feature = "cookie-store")]
        let cookie_store = {
            #[cfg(feature = "checkpoint")]
            {
                // NOTE(review): this re-reads the checkpoint file that
                // restore_checkpoint already loaded; consider threading the
                // cookie store through RestoreResult instead.
                let (_, cookie_store) = self.restore_cookie_store().await?;
                cookie_store
            }
            #[cfg(not(feature = "checkpoint"))]
            {
                Some(crate::CookieStore::default())
            }
        };

        // Create scheduler with or without checkpoint state.
        let (scheduler_arc, req_rx) =
            Scheduler::new(scheduler_state, self.config.max_pending_requests);
        let downloader_arc = Arc::new(self.downloader);
        let stats = Arc::new(StatCollector::new(
            self.config.live_stats_preview_fields.clone(),
        ));

        // Build crawler with or without cookie store based on feature flag.
        #[cfg(feature = "cookie-store")]
        let crawler = Crawler::new(
            scheduler_arc,
            req_rx,
            downloader_arc,
            self.middlewares,
            spider,
            self.pipelines,
            self.config,
            #[cfg(feature = "checkpoint")]
            self.checkpoint_config,
            stats,
            Arc::new(tokio::sync::RwLock::new(cookie_store.unwrap_or_default())),
        );

        #[cfg(not(feature = "cookie-store"))]
        let crawler = Crawler::new(
            scheduler_arc,
            req_rx,
            downloader_arc,
            self.middlewares,
            spider,
            self.pipelines,
            self.config,
            #[cfg(feature = "checkpoint")]
            self.checkpoint_config,
            stats,
        );

        Ok(crawler)
    }
763
764 /// Restores checkpoint state from disk (checkpoint feature only).
765 ///
766 /// This internal method loads a previously saved checkpoint file and
767 /// restores the scheduler state and pipeline states.
768 ///
769 /// # Returns
770 ///
771 /// Returns a tuple of `(SchedulerCheckpoint, Option<HashMap<String, Value>>)`.
772 /// If no checkpoint path is configured or the file doesn't exist, returns
773 /// default values.
774 ///
775 /// # Errors
776 ///
777 /// Returns a [`SpiderError`] if deserialization fails.
778 /// Note: Checkpoint file read/deserialization errors are logged as warnings
779 /// but do not fail the operation—the crawl proceeds without checkpoint data.
780 #[cfg(feature = "checkpoint")]
781 fn restore_checkpoint(&mut self) -> Result<RestoreResult, SpiderError> {
782 let mut scheduler_state = None;
783 let mut pipeline_states = None;
784
785 if let Some(path) = &self.checkpoint_config.path {
786 info!("Attempting to restore checkpoint from {:?}", path);
787 if let Some(checkpoint) = self.load_checkpoint_from_path(path) {
788 scheduler_state = Some(checkpoint.scheduler);
789 pipeline_states = Some(checkpoint.pipelines);
790 }
791 }
792
793 Ok((scheduler_state, pipeline_states))
794 }
795
796 /// Restores cookie store from checkpoint (checkpoint + cookie-store features).
797 #[cfg(all(feature = "checkpoint", feature = "cookie-store"))]
798 async fn restore_cookie_store(
799 &mut self,
800 ) -> Result<(Option<()>, Option<crate::CookieStore>), SpiderError> {
801 let mut cookie_store = None;
802
803 if let Some(path) = &self.checkpoint_config.path {
804 info!(
805 "Attempting to restore cookie store from checkpoint {:?}",
806 path
807 );
808 if let Some(checkpoint) = self.load_checkpoint_from_path(path) {
809 cookie_store = Some(checkpoint.cookie_store);
810 }
811 }
812
813 Ok((None, cookie_store))
814 }
815
816 /// Extracts the spider from the builder.
817 ///
818 /// # Errors
819 ///
820 /// Returns a [`SpiderError::ConfigurationError`] if:
821 /// - `max_concurrent_downloads` is 0
822 /// - `parser_workers` is 0
823 /// - No spider was provided
824 fn take_spider(&mut self) -> Result<S, SpiderError> {
825 if self.config.max_concurrent_downloads == 0 {
826 return Err(SpiderError::ConfigurationError(
827 "max_concurrent_downloads must be greater than 0.".to_string(),
828 ));
829 }
830 if self.config.max_pending_requests == 0 {
831 return Err(SpiderError::ConfigurationError(
832 "max_pending_requests must be greater than 0.".to_string(),
833 ));
834 }
835 if self.config.parser_workers == 0 {
836 return Err(SpiderError::ConfigurationError(
837 "parser_workers must be greater than 0.".to_string(),
838 ));
839 }
840 self.spider.take().ok_or_else(|| {
841 SpiderError::ConfigurationError("Crawler must have a spider.".to_string())
842 })
843 }
844
845 /// Initializes the pipeline stack with a default [`ConsolePipeline`] if empty.
846 ///
847 /// This ensures that scraped items are always output somewhere, even if
848 /// no explicit pipelines are configured by the user.
849 fn init_default_pipeline(&mut self) {
850 if self.pipelines.is_empty() {
851 use spider_pipeline::console::ConsolePipeline;
852 self.pipelines.push(Box::new(ConsolePipeline::new()));
853 }
854 }
855
856 /// Initializes logging for spider-* crates only.
857 ///
858 /// This sets up env_logger with a filter that only enables logging for
859 /// crates within the spider-lib ecosystem.
860 fn init_logging(&self, level: LevelFilter) {
861 use env_logger::{Builder, Env};
862
863 let mut builder = Builder::from_env(Env::default().default_filter_or("off"));
864
865 // Set filter specifically for spider-* crates
866 builder.filter_module("spider_core", level);
867 builder.filter_module("spider_middleware", level);
868 builder.filter_module("spider_pipeline", level);
869 builder.filter_module("spider_util", level);
870 builder.filter_module("spider_downloader", level);
871 builder.filter_module("spider_macro", level);
872
873 builder.init();
874 }
875}