// spider_core/builder.rs
1//! Builder API for assembling a [`Crawler`].
2//!
3//! [`CrawlerBuilder`] is where runtime composition happens: concurrency,
4//! downloader selection, middleware, pipelines, item limits, logging, and
5//! optional checkpointing all start here.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_core::CrawlerBuilder;
11//! use spider_middleware::rate_limit::RateLimitMiddleware;
12//! use spider_pipeline::console::ConsolePipeline;
13//! use spider_util::error::SpiderError;
14//!
15//! async fn setup_crawler() -> Result<(), SpiderError> {
16//! let crawler = CrawlerBuilder::new(MySpider)
17//! .max_concurrent_downloads(10)
18//! .max_parser_workers(4)
19//! .add_middleware(RateLimitMiddleware::default())
20//! .add_pipeline(ConsolePipeline::new())
21//! .with_checkpoint_path("./crawl.checkpoint")
22//! .build()
23//! .await?;
24//!
25//! crawler.start_crawl().await
26//! }
27//! ```
28
29use crate::Downloader;
30use crate::ReqwestClientDownloader;
31use crate::config::{
32 CheckpointConfig, CrawlShapePreset, CrawlerConfig, DiscoveryConfig, DiscoveryMode,
33 DiscoveryRule,
34};
35use crate::scheduler::Scheduler;
36use crate::spider::Spider;
37use spider_middleware::middleware::Middleware;
38use spider_pipeline::pipeline::Pipeline;
39use spider_util::response::{LinkExtractOptions, LinkType};
40
#[cfg(feature = "checkpoint")]
/// Result of [`CrawlerBuilder::restore_checkpoint`]: the restored scheduler
/// frontier (if a checkpoint was found) and the serialized per-pipeline
/// states keyed by pipeline name (if any were saved).
type RestoreResult = (
    Option<crate::SchedulerCheckpoint>,
    Option<std::collections::HashMap<String, serde_json::Value>>,
);
46use spider_util::error::SpiderError;
47use std::marker::PhantomData;
48use std::path::Path;
49use std::sync::Arc;
50use std::time::Duration;
51
52use super::Crawler;
53use crate::stats::StatCollector;
54use log::LevelFilter;
55#[cfg(feature = "checkpoint")]
56use log::{info, warn};
57
58#[cfg(feature = "checkpoint")]
59use rmp_serde;
60#[cfg(feature = "checkpoint")]
61use std::fs;
62
/// A fluent builder for constructing [`Crawler`] instances.
///
/// ## Type Parameters
///
/// - `S`: The [`Spider`] implementation type
/// - `D`: The [`Downloader`] implementation type
///
/// ## Example
///
/// ```rust,ignore
/// # use spider_core::{CrawlerBuilder, Spider};
/// # use spider_util::{response::Response, error::SpiderError, item::ParseOutput};
/// # struct MySpider;
/// # #[async_trait::async_trait]
/// # impl Spider for MySpider {
/// #     type Item = String;
/// #     type State = ();
/// #     fn start_requests(&self) -> Result<spider_core::spider::StartRequests<'_>, SpiderError> {
/// #         Ok(spider_core::spider::StartRequests::iter(std::iter::empty()))
/// #     }
/// #     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> { todo!() }
/// # }
/// let builder = CrawlerBuilder::new(MySpider)
///     .max_concurrent_downloads(8)
///     .max_pending_requests(16)
///     .max_parser_workers(4);
/// ```
pub struct CrawlerBuilder<S: Spider, D>
where
    D: Downloader,
{
    /// Runtime configuration: concurrency, channels, discovery, live stats, …
    config: CrawlerConfig,
    /// Checkpoint path/interval; only consulted when the `checkpoint` feature is on.
    checkpoint_config: CheckpointConfig,
    /// Transport used to execute HTTP requests.
    downloader: D,
    /// Spider supplied via `new`; taken out (after validation) by `build`.
    spider: Option<S>,
    /// Request/response middlewares, run in insertion order.
    middlewares: Vec<Box<dyn Middleware<D::Client> + Send + Sync>>,
    /// Item pipelines, run in insertion order.
    pipelines: Vec<Box<dyn Pipeline<S::Item>>>,
    /// Log level for spider-* crates; `None` leaves logging untouched.
    log_level: Option<LevelFilter>,
    // NOTE(review): `S` already appears in `spider` and `pipelines`, so this
    // marker looks redundant — confirm before removing (kept for compatibility).
    _phantom: PhantomData<S>,
}
103
104impl<S: Spider> Default for CrawlerBuilder<S, ReqwestClientDownloader> {
105 fn default() -> Self {
106 Self {
107 config: CrawlerConfig::default(),
108 checkpoint_config: CheckpointConfig::default(),
109 downloader: ReqwestClientDownloader::default(),
110 spider: None,
111 middlewares: Vec::new(),
112 pipelines: Vec::new(),
113 log_level: None,
114 _phantom: PhantomData,
115 }
116 }
117}
118
119impl<S: Spider> CrawlerBuilder<S, ReqwestClientDownloader> {
120 /// Creates a new `CrawlerBuilder` for a given spider with the default [`ReqwestClientDownloader`].
121 ///
122 /// ## Example
123 ///
124 /// ```rust,ignore
125 /// let crawler = CrawlerBuilder::new(MySpider)
126 /// .build()
127 /// .await?;
128 /// ```
129 pub fn new(spider: S) -> Self {
130 Self {
131 spider: Some(spider),
132 ..Default::default()
133 }
134 }
135
136 /// Enables or disables balanced browser-like default headers for the built-in reqwest downloader.
137 pub fn browser_like_headers(mut self, enabled: bool) -> Self {
138 self.config.browser_like_headers = enabled;
139 self.downloader = self.downloader.with_browser_like_headers(enabled);
140 self
141 }
142}
143
144impl<S: Spider, D: Downloader> CrawlerBuilder<S, D> {
145 #[cfg(feature = "checkpoint")]
146 fn load_checkpoint_from_path(&self, path: &std::path::Path) -> Option<crate::Checkpoint> {
147 match fs::read(path) {
148 Ok(bytes) => match rmp_serde::from_slice::<crate::Checkpoint>(&bytes) {
149 Ok(checkpoint) => Some(checkpoint),
150 Err(e) => {
151 warn!("Failed to deserialize checkpoint from {:?}: {}", path, e);
152 None
153 }
154 },
155 Err(e) => {
156 warn!("Failed to read checkpoint file {:?}: {}", path, e);
157 None
158 }
159 }
160 }
161
    /// Sets the maximum number of concurrent downloads.
    ///
    /// This controls how many HTTP requests can be in-flight simultaneously.
    /// Higher values increase throughput but may overwhelm target servers.
    ///
    /// ## Default
    ///
    /// Defaults to twice the number of CPU cores, clamped between 4 and 64.
    pub fn max_concurrent_downloads(mut self, limit: usize) -> Self {
        self.config.max_concurrent_downloads = limit;
        self
    }

    /// Applies guided concurrency defaults for a common crawl shape.
    ///
    /// NOTE(review): this rebuilds `config` via `with_crawl_shape_preset` —
    /// confirm whether it overrides fields set earlier; if so, call it before
    /// the fine-grained setters you want to keep.
    pub fn crawl_shape_preset(mut self, preset: CrawlShapePreset) -> Self {
        self.config = self.config.with_crawl_shape_preset(preset);
        self
    }

    /// Sets the maximum number of outstanding requests tracked by the scheduler.
    ///
    /// This includes queued requests plus requests already handed off for download.
    /// Lower values keep the frontier tighter and reduce internal request buildup.
    pub fn max_pending_requests(mut self, limit: usize) -> Self {
        self.config.max_pending_requests = limit;
        self
    }

    /// Sets the number of worker tasks dedicated to parsing responses.
    ///
    /// Parser workers process HTTP responses concurrently, calling the
    /// spider's [`parse`](Spider::parse) method to extract items and
    /// discover new URLs.
    ///
    /// ## Default
    ///
    /// Defaults to the number of CPU cores, clamped between 4 and 16.
    pub fn max_parser_workers(mut self, limit: usize) -> Self {
        // Config field is named `parser_workers`; the setter keeps the
        // public `max_*` naming used by the other concurrency knobs.
        self.config.parser_workers = limit;
        self
    }

    /// Sets the maximum number of concurrent item processing pipelines.
    ///
    /// This controls how many items can be processed by pipelines simultaneously.
    ///
    /// ## Default
    ///
    /// Defaults to the number of CPU cores, with a maximum of 8.
    pub fn max_concurrent_pipelines(mut self, limit: usize) -> Self {
        self.config.max_concurrent_pipelines = limit;
        self
    }

    /// Sets the capacity of internal communication channels.
    ///
    /// This controls the buffer size for channels between the downloader,
    /// parser, and pipeline components. Higher values can improve throughput
    /// at the cost of increased memory usage.
    ///
    /// ## Default
    ///
    /// Defaults to 1000.
    pub fn channel_capacity(mut self, capacity: usize) -> Self {
        self.config.channel_capacity = capacity;
        self
    }

    /// Sets the parser output batch size.
    ///
    /// Larger batches can reduce coordination overhead when pages emit many
    /// items or follow-up requests, while smaller batches tend to improve
    /// latency and memory locality.
    pub fn output_batch_size(mut self, batch_size: usize) -> Self {
        self.config.output_batch_size = batch_size;
        self
    }

    /// Sets the downloader response-channel backpressure threshold.
    ///
    /// When the downloader-to-parser channel reaches this threshold, the
    /// runtime starts applying backpressure so downloaded responses do not pile
    /// up unboundedly in memory.
    pub fn response_backpressure_threshold(mut self, threshold: usize) -> Self {
        self.config.response_backpressure_threshold = threshold;
        self
    }

    /// Sets the parser item-channel backpressure threshold.
    ///
    /// This primarily matters when parsing is faster than downstream pipeline
    /// processing. Lower thresholds keep memory tighter; higher thresholds let
    /// parsers run further ahead.
    pub fn item_backpressure_threshold(mut self, threshold: usize) -> Self {
        self.config.item_backpressure_threshold = threshold;
        self
    }

    /// Controls whether retries release downloader permits before waiting.
    ///
    /// Enabling this is usually better for throughput because a sleeping retry
    /// does not occupy scarce downloader concurrency. Disabling it can be
    /// useful when you want retries to count fully against download capacity.
    pub fn retry_release_permit(mut self, enabled: bool) -> Self {
        self.config.retry_release_permit = enabled;
        self
    }

    /// Enables or disables live, in-place statistics updates on terminal stdout.
    ///
    /// When enabled, spider-* logs are forced to `LevelFilter::Off` during build
    /// to avoid interleaving with the live terminal renderer.
    pub fn live_stats(mut self, enabled: bool) -> Self {
        self.config.live_stats = enabled;
        self
    }

    /// Sets the refresh interval for live statistics updates.
    ///
    /// Shorter intervals make the terminal view feel more responsive, while
    /// longer intervals reduce redraw overhead.
    pub fn live_stats_interval(mut self, interval: Duration) -> Self {
        self.config.live_stats_interval = interval;
        self
    }

    /// Sets which scraped item fields should be shown in live stats preview.
    ///
    /// Field names support dot notation for nested JSON objects such as
    /// `title`, `source_url`, or `metadata.Japanese`.
    ///
    /// You can also set aliases with `label=path`, for example
    /// `url=source_url` or `jp=metadata.Japanese`.
    pub fn live_stats_preview_fields(
        mut self,
        fields: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.live_stats_preview_fields = Some(fields.into_iter().map(Into::into).collect());
        self
    }

    /// Sets the maximum grace period for crawler shutdown before forcing task abort.
    ///
    /// This gives pipelines, checkpoint writes, and other in-flight work time
    /// to finish cleanly after shutdown begins.
    pub fn shutdown_grace_period(mut self, grace_period: Duration) -> Self {
        self.config.shutdown_grace_period = grace_period;
        self
    }

    /// Stops the crawl after `limit` scraped items have been admitted for processing.
    ///
    /// This is especially useful for smoke runs, local previews, and
    /// documentation examples where you want predictable bounded work.
    pub fn limit(mut self, limit: usize) -> Self {
        self.config.item_limit = Some(limit);
        self
    }
320
    /// Sets the runtime-managed discovery mode.
    pub fn discovery_mode(mut self, mode: DiscoveryMode) -> Self {
        self.config.discovery.mode = mode;
        self
    }

    /// Replaces the full discovery configuration.
    ///
    /// Overwrites mode, rules, sitemap settings, and link-extraction options
    /// in one call; the more granular setters below edit individual fields.
    pub fn discovery(mut self, discovery: DiscoveryConfig) -> Self {
        self.config.discovery = discovery;
        self
    }

    /// Replaces runtime discovery rules.
    pub fn discovery_rules(mut self, rules: impl IntoIterator<Item = DiscoveryRule>) -> Self {
        self.config.discovery.rules = rules.into_iter().collect();
        self
    }

    /// Adds a runtime discovery rule (appended after any existing rules).
    pub fn add_discovery_rule(mut self, rule: DiscoveryRule) -> Self {
        self.config.discovery.rules.push(rule);
        self
    }

    /// Enables or disables sitemap parsing.
    pub fn enable_sitemaps(mut self, enabled: bool) -> Self {
        self.config.discovery.discover_sitemaps = enabled;
        self
    }

    /// Enables or disables page metadata extraction.
    pub fn extract_page_metadata(mut self, enabled: bool) -> Self {
        self.config.discovery.extract_page_metadata = enabled;
        self
    }

    /// Sets the maximum nested sitemap depth.
    pub fn max_sitemap_depth(mut self, depth: usize) -> Self {
        self.config.discovery.max_sitemap_depth = depth;
        self
    }

    /// Overrides the link extraction options used by runtime discovery.
    pub fn discovery_link_options(mut self, options: LinkExtractOptions) -> Self {
        self.config.discovery.link_extract_options = options;
        self
    }

    /// Sets whether runtime discovery should keep links on the same site only.
    pub fn discover_same_site_only(mut self, enabled: bool) -> Self {
        self.config.discovery.link_extract_options.same_site_only = enabled;
        self
    }

    /// Sets whether runtime discovery should scan text nodes for plain-text URLs.
    pub fn discover_text_links(mut self, enabled: bool) -> Self {
        self.config
            .discovery
            .link_extract_options
            .include_text_links = enabled;
        self
    }

    /// Adds glob-style allow patterns to runtime discovery.
    pub fn discover_allow_patterns(
        mut self,
        patterns: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allow_patterns(patterns);
        self
    }

    /// Adds glob-style deny patterns to runtime discovery.
    pub fn discover_deny_patterns(
        mut self,
        patterns: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_deny_patterns(patterns);
        self
    }

    /// Restricts runtime discovery to the provided domains or subdomains.
    pub fn discover_allow_domains(
        mut self,
        domains: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allow_domains(domains);
        self
    }

    /// Excludes runtime discovery for the provided domains or subdomains.
    pub fn discover_deny_domains(
        mut self,
        domains: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_deny_domains(domains);
        self
    }

    /// Restricts runtime discovery to the provided URL path prefixes.
    pub fn discover_allow_path_prefixes(
        mut self,
        prefixes: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allow_path_prefixes(prefixes);
        self
    }

    /// Excludes runtime discovery for the provided URL path prefixes.
    pub fn discover_deny_path_prefixes(
        mut self,
        prefixes: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_deny_path_prefixes(prefixes);
        self
    }

    /// Restricts runtime discovery to specific HTML tags.
    pub fn discover_allowed_tags(
        mut self,
        tags: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allowed_tags(tags);
        self
    }

    /// Restricts runtime discovery to specific HTML attributes.
    pub fn discover_allowed_attributes(
        mut self,
        attributes: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allowed_attributes(attributes);
        self
    }

    /// Restricts runtime discovery to specific link types.
    pub fn discover_allowed_link_types(
        mut self,
        link_types: impl IntoIterator<Item = LinkType>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_allowed_link_types(link_types);
        self
    }

    /// Excludes specific link types from runtime discovery.
    pub fn discover_denied_link_types(
        mut self,
        link_types: impl IntoIterator<Item = LinkType>,
    ) -> Self {
        self.config.discovery.link_extract_options = self
            .config
            .discovery
            .link_extract_options
            .with_denied_link_types(link_types);
        self
    }
513
    /// Sets a custom downloader implementation.
    ///
    /// Use this method to provide a custom [`Downloader`] implementation
    /// instead of the default [`ReqwestClientDownloader`].
    ///
    /// Reach for this when transport behavior itself needs to change, such as
    /// request signing, alternate HTTP stacks, downloader-level tracing, or
    /// protocol-specific request execution.
    pub fn downloader(mut self, downloader: D) -> Self {
        self.downloader = downloader;
        self
    }

    /// Adds a middleware to the crawler's middleware stack.
    ///
    /// Middlewares intercept and modify requests before they are sent and
    /// responses after they are received. They are executed in the order
    /// they are added.
    ///
    /// Middleware is the right layer for cross-cutting HTTP behavior such as
    /// retry policy, rate limiting, cookies, user-agent management, cache
    /// lookup, or `robots.txt` enforcement.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .add_middleware(RateLimitMiddleware::default())
    ///     .add_middleware(RetryMiddleware::new())
    ///     .build()
    ///     .await?;
    /// ```
    pub fn add_middleware<M>(mut self, middleware: M) -> Self
    where
        M: Middleware<D::Client> + Send + Sync + 'static,
    {
        self.middlewares.push(Box::new(middleware));
        self
    }

    /// Adds a pipeline to the crawler's pipeline stack.
    ///
    /// Pipelines process scraped items after they are extracted by the spider.
    /// They can be used for validation, transformation, deduplication, or
    /// storage (e.g., writing to files or databases).
    ///
    /// Pipelines are ordered. A common pattern is transform first, validate
    /// second, deduplicate next, and write to outputs last.
    ///
    /// If no pipeline is added, `build` installs a default `ConsolePipeline`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .add_pipeline(ConsolePipeline::new())
    ///     .add_pipeline(JsonPipeline::new("output.json")?)
    ///     .build()
    ///     .await?;
    /// ```
    pub fn add_pipeline<P>(mut self, pipeline: P) -> Self
    where
        P: Pipeline<S::Item> + 'static,
    {
        self.pipelines.push(Box::new(pipeline));
        self
    }
579
    /// Sets the log level for `spider-*` library crates.
    ///
    /// This configures the logging level specifically for the spider-lib ecosystem
    /// (spider-core, spider-middleware, spider-pipeline, spider-util, spider-downloader).
    /// Logs from other dependencies (e.g., reqwest, tokio) will not be affected.
    ///
    /// Note: when live stats mode is enabled, `build` overrides this with
    /// `LevelFilter::Off` so logs do not interleave with the live renderer.
    ///
    /// ## Log Levels
    ///
    /// - `LevelFilter::Error` - Only error messages
    /// - `LevelFilter::Warn` - Warnings and errors
    /// - `LevelFilter::Info` - Informational messages, warnings, and errors
    /// - `LevelFilter::Debug` - Debug messages and above
    /// - `LevelFilter::Trace` - All messages including trace
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// use log::LevelFilter;
    ///
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .log_level(LevelFilter::Debug)
    ///     .build()
    ///     .await?;
    /// ```
    pub fn log_level(mut self, level: LevelFilter) -> Self {
        self.log_level = Some(level);
        self
    }

    /// Sets the path for saving and loading checkpoints.
    ///
    /// When enabled, the crawler periodically saves its state to this file,
    /// allowing crawls to be resumed after interruption.
    ///
    /// Requires the `checkpoint` feature to be enabled.
    ///
    /// If a checkpoint file already exists at build time, the builder will
    /// attempt to restore scheduler and pipeline state from it.
    pub fn with_checkpoint_path<P: AsRef<Path>>(mut self, path: P) -> Self {
        self.checkpoint_config.path = Some(path.as_ref().to_path_buf());
        self
    }

    /// Sets the interval between automatic checkpoint saves.
    ///
    /// When enabled, the crawler saves its state at this interval.
    /// Shorter intervals provide more frequent recovery points but may
    /// impact performance.
    ///
    /// Requires the `checkpoint` feature to be enabled.
    pub fn with_checkpoint_interval(mut self, interval: Duration) -> Self {
        self.checkpoint_config.interval = Some(interval);
        self
    }
634
    /// Builds the [`Crawler`] instance.
    ///
    /// This method finalizes the crawler configuration and initializes all
    /// components. It performs validation and sets up default values where
    /// necessary.
    ///
    /// Build time is where the runtime:
    /// - validates concurrency and channel settings
    /// - initializes logging and live-stats behavior
    /// - restores checkpoint state if configured
    /// - constructs the scheduler and runtime handles
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError::ConfigurationError`] if:
    /// - `max_concurrent_downloads` is 0
    /// - `max_pending_requests` is 0
    /// - `parser_workers` is 0
    /// - No spider was provided to the builder
    ///
    /// # Example
    ///
    /// ```rust,ignore
    /// let crawler = CrawlerBuilder::new(MySpider)
    ///     .max_concurrent_downloads(10)
    ///     .build()
    ///     .await?;
    /// ```
    pub async fn build(mut self) -> Result<Crawler<S, D::Client>, SpiderError>
    where
        D: Downloader + Send + Sync + 'static,
        D::Client: Send + Sync + Clone,
        S::Item: Send + Sync + 'static,
    {
        // Validates the zero-limit invariants and removes the spider from the
        // builder; fails fast before any component is constructed.
        let spider = self.take_spider()?;
        self.init_default_pipeline();

        // Live stats redraw on stdout and should not interleave with spider-* logs.
        // Force spider-* logs to Off when live stats mode is enabled.
        let effective_log_level = if self.config.live_stats {
            Some(LevelFilter::Off)
        } else {
            self.log_level
        };

        // Initialize logging for spider-* crates if an effective log level is configured.
        if let Some(level) = effective_log_level {
            self.init_logging(level);
        }

        // Validate config (additional checks beyond the ones in take_spider).
        self.config
            .validate()
            .map_err(SpiderError::ConfigurationError)?;

        // Restore checkpoint and get scheduler state.
        #[cfg(feature = "checkpoint")]
        let (scheduler_state, pipeline_states) = self.restore_checkpoint()?;
        // Without the feature, the scheduler starts from an empty frontier.
        #[cfg(not(feature = "checkpoint"))]
        let scheduler_state: Option<()> = None;

        // Restore pipeline states if checkpoint was loaded; states are matched
        // to pipelines by name, and unknown names only produce a warning.
        #[cfg(feature = "checkpoint")]
        {
            if let Some(states) = pipeline_states {
                for (name, state) in states {
                    if let Some(pipeline) = self.pipelines.iter().find(|p| p.name() == name) {
                        pipeline.restore_state(state).await?;
                    } else {
                        warn!("Checkpoint contains state for unknown pipeline: {}", name);
                    }
                }
            }
        }

        // Get cookie store if feature is enabled.
        #[cfg(feature = "cookie-store")]
        let cookie_store = {
            #[cfg(feature = "checkpoint")]
            {
                // NOTE(review): this re-reads the checkpoint file that
                // restore_checkpoint already loaded; consider threading the
                // cookie store through RestoreResult instead.
                let (_, cookie_store) = self.restore_cookie_store().await?;
                cookie_store
            }
            #[cfg(not(feature = "checkpoint"))]
            {
                Some(crate::CookieStore::default())
            }
        };

        // Create scheduler with or without checkpoint state.
        let (scheduler_arc, req_rx) =
            Scheduler::new(scheduler_state, self.config.max_pending_requests);
        let downloader_arc = Arc::new(self.downloader);
        let stats = Arc::new(StatCollector::new(
            self.config.live_stats_preview_fields.clone(),
        ));

        // Build crawler with or without cookie store based on feature flag.
        #[cfg(feature = "cookie-store")]
        let crawler = Crawler::new(
            scheduler_arc,
            req_rx,
            downloader_arc,
            self.middlewares,
            spider,
            self.pipelines,
            self.config,
            #[cfg(feature = "checkpoint")]
            self.checkpoint_config,
            stats,
            Arc::new(tokio::sync::RwLock::new(cookie_store.unwrap_or_default())),
        );

        #[cfg(not(feature = "cookie-store"))]
        let crawler = Crawler::new(
            scheduler_arc,
            req_rx,
            downloader_arc,
            self.middlewares,
            spider,
            self.pipelines,
            self.config,
            #[cfg(feature = "checkpoint")]
            self.checkpoint_config,
            stats,
        );

        Ok(crawler)
    }
763
764 /// Restores checkpoint state from disk (checkpoint feature only).
765 ///
766 /// This internal method loads a previously saved checkpoint file and
767 /// restores the scheduler state and pipeline states.
768 ///
769 /// # Returns
770 ///
771 /// Returns a tuple of `(SchedulerCheckpoint, Option<HashMap<String, Value>>)`.
772 /// If no checkpoint path is configured or the file doesn't exist, returns
773 /// default values.
774 ///
775 /// # Errors
776 ///
777 /// Returns a [`SpiderError`] if deserialization fails.
778 /// Note: Checkpoint file read/deserialization errors are logged as warnings
779 /// but do not fail the operation—the crawl proceeds without checkpoint data.
780 #[cfg(feature = "checkpoint")]
781 fn restore_checkpoint(&mut self) -> Result<RestoreResult, SpiderError> {
782 let mut scheduler_state = None;
783 let mut pipeline_states = None;
784
785 if let Some(path) = &self.checkpoint_config.path {
786 info!("Attempting to restore checkpoint from {:?}", path);
787 if let Some(checkpoint) = self.load_checkpoint_from_path(path) {
788 scheduler_state = Some(checkpoint.scheduler);
789 pipeline_states = Some(checkpoint.pipelines);
790 }
791 }
792
793 Ok((scheduler_state, pipeline_states))
794 }
795
796 /// Restores cookie store from checkpoint (checkpoint + cookie-store features).
797 #[cfg(all(feature = "checkpoint", feature = "cookie-store"))]
798 async fn restore_cookie_store(
799 &mut self,
800 ) -> Result<(Option<()>, Option<crate::CookieStore>), SpiderError> {
801 let mut cookie_store = None;
802
803 if let Some(path) = &self.checkpoint_config.path {
804 info!(
805 "Attempting to restore cookie store from checkpoint {:?}",
806 path
807 );
808 if let Some(checkpoint) = self.load_checkpoint_from_path(path) {
809 cookie_store = Some(checkpoint.cookie_store);
810 }
811 }
812
813 Ok((None, cookie_store))
814 }
815
816 /// Extracts the spider from the builder.
817 ///
818 /// # Errors
819 ///
820 /// Returns a [`SpiderError::ConfigurationError`] if:
821 /// - `max_concurrent_downloads` is 0
822 /// - `parser_workers` is 0
823 /// - No spider was provided
824 fn take_spider(&mut self) -> Result<S, SpiderError> {
825 if self.config.max_concurrent_downloads == 0 {
826 return Err(SpiderError::ConfigurationError(
827 "max_concurrent_downloads must be greater than 0.".to_string(),
828 ));
829 }
830 if self.config.max_pending_requests == 0 {
831 return Err(SpiderError::ConfigurationError(
832 "max_pending_requests must be greater than 0.".to_string(),
833 ));
834 }
835 if self.config.parser_workers == 0 {
836 return Err(SpiderError::ConfigurationError(
837 "parser_workers must be greater than 0.".to_string(),
838 ));
839 }
840 self.spider.take().ok_or_else(|| {
841 SpiderError::ConfigurationError("Crawler must have a spider.".to_string())
842 })
843 }
844
845 /// Initializes the pipeline stack with a default [`ConsolePipeline`] if empty.
846 ///
847 /// This ensures that scraped items are always output somewhere, even if
848 /// no explicit pipelines are configured by the user.
849 fn init_default_pipeline(&mut self) {
850 if self.pipelines.is_empty() {
851 use spider_pipeline::console::ConsolePipeline;
852 self.pipelines.push(Box::new(ConsolePipeline::new()));
853 }
854 }
855
856 /// Initializes logging for spider-* crates only.
857 ///
858 /// This sets up env_logger with a filter that only enables logging for
859 /// crates within the spider-lib ecosystem.
860 fn init_logging(&self, level: LevelFilter) {
861 use env_logger::{Builder, Env};
862
863 let mut builder = Builder::from_env(Env::default().default_filter_or("off"));
864
865 // Set filter specifically for spider-* crates
866 builder.filter_module("spider_core", level);
867 builder.filter_module("spider_middleware", level);
868 builder.filter_module("spider_pipeline", level);
869 builder.filter_module("spider_util", level);
870 builder.filter_module("spider_downloader", level);
871 builder.filter_module("spider_macro", level);
872
873 builder.init();
874 }
875}