// spider_core/src/lib.rs
1//! # spider-core
2//!
3//! The core engine of the `spider-lib` web scraping framework.
4//!
5//! This crate provides the fundamental components for building web scrapers,
6//! including the main `Crawler`, `Scheduler`, `Spider` trait, and other
7//! essential infrastructure for managing the crawling process.
8//!
9//! ## Overview
10//!
11//! The `spider-core` crate implements the central orchestration layer of the
12//! web scraping framework. It manages the flow of requests and responses,
13//! coordinates concurrent operations, and provides the foundation for
14//! middleware and pipeline systems.
15//!
16//! ## Key Components
17//!
18//! - **Crawler**: The main orchestrator that manages the crawling process
19//! - **Scheduler**: Handles request queuing and duplicate detection
20//! - **Spider**: Trait defining the interface for custom scraping logic
21//! - **CrawlerBuilder**: Fluent API for configuring and building crawlers
22//! - **Middleware**: Interceptors for processing requests and responses
23//! - **Pipeline**: Processors for scraped items
24//! - **Stats**: Collection and reporting of crawl statistics
25//!
26//! ## Usage
27//!
28//! Most users will interact with the components re-exported from this crate
29//! through the main `spider-lib` facade. However, this crate can be used
30//! independently for fine-grained control over the crawling process.
31//!
32//! ```rust,ignore
33//! use spider_core::{Crawler, CrawlerBuilder, Spider, Scheduler};
34//! use spider_util::{request::Request, response::Response, error::SpiderError};
35//!
36//! #[derive(Default)]
37//! struct MySpider;
38//!
39//! #[spider_macro::scraped_item]
40//! struct MyItem {
41//!     title: String,
42//!     url: String,
43//! }
44//!
45//! #[async_trait::async_trait]
46//! impl Spider for MySpider {
47//!     type Item = MyItem;
48//!
49//!     fn start_urls(&self) -> Vec<&'static str> {
50//!         vec!["https://example.com"]
51//!     }
52//!
53//!     async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
54//!         // Custom parsing logic here
55//!         todo!()
56//!     }
57//! }
58//!
59//! async fn run_crawler() -> Result<(), SpiderError> {
60//!     let crawler = CrawlerBuilder::new(MySpider).build().await?;
61//!     crawler.start_crawl().await
62//! }
63//! ```
64
65pub mod builder;
66#[cfg(feature = "checkpoint")]
67pub mod checkpoint;
68pub mod crawler;
69pub mod prelude;
70pub mod scheduler;
71pub mod spider;
72pub mod state;
73pub mod stats;
74
75// Re-export SchedulerCheckpoint and Checkpoint (when checkpoint feature is enabled)
76#[cfg(feature = "checkpoint")]
77pub use checkpoint::{Checkpoint, SchedulerCheckpoint};
78
79pub use spider_downloader::{Downloader, ReqwestClientDownloader, SimpleHttpClient};
80
81// Re-export CookieStore (when cookie-store feature is enabled)
82#[cfg(feature = "cookie-store")]
83pub use cookie_store::CookieStore;
84
85pub use builder::CrawlerBuilder;
86pub use crawler::Crawler;
87pub use scheduler::Scheduler;
88pub use spider_macro::scraped_item;
89
90pub use async_trait::async_trait;
91pub use dashmap::DashMap;
92pub use spider::Spider;
93pub use tokio;