// spider_core/src/lib.rs
//! # spider-core
//!
//! The core engine of the `spider-lib` web scraping framework.
//!
//! This crate provides the fundamental components for building web scrapers,
//! including the main `Crawler`, `Scheduler`, `Spider` trait, and other
//! essential infrastructure for managing the crawling process.
//!
//! ## Overview
//!
//! The `spider-core` crate implements the central orchestration layer of the
//! web scraping framework. It manages the flow of requests and responses,
//! coordinates concurrent operations, and provides the foundation for
//! middleware and pipeline systems.
//!
//! ## Key Components
//!
//! - **Crawler**: The main orchestrator that manages the crawling process
//! - **Scheduler**: Handles request queuing and duplicate detection
//! - **Spider**: Trait defining the interface for custom scraping logic
//! - **CrawlerBuilder**: Fluent API for configuring and building crawlers
//! - **Middleware**: Interceptors for processing requests and responses
//! - **Pipeline**: Processors for scraped items
//! - **Stats**: Collection and reporting of crawl statistics
//!
//! ## Usage
//!
//! Most users will interact with the components re-exported from this crate
//! through the main `spider-lib` facade. However, this crate can be used
//! independently for fine-grained control over the crawling process.
//!
//! ```rust,ignore
//! use spider_core::{Crawler, CrawlerBuilder, Spider, Scheduler};
//! use spider_util::{request::Request, response::Response, error::SpiderError};
//!
//! #[derive(Default)]
//! struct MySpider;
//!
//! #[spider_macro::scraped_item]
//! struct MyItem {
//!     title: String,
//!     url: String,
//! }
//!
//! #[async_trait::async_trait]
//! impl Spider for MySpider {
//!     type Item = MyItem;
//!
//!     fn start_urls(&self) -> Vec<&'static str> {
//!         vec!["https://example.com"]
//!     }
//!
//!     // NOTE(review): `ParseOutput` is used here but not brought into scope by
//!     // the `use` lines above — confirm its public path (e.g. the prelude) and
//!     // add it to the example imports.
//!     async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
//!         // Custom parsing logic here
//!         todo!()
//!     }
//! }
//!
//! async fn run_crawler() -> Result<(), SpiderError> {
//!     let crawler = CrawlerBuilder::new(MySpider).build().await?;
//!     crawler.start_crawl().await
//! }
//! ```
65pub mod builder;
66pub mod checkpoint;
67pub mod crawler;
68pub mod prelude;
69pub mod scheduler;
70pub mod spider;
71pub mod state;
72pub mod stats;
73
74// Re-export SchedulerCheckpoint and Checkpoint
75pub use checkpoint::{Checkpoint, SchedulerCheckpoint};
76pub use spider_downloader::{Downloader, ReqwestClientDownloader, SimpleHttpClient};
77
78// Re-export CookieStore
79pub use cookie_store::CookieStore;
80
81pub use builder::CrawlerBuilder;
82pub use crawler::Crawler;
83pub use scheduler::Scheduler;
84pub use spider_macro::scraped_item;
85
86pub use async_trait::async_trait;
87pub use dashmap::DashMap;
88pub use spider::Spider;
89pub use tokio;