Skip to main content

spider_core/
lib.rs

1//! # spider-core
2//!
3//! `spider-core` is the runtime crate behind the rest of the workspace.
4//! It owns the crawler loop, scheduling, shared runtime state, statistics, and
5//! the [`Spider`] trait used to describe crawl behavior.
6//!
7//! If you are building an application, `spider-lib` is usually the easier
8//! starting point. Depend on `spider-core` directly when you want the runtime
9//! API without the facade crate.
10//!
11//! ## Example
12//!
13//! ```rust,ignore
14//! use spider_core::{async_trait, CrawlerBuilder, Spider};
15//! use spider_util::{response::Response, error::SpiderError, item::ParseOutput};
16//!
17//! #[spider_macro::scraped_item]
18//! struct Item {
19//!     title: String,
20//! }
21//!
22//! struct MySpider;
23//!
24//! #[async_trait]
25//! impl Spider for MySpider {
26//!     type Item = Item;
27//!     type State = ();
28//!
29//!     fn start_requests(&self) -> Result<spider_core::StartRequests<'_>, SpiderError> {
30//!         Ok(spider_core::StartRequests::Urls(vec!["https://example.com"]))
31//!     }
32//!
33//!     async fn parse(
34//!         &self,
35//!         _response: Response,
36//!         _state: &Self::State,
37//!     ) -> Result<ParseOutput<Self::Item>, SpiderError> {
38//!         Ok(ParseOutput::new())
39//!     }
40//! }
41//!
42//! async fn run() -> Result<(), SpiderError> {
43//!     let crawler = CrawlerBuilder::new(MySpider).build().await?;
44//!     crawler.start_crawl().await
45//! }
46//! ```
47
pub mod builder; // `CrawlerBuilder`: configures and constructs a `Crawler` (see crate docs example).
#[cfg(feature = "checkpoint")]
pub mod checkpoint; // Checkpoint/resume support; compiled only with the `checkpoint` feature.
pub mod config; // Crawl configuration types (`CrawlerConfig`, `DiscoveryConfig`, ...).
pub mod discovery; // Link-discovery machinery; see `DiscoveryMode`/`DiscoveryRule` in `config`.
pub mod engine; // The `Crawler` runtime that owns the crawl loop.
pub mod prelude; // Convenience glob-import module (conventional prelude).
pub mod scheduler; // Request scheduling (`Scheduler`).
pub mod spider; // The `Spider` trait and start-request types.
pub mod state; // Shared runtime state primitives (counters, flags, concurrent collections).
pub mod stats; // Crawl statistics.
59
/// Dispatches parse logic according to the discovery rule name attached to a
/// response.
///
/// A small convenience for rule-based crawling: only the branch whose rule
/// matches ever consumes the response, so each arm is free to move it into a
/// dedicated parse helper. The arm list must end with a `_ => fallback` branch.
#[macro_export]
macro_rules! route_by_rule {
    // Terminal arm: no rule matched earlier, evaluate the fallback expression.
    // The optional trailing comma keeps call sites flexible.
    ($resp:expr, _ => $fallback:expr $(,)?) => {
        $fallback
    };
    // Recursive arm: test the first rule; on a miss, retry with the remaining
    // arms (which must still terminate in the `_ =>` branch above).
    ($resp:expr, $name:literal => $arm:expr, $($tail:tt)+) => {{
        if !$resp.matches_discovery_rule($name) {
            $crate::route_by_rule!($resp, $($tail)+)
        } else {
            $arm
        }
    }};
}
78
// Re-export SchedulerCheckpoint and Checkpoint (when checkpoint feature is enabled)
#[cfg(feature = "checkpoint")]
pub use checkpoint::{Checkpoint, SchedulerCheckpoint};

// Downloader stack from the companion `spider_downloader` crate.
pub use spider_downloader::{Downloader, HttpClient, ReqwestClientDownloader};

// Re-export CookieStore (when cookie-store feature is enabled)
#[cfg(feature = "cookie-store")]
pub use cookie_store::CookieStore;

// Core runtime types, flattened to the crate root for convenient importing.
pub use builder::CrawlerBuilder;
pub use config::{CrawlShapePreset, CrawlerConfig, DiscoveryConfig, DiscoveryMode, DiscoveryRule};
pub use engine::Crawler;
pub use scheduler::Scheduler;
pub use spider_macro::scraped_item;

// Third-party items that appear in this crate's public API (`async_trait` and
// `tokio` are used in the crate docs example), re-exported so downstream
// crates can reach them through `spider_core` directly.
pub use async_trait::async_trait;
pub use dashmap::DashMap;
pub use spider::{Spider, StartRequestIter, StartRequests};
pub use state::{
    ConcurrentMap, ConcurrentVec, Counter, Counter64, Flag, StateAccessMetrics, VisitedUrls,
};
pub use tokio;