Skip to main content

spider_lib/
lib.rs

1//! # spider-lib
2//!
3//! A Rust-based web scraping framework inspired by Scrapy.
4//!
5//! `spider-lib` is an asynchronous web scraping library for Rust.
6//! It integrates the core engine, macros, middleware, and pipelines into a single library.
7//!
8//! ## Quick Start
9//!
10//! ```toml
11//! [dependencies]
12//! spider-lib = "1.1.1"
13//! serde = { version = "1.0", features = ["derive"] }
14//! serde_json = "1.0"
15//! ```
16//!
17//! ```rust,ignore
18//! use spider_lib::prelude::*;
19//! use std::sync::Arc;
20//! use std::sync::atomic::{AtomicUsize, Ordering};
21//! use dashmap::DashMap;
22//!
23//! #[scraped_item]
24//! struct Quote {
25//!     text: String,
26//!     author: String,
27//! }
28//!
29//! struct QuotesSpider;
30//!
31//! // State for tracking information during crawling
32//! #[derive(Clone, Default)]
33//! struct QuotesSpiderState {
34//!     page_count: Arc<AtomicUsize>,
35//!     visited_urls: Arc<DashMap<String, bool>>,
36//! }
37//!
38//! impl QuotesSpiderState {
39//!     fn increment_page_count(&self) {
40//!         self.page_count.fetch_add(1, Ordering::SeqCst);
41//!     }
42//!     
43//!     fn mark_url_visited(&self, url: String) {
44//!         self.visited_urls.insert(url, true);
45//!     }
46//! }
47//!
48//! #[async_trait]
49//! impl Spider for QuotesSpider {
50//!     type Item = Quote;
51//!     type State = QuotesSpiderState;
52//!
53//!     fn start_urls(&self) -> Vec<&'static str> {
54//!         vec!["http://quotes.toscrape.com/"]
55//!     }
56//!
57//!     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
58//!         // Update state - can be done concurrently without blocking the spider
59//!         state.increment_page_count();
60//!         state.mark_url_visited(response.url.to_string());
61//!         
62//!         // parsing logic
63//!         todo!()
64//!     }
65//! }
66//! ```
67//!
68//! **Note**: The `Spider` implementation now takes an immutable reference (`&self`)
69//! and receives a separate state parameter (`state: &Self::State`). This enables more efficient
70//! concurrent crawling by eliminating the need for mutex locks on the spider itself.
71
72pub mod prelude;
73pub use prelude::*;
74
75// Re-export procedural macros
76pub use spider_macro::scraped_item;