spider_lib/lib.rs
1//! # spider-lib
2//!
3//! A Rust-based web scraping framework inspired by Scrapy.
4//!
5//! `spider-lib` is an asynchronous web scraping library for Rust.
6//! It integrates the core engine, macros, middleware, and pipelines into a single library.
7//!
8//! ## Quick Start
9//!
10//! ```toml
11//! [dependencies]
12//! spider-lib = "1.1.1"
13//! serde = { version = "1.0", features = ["derive"] }
14//! serde_json = "1.0"
15//! ```
16//!
17//! ```rust,ignore
18//! use spider_lib::prelude::*;
19//! use std::sync::Arc;
20//! use std::sync::atomic::{AtomicUsize, Ordering};
21//! use dashmap::DashMap;
22//!
23//! #[scraped_item]
24//! struct Quote {
25//!     text: String,
26//!     author: String,
27//! }
28//!
29//! struct QuotesSpider;
30//!
31//! // State for tracking information during crawling
32//! #[derive(Clone, Default)]
33//! struct QuotesSpiderState {
34//!     page_count: Arc<AtomicUsize>,
35//!     visited_urls: Arc<DashMap<String, bool>>,
36//! }
37//!
38//! impl QuotesSpiderState {
39//!     fn increment_page_count(&self) {
40//!         self.page_count.fetch_add(1, Ordering::SeqCst);
41//!     }
42//!
43//!     fn mark_url_visited(&self, url: String) {
44//!         self.visited_urls.insert(url, true);
45//!     }
46//! }
47//!
48//! #[async_trait]
49//! impl Spider for QuotesSpider {
50//!     type Item = Quote;
51//!     type State = QuotesSpiderState;
52//!
53//!     fn start_urls(&self) -> Vec<&'static str> {
54//!         vec!["http://quotes.toscrape.com/"]
55//!     }
56//!
57//!     async fn parse(&self, response: Response, state: &Self::State) -> Result<ParseOutput<Self::Item>, SpiderError> {
58//!         // Update state - can be done concurrently without blocking the spider
59//!         state.increment_page_count();
60//!         state.mark_url_visited(response.url.to_string());
61//!
62//!         // parsing logic
63//!         todo!()
64//!     }
65//! }
66//! ```
67//!
68//! **Note**: The `Spider` implementation now takes an immutable reference (`&self`) and
69//! receives a separate state parameter (`state: &Self::State`). This enables more efficient
70//! concurrent crawling by eliminating the need for mutex locks on the spider itself.
71
72pub mod prelude;
73pub use prelude::*;