quotes/quotes_scraper.rs

//! An example web scraper for quotes.toscrape.com using the `spider-lib` framework.
//!
//! This example demonstrates how to:
//! - Define a `ScrapedItem` to structure extracted data.
//! - Implement a `Spider` to define start URLs and parsing logic.
//! - Configure a `Crawler` with various built-in middlewares (e.g., HTTP caching,
//!   rate limiting, retries, user-agent rotation, robots.txt, referer) and pipelines
//!   (e.g., deduplication, CSV export).
//! - Enable checkpointing for crawl state persistence.
//!
//! The scraper extracts quotes, authors, and tags from the website and saves them to a CSV file.
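
// Assuming this file is registered as a Cargo example (e.g. under the crate's
// examples/ directory), it can typically be run with:
//   cargo run --example quotes_scraper
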
use std::time::Duration;

use spider_lib::{
    CsvExporterPipeline, DeduplicationPipeline, HttpCacheMiddleware, RateLimitMiddleware,
    RefererMiddleware, RetryMiddleware, RobotsTxtMiddleware, UserAgentMiddleware, prelude::*,
    utils::ToSelector,
};

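/// One quote extracted from a page.
///
/// The tags are joined into a single comma-separated `String` so each item maps
/// onto one flat row in the exported CSV file.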
#[scraped_item]
pub struct QuoteItem {
    pub text: String,
    pub author: String,
    pub tags: String,
}

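/// Spider for quotes.toscrape.com: starts at the home page and follows the
/// "Next" pagination link until no further pages remain.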
pub struct QuotesSpider;

#[async_trait]
impl Spider for QuotesSpider {
    type Item = QuoteItem;

    fn start_urls(&self) -> Vec<&'static str> {
        vec!["http://quotes.toscrape.com/"]
    }

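    /// Parse a single response page: emit one `QuoteItem` per quote block found
    /// and, when a "Next" link is present, queue a request for the following page.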
    async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
        let html = response.to_html()?;
        let mut output = ParseOutput::new();

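        // Each quote on the page lives in a block that looks roughly like:
        //   <div class="quote">
        //     <span class="text">"…"</span>
        //     <small class="author">…</small>
        //     <div class="tags"><a class="tag">…</a> …</div>
        //   </div>
        // The selectors below pull out each of those pieces.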
        for quote_element in html.select(&".quote".to_selector()?) {
            let text = quote_element
                .select(&".text".to_selector()?)
                .next()
                .map(|e| e.text().collect::<String>().trim().to_string())
                .unwrap_or_default();

            let author = quote_element
                .select(&".author".to_selector()?)
                .next()
                .map(|e| e.text().collect::<String>().trim().to_string())
                .unwrap_or_default();

            let tags: Vec<String> = quote_element
                .select(&".tags .tag".to_selector()?)
                .map(|e| e.text().collect::<String>().trim().to_string())
                .collect();
            let tags_str = tags.join(", ");

            let item = QuoteItem {
                text,
                author,
                tags: tags_str,
            };
            output.add_item(item);
        }

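        // Follow pagination: the "Next" link, when present, holds a relative href
        // (e.g. "/page/2/"), so it is joined against the current response URL to
        // build an absolute URL for the follow-up request.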
        if let Some(next_href) = html
            .select(&".next > a".to_selector()?)
            .next()
            .and_then(|a| a.attr("href"))
        {
            let next_url = response.url.join(next_href)?;
            let next_request = Request::new(next_url);
            output.add_request(next_request);
        }

        Ok(output)
    }
}

#[tokio::main]
async fn main() -> Result<(), SpiderError> {
    tracing_subscriber::fmt()
        .with_env_filter("info,spider_lib=debug")
        .without_time()
        .init();

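    // Build the crawler around QuotesSpider; ReqwestClientDownloader is the
    // reqwest-backed downloader used to fetch pages over HTTP.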
    let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(QuotesSpider)
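        // Item pipelines: drop duplicate quotes (keyed on the `text` field) and
        // export the remaining items to output/quotes.csv.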
        .add_pipeline(DeduplicationPipeline::new(&["text"]))
        .add_pipeline(CsvExporterPipeline::<QuoteItem>::new("output/quotes.csv")?)
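        // Request/response middlewares: cache HTTP responses, throttle with a
        // token-bucket rate limiter, retry failed requests up to twice, rotate the
        // User-Agent header, respect robots.txt, and attach a Referer header to
        // follow-up requests (same-origin only, fragments excluded, referer chain
        // capped at 100).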
        .add_middleware(HttpCacheMiddleware::builder().build()?)
        .add_middleware(
            RateLimitMiddleware::builder()
                .use_token_bucket_limiter(5)
                .build(),
        )
        .add_middleware(RetryMiddleware::default().max_retries(2))
        .add_middleware(UserAgentMiddleware::builder().build()?)
        .add_middleware(RobotsTxtMiddleware::new())
        .add_middleware(
            RefererMiddleware::new()
                .same_origin_only(true)
                .max_chain_length(100)
                .include_fragment(false),
        )
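        // Checkpointing: persist crawl state to output/quotes.bin every 15 seconds
        // so an interrupted crawl can be resumed rather than restarted from scratch.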
        .with_checkpoint_path("output/quotes.bin")
        .with_checkpoint_interval(Duration::from_secs(15))
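        // Concurrency limits for the three stages: downloads, parsing, and pipelines.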
        .max_concurrent_downloads(5)
        .max_parser_workers(2)
        .max_concurrent_pipelines(2)
        .build()
        .await?;

    crawler.start_crawl().await?;

    Ok(())
}