
quotes/quotes_scraper.rs
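A complete quotes.toscrape.com spider: it collects each quote's text, author, and tags, follows the pagination links, and exports deduplicated rows to output/quotes.csv.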

use std::time::Duration;

use spider_lib::{
    CrawlerBuilder, CsvExporterPipeline, DeduplicationPipeline, HttpCacheMiddleware, ParseOutput,
    RefererMiddleware, Request, ReqwestClientDownloader, Response, RobotsTxtMiddleware,
    ScrapedItem, Spider, SpiderError, UserAgentMiddleware, async_trait, scraped_item,
    utils::ToSelector,
};

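// One scraped quote; `tags` flattens the tag list into a single comma-separated string.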
#[scraped_item]
pub struct QuoteItem {
    pub text: String,
    pub author: String,
    pub tags: String,
}

pub struct QuotesSpider;

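// The spider itself: start_urls seeds the crawl, and parse is called for each downloaded response.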
#[async_trait]
impl Spider for QuotesSpider {
    type Item = QuoteItem;

    fn start_urls(&self) -> Vec<&'static str> {
        vec!["http://quotes.toscrape.com/"]
    }

    async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
        let html = response.to_html()?;
        let mut output = ParseOutput::new();

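        // Each `.quote` element on the page holds one quote's text, author, and tag list.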
        for quote_element in html.select(&".quote".to_selector()?) {
            let text = quote_element
                .select(&".text".to_selector()?)
                .next()
                .map(|e| e.text().collect::<String>().trim().to_string())
                .unwrap_or_default();

            let author = quote_element
                .select(&".author".to_selector()?)
                .next()
                .map(|e| e.text().collect::<String>().trim().to_string())
                .unwrap_or_default();

            let tags: Vec<String> = quote_element
                .select(&".tags .tag".to_selector()?)
                .map(|e| e.text().collect::<String>().trim().to_string())
                .collect();
            let tags_str = tags.join(", ");

            let item = QuoteItem {
                text,
                author,
                tags: tags_str,
            };
            output.add_item(item);
        }

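        // Pagination: if a ".next > a" link exists, resolve its relative href against the
        // current URL and queue it as a follow-up request.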
        if let Some(next_href) = html
            .select(&".next > a".to_selector()?)
            .next()
            .and_then(|a| a.attr("href"))
        {
            let next_url = response.url.join(next_href)?;
            let next_request = Request::new(next_url);
            output.add_request(next_request);
        }

        Ok(output)
    }
}

#[tokio::main]
async fn main() -> Result<(), SpiderError> {
    tracing_subscriber::fmt()
        .with_env_filter("info,spider_lib=debug")
        .without_time()
        .init();

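    // Assemble the crawler: pipelines post-process scraped items (dedup on the "text"
    // field, then CSV export), while middlewares sit on the request/response path.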
    let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(QuotesSpider)
        .add_pipeline(DeduplicationPipeline::new(&["text"]))
        .add_pipeline(CsvExporterPipeline::<QuoteItem>::new("output/quotes.csv")?)
        .add_middleware(HttpCacheMiddleware::builder().build()?)
        .add_middleware(UserAgentMiddleware::builder().build()?)
        .add_middleware(RobotsTxtMiddleware::new())
        .add_middleware(
            RefererMiddleware::new()
                .same_origin_only(true)
                .max_chain_length(100)
                .include_fragment(false),
        )
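        // Snapshot crawl state to disk every 15 seconds and cap download,
        // parse, and pipeline concurrency.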
        .with_checkpoint_path("output/quotes.bin")
        .with_checkpoint_interval(Duration::from_secs(15))
        .max_concurrent_downloads(5)
        .max_parser_workers(2)
        .max_concurrent_pipelines(2)
        .build()
        .await?;

    crawler.start_crawl().await?;

    Ok(())
}