1use std::time::Duration;
13
14use spider_lib::{
15 CsvExporterPipeline, DeduplicationPipeline, HttpCacheMiddleware, RateLimitMiddleware,
16 RefererMiddleware, RetryMiddleware, RobotsTxtMiddleware, UserAgentMiddleware, prelude::*,
17 utils::ToSelector,
18};
19
20#[scraped_item]
21pub struct QuoteItem {
22 pub text: String,
23 pub author: String,
24 pub tags: String,
25}
26
/// Stateless spider that walks the paginated quote listing.
pub struct QuotesSpider;
28
29#[async_trait]
30impl Spider for QuotesSpider {
31 type Item = QuoteItem;
32
33 fn start_urls(&self) -> Vec<&'static str> {
34 vec!["http://quotes.toscrape.com/"]
35 }
36
37 async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
38 let html = response.to_html()?;
39 let mut output = ParseOutput::new();
40
41 for quote_element in html.select(&".quote".to_selector()?) {
42 let text = quote_element
43 .select(&".text".to_selector()?)
44 .next()
45 .map(|e| e.text().collect::<String>().trim().to_string())
46 .unwrap_or_default();
47
48 let author = quote_element
49 .select(&".author".to_selector()?)
50 .next()
51 .map(|e| e.text().collect::<String>().trim().to_string())
52 .unwrap_or_default();
53
54 let tags: Vec<String> = quote_element
55 .select(&".tags .tag".to_selector()?)
56 .map(|e| e.text().collect::<String>().trim().to_string())
57 .collect();
58 let tags_str = tags.join(", ");
59
60 let item = QuoteItem {
61 text,
62 author,
63 tags: tags_str,
64 };
65 output.add_item(item);
66 }
67
68 if let Some(next_href) = html
69 .select(&".next > a".to_selector()?)
70 .next()
71 .and_then(|a| a.attr("href"))
72 {
73 let next_url = response.url.join(next_href)?;
74 let next_request = Request::new(next_url);
75 output.add_request(next_request);
76 }
77
78 Ok(output)
79 }
80}
81
82#[tokio::main]
83async fn main() -> Result<(), SpiderError> {
84 tracing_subscriber::fmt()
85 .with_env_filter("info,spider_lib=debug")
86 .without_time()
87 .init();
88
89 let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(QuotesSpider)
90 .add_pipeline(DeduplicationPipeline::new(&["text"]))
91 .add_pipeline(CsvExporterPipeline::<QuoteItem>::new("output/quotes.csv")?)
92 .add_middleware(HttpCacheMiddleware::builder().build()?)
93 .add_middleware(
94 RateLimitMiddleware::builder()
95 .use_token_bucket_limiter(5)
96 .build(),
97 )
98 .add_middleware(RetryMiddleware::default().max_retries(2))
99 .add_middleware(UserAgentMiddleware::builder().build()?)
100 .add_middleware(RobotsTxtMiddleware::new())
101 .add_middleware(
102 RefererMiddleware::new()
103 .same_origin_only(true)
104 .max_chain_length(100)
105 .include_fragment(false),
106 )
107 .with_checkpoint_path("output/quotes.bin")
108 .with_checkpoint_interval(Duration::from_secs(15))
109 .max_concurrent_downloads(5)
110 .max_parser_workers(2)
111 .max_concurrent_pipelines(2)
112 .build()
113 .await?;
114
115 crawler.start_crawl().await?;
116
117 Ok(())
118}