1use std::time::Duration;
2
3use spider_lib::{
4 CrawlerBuilder, CsvExporterPipeline, DeduplicationPipeline, HttpCacheMiddleware, ParseOutput,
5 RefererMiddleware, Request, ReqwestClientDownloader, Response, RobotsTxtMiddleware,
6 ScrapedItem, Spider, SpiderError, UserAgentMiddleware, async_trait, scraped_item,
7 utils::ToSelector,
8};
9
10#[scraped_item]
11pub struct QuoteItem {
12 pub text: String,
13 pub author: String,
14 pub tags: String,
15}
16
/// Stateless spider that walks the paginated quote listing starting at
/// `http://quotes.toscrape.com/`.
pub struct QuotesSpider;
18
19#[async_trait]
20impl Spider for QuotesSpider {
21 type Item = QuoteItem;
22
23 fn start_urls(&self) -> Vec<&'static str> {
24 vec!["http://quotes.toscrape.com/"]
25 }
26
27 async fn parse(&mut self, response: Response) -> Result<ParseOutput<Self::Item>, SpiderError> {
28 let html = response.to_html()?;
29 let mut output = ParseOutput::new();
30
31 for quote_element in html.select(&".quote".to_selector()?) {
32 let text = quote_element
33 .select(&".text".to_selector()?)
34 .next()
35 .map(|e| e.text().collect::<String>().trim().to_string())
36 .unwrap_or_default();
37
38 let author = quote_element
39 .select(&".author".to_selector()?)
40 .next()
41 .map(|e| e.text().collect::<String>().trim().to_string())
42 .unwrap_or_default();
43
44 let tags: Vec<String> = quote_element
45 .select(&".tags .tag".to_selector()?)
46 .map(|e| e.text().collect::<String>().trim().to_string())
47 .collect();
48 let tags_str = tags.join(", ");
49
50 let item = QuoteItem {
51 text,
52 author,
53 tags: tags_str,
54 };
55 output.add_item(item);
56 }
57
58 if let Some(next_href) = html
59 .select(&".next > a".to_selector()?)
60 .next()
61 .and_then(|a| a.attr("href"))
62 {
63 let next_url = response.url.join(next_href)?;
64 let next_request = Request::new(next_url);
65 output.add_request(next_request);
66 }
67
68 Ok(output)
69 }
70}
71
72#[tokio::main]
73async fn main() -> Result<(), SpiderError> {
74 tracing_subscriber::fmt()
75 .with_env_filter("info,spider_lib=debug")
76 .without_time()
77 .init();
78
79 let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(QuotesSpider)
80 .add_pipeline(DeduplicationPipeline::new(&["text"]))
81 .add_pipeline(CsvExporterPipeline::<QuoteItem>::new("output/quotes.csv")?)
82 .add_middleware(HttpCacheMiddleware::builder().build()?)
83 .add_middleware(UserAgentMiddleware::builder().build()?)
84 .add_middleware(RobotsTxtMiddleware::new())
85 .add_middleware(
86 RefererMiddleware::new()
87 .same_origin_only(true)
88 .max_chain_length(100)
89 .include_fragment(false),
90 )
91 .with_checkpoint_path("output/quotes.bin")
92 .with_checkpoint_interval(Duration::from_secs(15))
93 .max_concurrent_downloads(5)
94 .max_parser_workers(2)
95 .max_concurrent_pipelines(2)
96 .build()
97 .await?;
98
99 crawler.start_crawl().await?;
100
101 Ok(())
102}