use anyhow::Result;
use async_trait::async_trait;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use spider_lib::{
    CrawlOutput, CsvExporterPipeline, EngineBuilder, Request, Response, ScrapedItem, Spider,
    SpiderError,
};
use std::any::Any;
use tracing::info;
use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::fmt;
use url::Url;

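/// A single book record scraped from a detail page; exported as one CSV row.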
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BookItem {
    pub title: String,
    pub price: String,
    pub availability: String,
    pub rating: String,
    pub url: String,
}

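// Glue so the engine can handle items behind a trait object: downcasting, cloning,
// and conversion to a JSON value for the export pipelines.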
impl ScrapedItem for BookItem {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync> {
        Box::new(self.clone())
    }
    fn to_json_value(&self) -> Value {
        serde_json::to_value(self).expect("Failed to serialize BookItem to JSON Value")
    }
}

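/// Spider for http://books.toscrape.com/: follows listing pages and scrapes every book detail page.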
pub struct BooksSpider;

#[async_trait]
impl Spider for BooksSpider {
    type Item = BookItem;

    fn name(&self) -> &'static str {
        "books"
    }

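    // The crawl is seeded with the front page; book and pagination links are discovered in parse().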
    fn start_urls(&self) -> Vec<Url> {
        vec![Url::parse("http://books.toscrape.com/").unwrap()]
    }

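    /// Routes each response: book detail pages are scraped into `BookItem`s,
    /// while listing pages only yield follow-up requests.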
    async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
        // Lossy decoding avoids panicking if the response body is not valid UTF-8.
        let html = Html::parse_document(&String::from_utf8_lossy(&response.body));
        let mut output = CrawlOutput::new();

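        // Book detail pages look like /catalogue/<slug>/index.html; everything else is a listing.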
        if response.url.path().ends_with("index.html") && response.url.path().contains("catalogue")
        {
            let title = html
                .select(&Selector::parse("h1").unwrap())
                .next()
                .unwrap()
                .inner_html();
            let price = html
                .select(&Selector::parse("p.price_color").unwrap())
                .next()
                .unwrap()
                .inner_html();
            let availability = html
                .select(&Selector::parse("p.instock.availability").unwrap())
                .next()
                .unwrap()
                .text()
                .collect::<String>()
                .trim()
                .to_string();
            // The rating is encoded as the last CSS class, e.g. `class="star-rating Three"`.
            let rating = html
                .select(&Selector::parse("p.star-rating").unwrap())
                .next()
                .unwrap()
                .value()
                .attr("class")
                .unwrap()
                .split_whitespace()
                .last()
                .unwrap()
                .to_string();

            output.add_item(BookItem {
                title,
                price,
                availability,
                rating,
                url: response.url.to_string(),
            });
        } else {
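            // Listing page: queue every book link plus the "next" pagination link for parsing.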
            let book_selector = Selector::parse("article.product_pod h3 a").unwrap();
            let next_page_selector = Selector::parse("li.next a").unwrap();

            let book_links = html.select(&book_selector);
            let next_page_link = html.select(&next_page_selector);

            for link in book_links.chain(next_page_link) {
                if let Some(href) = link.value().attr("href") {
                    let mut url = response.url.join(href)?;
                    url.set_fragment(None);
                    output.add_request(Request::new(url, self.name(), "parse"));
                }
            }
        }

        Ok(output)
    }
}

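// Builds the engine around the books spider: items are exported to books.csv via the CSV
// pipeline, with at most 10 concurrent downloads.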
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
    let filter = EnvFilter::new("info,spider_lib=debug");
    fmt().with_env_filter(filter).init();

    info!("Starting RustScraper example...");

    let engine = EngineBuilder::new(BooksSpider)
        .add_pipeline(Box::new(CsvExporterPipeline::new("books.csv")))
        .max_concurrent_downloads(10)
        .build()
        .await?;

    engine.start_crawl().await?;

    info!("RustScraper example finished.");

    Ok(())
}