books/books.rs
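//! Example spider for http://books.toscrape.com/ built on spider_lib: it
//! crawls the catalogue, scrapes a BookItem from every book detail page, and
//! exports the results to books.csv through a CsvExporterPipeline.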

use anyhow::Result;
use async_trait::async_trait;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use spider_lib::{
    CrawlOutput, CsvExporterPipeline, EngineBuilder, Request, Response, ScrapedItem, Spider,
    SpiderError,
};
use std::any::Any;
use tracing::info;
use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::fmt;
use url::Url;

// Item struct specific to this example.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BookItem {
    pub title: String,
    pub price: String,
    pub availability: String,
    pub rating: String,
    pub url: String,
}

impl ScrapedItem for BookItem {
    // Lets pipelines downcast the trait object back to a concrete BookItem.
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync> {
        Box::new(self.clone())
    }
    // Converts the item to a serde_json::Value for generic item handling.
    fn to_json_value(&self) -> Value {
        serde_json::to_value(self).expect("Failed to serialize BookItem to JSON Value")
    }
}

// Custom Spider implementation for the example. The spider holds no state,
// so a unit struct suffices.
pub struct BooksSpider;

#[async_trait]
impl Spider for BooksSpider {
    type Item = BookItem;

    fn name(&self) -> &'static str {
        "books"
    }

    fn start_urls(&self) -> Vec<Url> {
        // A known-valid literal URL, so unwrap() cannot fail here.
        vec![Url::parse("http://books.toscrape.com/").unwrap()]
    }

    async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
        // Decode leniently: invalid UTF-8 sequences are replaced instead of panicking.
        let body = String::from_utf8_lossy(&response.body);
        let html = Html::parse_document(&body);
        let mut output = CrawlOutput::new();

        // Book detail pages look like /catalogue/<slug>_<id>/index.html;
        // everything else (the site root, /catalogue/page-N.html) is a list page.
        if response.url.path().ends_with("index.html") && response.url.path().contains("catalogue")
        {
            // Book page: extract the item's fields.
            let title = html
                .select(&Selector::parse("h1").unwrap())
                .next()
                .unwrap()
                .inner_html();
            let price = html
                .select(&Selector::parse("p.price_color").unwrap())
                .next()
                .unwrap()
                .inner_html();
            let availability = html
                .select(&Selector::parse("p.instock.availability").unwrap())
                .next()
                .unwrap()
                .text()
                .collect::<String>()
                .trim()
                .to_string();
            // The rating is encoded as the element's second class name,
            // e.g. class="star-rating Three".
            let rating = html
                .select(&Selector::parse("p.star-rating").unwrap())
                .next()
                .unwrap()
                .value()
                .attr("class")
                .unwrap()
                .split_whitespace()
                .last()
                .unwrap()
                .to_string();

            output.add_item(BookItem {
                title,
                price,
                availability,
                rating,
                url: response.url.to_string(),
            });
        } else {
            // Book list page: follow every book link plus the "next" pagination link.
            let book_selector = Selector::parse("article.product_pod h3 a").unwrap();
            let next_page_selector = Selector::parse("li.next a").unwrap();

            let book_links = html.select(&book_selector);
            let next_page_link = html.select(&next_page_selector);

            for link in book_links.chain(next_page_link) {
                if let Some(href) = link.value().attr("href") {
                    // Resolve relative hrefs against the current page URL.
                    let mut url = response.url.join(href)?;
                    url.set_fragment(None);
                    // Route the new request back to this spider's "parse" callback.
                    output.add_request(Request::new(url, self.name(), "parse"));
                }
            }
        }

        Ok(output)
    }
}
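
// How the pieces fit together (as implied by the types; see spider_lib's
// docs for the authoritative behavior): the engine seeds its queue from
// start_urls(), calls parse() on each downloaded Response, sends emitted
// items through the configured pipelines, and schedules any Requests
// returned in the CrawlOutput.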

#[tokio::main]
async fn main() -> Result<()> {
    // Log at info level globally and at debug level for spider_lib itself.
    let filter = EnvFilter::new("info,spider_lib=debug");
    fmt().with_env_filter(filter).init();

    info!("Starting RustScraper example...");

    let engine = EngineBuilder::new(BooksSpider)
        .add_pipeline(Box::new(CsvExporterPipeline::new("books.csv")))
        .max_concurrent_downloads(10)
        .build()
        .await?;

    engine.start_crawl().await?;

    info!("RustScraper example finished.");

    Ok(())
}
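
// Running the example (assuming it is wired up as a Cargo example named
// "books"; adjust the command to your project layout):
//
//   cargo run --example books
//
// The crawl writes one row per scraped book to books.csv; assuming the
// exporter follows the struct's field order, the columns are
// title, price, availability, rating, url.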