quotes/quotes.rs

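//! Example spider: crawls http://quotes.toscrape.com/, extracts each quote's
//! text, author, and tags, follows the "Next" pagination link, and writes the
//! results to `result.jsonl` through a `JsonlWriterPipeline`.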
use anyhow::Result;
use async_trait::async_trait;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use spider_lib::{
    CrawlOutput, EngineBuilder, JsonlWriterPipeline, Request, Response, ScrapedItem, Spider,
    SpiderError,
};
use std::any::Any;
use tracing::{debug, info};
use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::fmt;
use url::Url;
15
// The item type this example scrapes: one quote with its author and tags.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuoteItem {
    pub text: String,
    pub author: String,
    pub tags: Vec<String>,
}
23
impl ScrapedItem for QuoteItem {
    // Expose the concrete type so callers can downcast the trait object.
    fn as_any(&self) -> &dyn Any {
        self
    }
    // Clone through the trait object, since `Clone` itself is not object-safe.
    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync> {
        Box::new(self.clone())
    }
    // Serialization cannot fail for this plain struct, so `expect` is safe here.
    fn to_json_value(&self) -> Value {
        serde_json::to_value(self).expect("Failed to serialize QuoteItem to JSON Value")
    }
}
35
// Custom spider implementation for the example.
pub struct MySpider;

#[async_trait]
impl Spider for MySpider {
    type Item = QuoteItem;

    fn name(&self) -> &'static str {
        "myspider"
    }

    fn start_urls(&self) -> Vec<Url> {
        // The literal is known to be valid, so this `expect` cannot fire.
        vec![Url::parse("http://quotes.toscrape.com/").expect("valid start URL")]
    }

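    // Called by the engine for each fetched page: extract every quote on the
    // page, then queue a request for the next page if one is linked.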
    async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
        debug!("Parsing response from {}", response.url);
        // Decode the body leniently: `from_utf8_lossy` substitutes invalid
        // bytes instead of panicking the way `unwrap` on `from_utf8` would.
        let html = Html::parse_document(&String::from_utf8_lossy(&response.body));
        let quote_selector = Selector::parse("div.quote")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        let text_selector = Selector::parse("span.text")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        let author_selector = Selector::parse("small.author")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        let tags_selector = Selector::parse("div.tags a.tag")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;

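        // Collect one QuoteItem per quote block; skip blocks missing text or author.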
        let mut output = CrawlOutput::new();

        for quote_element in html.select(&quote_selector) {
            // `text()` concatenates the decoded text nodes; `inner_html()`
            // would leave HTML entities such as `&#39;` escaped in the output.
            let text = quote_element
                .select(&text_selector)
                .next()
                .map(|e| e.text().collect::<String>());
            let author = quote_element
                .select(&author_selector)
                .next()
                .map(|e| e.text().collect::<String>());
            let tags: Vec<String> = quote_element
                .select(&tags_selector)
                .map(|e| e.text().collect::<String>())
                .collect();

            if let (Some(text), Some(author)) = (text, author) {
                output.add_item(QuoteItem { text, author, tags });
            }
        }

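        // quotes.toscrape.com paginates via an `li.next a` link; follow it
        // until the last page, where the link is absent.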
        let next_page_selector = Selector::parse("li.next a")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        if let Some(next_page_element) = html.select(&next_page_selector).next()
            && let Some(href) = next_page_element.value().attr("href")
        {
            // Resolve the relative href against the current page and drop any
            // `#fragment` so the same page is not queued twice under two names.
            let mut next_url = response.url.join(href)?;
            next_url.set_fragment(None);
            output.add_request(Request::new(next_url.clone(), self.name(), "parse"));
            debug!("Following next page: {}", next_url);
        }

        Ok(output)
    }
}

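// Entry point: configure logging, build the engine around `MySpider`, attach a
// JSONL writer pipeline, and run the crawl to completion.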
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
    // Log at `info` globally, but at `debug` for the spider_lib crate itself.
    let filter = EnvFilter::new("info,spider_lib=debug");
    fmt().with_env_filter(filter).init();

    info!("Starting RustScraper example...");

    let engine = EngineBuilder::new(MySpider)
        .add_pipeline(Box::new(JsonlWriterPipeline::<QuoteItem>::new(
            "result.jsonl",
        )?))
        .build()
        .await?;

    engine.start_crawl().await?;

    info!("RustScraper example finished.");

    Ok(())
}
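
// A successful run appends one JSON object per quote to result.jsonl, along
// the lines of (illustrative values; the real text comes from the site):
// {"text":"...","author":"Albert Einstein","tags":["change","deep-thoughts"]}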