use anyhow::Result;
use async_trait::async_trait;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use spider_lib::{
    CrawlOutput, EngineBuilder, JsonlWriterPipeline, Request, Response, ScrapedItem, Spider,
    SpiderError,
};
use std::any::Any;
use tracing::{debug, info};
use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::fmt;
use url::Url;

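/// One scraped quote: the quote text, the author's name, and its tag labels.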
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuoteItem {
    pub text: String,
    pub author: String,
    pub tags: Vec<String>,
}

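// Trait-object plumbing for spider_lib: downcasting via Any, cloning behind a trait
// object, and conversion to a serde_json::Value (e.g. for the JSONL writer pipeline).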
impl ScrapedItem for QuoteItem {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync> {
        Box::new(self.clone())
    }
    fn to_json_value(&self) -> Value {
        serde_json::to_value(self).expect("Failed to serialize QuoteItem to JSON Value")
    }
}

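/// Spider that crawls quotes.toscrape.com, yielding one QuoteItem per quote and
/// following the pagination link from page to page.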
pub struct MySpider;

#[async_trait]
impl Spider for MySpider {
    type Item = QuoteItem;

    fn name(&self) -> &'static str {
        "myspider"
    }

    fn start_urls(&self) -> Vec<Url> {
        vec![Url::parse("http://quotes.toscrape.com/").unwrap()]
    }

    async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
        debug!("Parsing response from {}", response.url);
        // The body may not be valid UTF-8; decode lossily instead of panicking.
        let html = Html::parse_document(&String::from_utf8_lossy(&response.body));
        // Compile the CSS selectors for the quote blocks and their parts.
        let quote_selector = Selector::parse("div.quote")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        let text_selector = Selector::parse("span.text")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        let author_selector = Selector::parse("small.author")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        let tags_selector = Selector::parse("div.tags a.tag")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;

        let mut output = CrawlOutput::new();

        // Extract every quote block on the page.
        for quote_element in html.select(&quote_selector) {
            let text = quote_element
                .select(&text_selector)
                .next()
                .map(|e| e.inner_html());
            let author = quote_element
                .select(&author_selector)
                .next()
                .map(|e| e.inner_html());
            let tags: Vec<String> = quote_element
                .select(&tags_selector)
                .map(|e| e.inner_html())
                .collect();

            // Only emit an item when both the text and the author were found.
            if let (Some(text), Some(author)) = (text, author) {
                output.add_item(QuoteItem {
                    text,
                    author,
                    tags,
                });
            }
        }

        // Follow the pagination link (li.next a) if present, dropping any URL fragment.
        let next_page_selector = Selector::parse("li.next a")
            .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
        if let Some(next_page_element) = html.select(&next_page_selector).next()
            && let Some(href) = next_page_element.value().attr("href")
        {
            let mut next_url = response.url.join(href)?;
            next_url.set_fragment(None);
            output.add_request(Request::new(next_url.clone(), self.name(), "parse"));
            debug!("Following next page: {}", next_url);
        }

        Ok(output)
    }
}

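// Entry point: set up tracing, attach a JSONL writer pipeline, and run the crawl.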
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
    let filter = EnvFilter::new("info,spider_lib=debug");
    fmt().with_env_filter(filter).init();

    info!("Starting RustScraper example...");

    let engine = EngineBuilder::new(MySpider)
        .add_pipeline(Box::new(JsonlWriterPipeline::<QuoteItem>::new(
            "result.jsonl",
        )?))
        .build()
        .await?;

    engine.start_crawl().await?;

    info!("RustScraper example finished.");

    Ok(())
}
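
// To try it: `cargo run` (adjust the target name to however this example is packaged).
// Each scraped quote should end up as one JSON object per line in result.jsonl; the
// exact file-writing behavior is whatever spider_lib's JsonlWriterPipeline implements.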