pub struct CrawlOutput<I> {
pub items: Vec<I>,
pub requests: Vec<Request>,
}
Fields§
§items: Vec<I>
§requests: Vec<Request>
Implementations§
Source§impl<I> CrawlOutput<I>
impl<I> CrawlOutput<I>
Sourcepub fn new() -> Self
pub fn new() -> Self
Examples found in repository?
examples/quotes.rs (line 63)
51 async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
52 debug!("Parsing response from {}", response.url);
53 let html = Html::parse_document(std::str::from_utf8(&response.body).unwrap());
54 let quote_selector = Selector::parse("div.quote")
55 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
56 let text_selector = Selector::parse("span.text")
57 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
58 let author_selector = Selector::parse("small.author")
59 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
60 let tags_selector = Selector::parse("div.tags a.tag")
61 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
62
63 let mut output = CrawlOutput::new();
64
65 for quote_element in html.select(&quote_selector) {
66 let text = quote_element
67 .select(&text_selector)
68 .next()
69 .map(|e| e.inner_html());
70 let author = quote_element
71 .select(&author_selector)
72 .next()
73 .map(|e| e.inner_html());
74 let tags: Vec<String> = quote_element
75 .select(&tags_selector)
76 .map(|e| e.inner_html())
77 .collect();
78
79 if let (Some(text), Some(author)) = (text, author) {
80 output.add_item(QuoteItem {
81 text,
82 author,
83 tags,
84 });
85 }
86 }
87
88 let next_page_selector = Selector::parse("li.next a")
89 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
90 if let Some(next_page_element) = html.select(&next_page_selector).next()
91 && let Some(href) = next_page_element.value().attr("href")
92 {
93 let mut next_url = response.url.join(href)?;
94 next_url.set_fragment(None);
95 output.add_request(Request::new(next_url.clone(), self.name(), "parse"));
96 debug!("Following next page: {}", next_url);
97 }
98
99 Ok(output)
100 }
More examples
examples/books.rs (line 55)
53 async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
54 let html = Html::parse_document(std::str::from_utf8(&response.body).unwrap());
55 let mut output = CrawlOutput::new();
56
57 if response.url.path().ends_with("index.html") && response.url.path().contains("catalogue")
58 {
59 // Book page
60 let title = html
61 .select(&Selector::parse("h1").unwrap())
62 .next()
63 .unwrap()
64 .inner_html();
65 let price = html
66 .select(&Selector::parse("p.price_color").unwrap())
67 .next()
68 .unwrap()
69 .inner_html();
70 let availability = html
71 .select(&Selector::parse("p.instock.availability").unwrap())
72 .next()
73 .unwrap()
74 .text()
75 .collect::<String>()
76 .trim()
77 .to_string();
78 let rating = html
79 .select(&Selector::parse("p.star-rating").unwrap())
80 .next()
81 .unwrap()
82 .value()
83 .attr("class")
84 .unwrap()
85 .split_whitespace()
86 .last()
87 .unwrap()
88 .to_string();
89
90 output.add_item(BookItem {
91 title,
92 price,
93 availability,
94 rating,
95 url: response.url.to_string(),
96 });
97 } else {
98 // Book list page
99 let book_selector = Selector::parse("article.product_pod h3 a").unwrap();
100 let next_page_selector = Selector::parse("li.next a").unwrap();
101
102 let book_links = html.select(&book_selector);
103 let next_page_link = html.select(&next_page_selector);
104
105 for link in book_links.chain(next_page_link) {
106 if let Some(href) = link.value().attr("href") {
107 let mut url = response.url.join(href)?;
108 url.set_fragment(None);
109 output.add_request(Request::new(url.clone(), self.name(), "parse"));
110 }
111 }
112 }
113
114 Ok(output)
115 }
Sourcepub fn add_item(&mut self, item: I)
pub fn add_item(&mut self, item: I)
Examples found in repository?
examples/quotes.rs (lines 80-84)
51 async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
52 debug!("Parsing response from {}", response.url);
53 let html = Html::parse_document(std::str::from_utf8(&response.body).unwrap());
54 let quote_selector = Selector::parse("div.quote")
55 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
56 let text_selector = Selector::parse("span.text")
57 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
58 let author_selector = Selector::parse("small.author")
59 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
60 let tags_selector = Selector::parse("div.tags a.tag")
61 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
62
63 let mut output = CrawlOutput::new();
64
65 for quote_element in html.select(&quote_selector) {
66 let text = quote_element
67 .select(&text_selector)
68 .next()
69 .map(|e| e.inner_html());
70 let author = quote_element
71 .select(&author_selector)
72 .next()
73 .map(|e| e.inner_html());
74 let tags: Vec<String> = quote_element
75 .select(&tags_selector)
76 .map(|e| e.inner_html())
77 .collect();
78
79 if let (Some(text), Some(author)) = (text, author) {
80 output.add_item(QuoteItem {
81 text,
82 author,
83 tags,
84 });
85 }
86 }
87
88 let next_page_selector = Selector::parse("li.next a")
89 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
90 if let Some(next_page_element) = html.select(&next_page_selector).next()
91 && let Some(href) = next_page_element.value().attr("href")
92 {
93 let mut next_url = response.url.join(href)?;
94 next_url.set_fragment(None);
95 output.add_request(Request::new(next_url.clone(), self.name(), "parse"));
96 debug!("Following next page: {}", next_url);
97 }
98
99 Ok(output)
100 }
More examples
examples/books.rs (lines 90-96)
53 async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
54 let html = Html::parse_document(std::str::from_utf8(&response.body).unwrap());
55 let mut output = CrawlOutput::new();
56
57 if response.url.path().ends_with("index.html") && response.url.path().contains("catalogue")
58 {
59 // Book page
60 let title = html
61 .select(&Selector::parse("h1").unwrap())
62 .next()
63 .unwrap()
64 .inner_html();
65 let price = html
66 .select(&Selector::parse("p.price_color").unwrap())
67 .next()
68 .unwrap()
69 .inner_html();
70 let availability = html
71 .select(&Selector::parse("p.instock.availability").unwrap())
72 .next()
73 .unwrap()
74 .text()
75 .collect::<String>()
76 .trim()
77 .to_string();
78 let rating = html
79 .select(&Selector::parse("p.star-rating").unwrap())
80 .next()
81 .unwrap()
82 .value()
83 .attr("class")
84 .unwrap()
85 .split_whitespace()
86 .last()
87 .unwrap()
88 .to_string();
89
90 output.add_item(BookItem {
91 title,
92 price,
93 availability,
94 rating,
95 url: response.url.to_string(),
96 });
97 } else {
98 // Book list page
99 let book_selector = Selector::parse("article.product_pod h3 a").unwrap();
100 let next_page_selector = Selector::parse("li.next a").unwrap();
101
102 let book_links = html.select(&book_selector);
103 let next_page_link = html.select(&next_page_selector);
104
105 for link in book_links.chain(next_page_link) {
106 if let Some(href) = link.value().attr("href") {
107 let mut url = response.url.join(href)?;
108 url.set_fragment(None);
109 output.add_request(Request::new(url.clone(), self.name(), "parse"));
110 }
111 }
112 }
113
114 Ok(output)
115 }
Sourcepub fn add_request(&mut self, request: Request)
pub fn add_request(&mut self, request: Request)
Examples found in repository?
examples/quotes.rs (line 95)
51 async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
52 debug!("Parsing response from {}", response.url);
53 let html = Html::parse_document(std::str::from_utf8(&response.body).unwrap());
54 let quote_selector = Selector::parse("div.quote")
55 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
56 let text_selector = Selector::parse("span.text")
57 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
58 let author_selector = Selector::parse("small.author")
59 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
60 let tags_selector = Selector::parse("div.tags a.tag")
61 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
62
63 let mut output = CrawlOutput::new();
64
65 for quote_element in html.select(&quote_selector) {
66 let text = quote_element
67 .select(&text_selector)
68 .next()
69 .map(|e| e.inner_html());
70 let author = quote_element
71 .select(&author_selector)
72 .next()
73 .map(|e| e.inner_html());
74 let tags: Vec<String> = quote_element
75 .select(&tags_selector)
76 .map(|e| e.inner_html())
77 .collect();
78
79 if let (Some(text), Some(author)) = (text, author) {
80 output.add_item(QuoteItem {
81 text,
82 author,
83 tags,
84 });
85 }
86 }
87
88 let next_page_selector = Selector::parse("li.next a")
89 .map_err(|e| SpiderError::HtmlParseError(format!("{:?}", e)))?;
90 if let Some(next_page_element) = html.select(&next_page_selector).next()
91 && let Some(href) = next_page_element.value().attr("href")
92 {
93 let mut next_url = response.url.join(href)?;
94 next_url.set_fragment(None);
95 output.add_request(Request::new(next_url.clone(), self.name(), "parse"));
96 debug!("Following next page: {}", next_url);
97 }
98
99 Ok(output)
100 }
More examples
examples/books.rs (line 109)
53 async fn parse(&self, response: Response) -> Result<CrawlOutput<Self::Item>, SpiderError> {
54 let html = Html::parse_document(std::str::from_utf8(&response.body).unwrap());
55 let mut output = CrawlOutput::new();
56
57 if response.url.path().ends_with("index.html") && response.url.path().contains("catalogue")
58 {
59 // Book page
60 let title = html
61 .select(&Selector::parse("h1").unwrap())
62 .next()
63 .unwrap()
64 .inner_html();
65 let price = html
66 .select(&Selector::parse("p.price_color").unwrap())
67 .next()
68 .unwrap()
69 .inner_html();
70 let availability = html
71 .select(&Selector::parse("p.instock.availability").unwrap())
72 .next()
73 .unwrap()
74 .text()
75 .collect::<String>()
76 .trim()
77 .to_string();
78 let rating = html
79 .select(&Selector::parse("p.star-rating").unwrap())
80 .next()
81 .unwrap()
82 .value()
83 .attr("class")
84 .unwrap()
85 .split_whitespace()
86 .last()
87 .unwrap()
88 .to_string();
89
90 output.add_item(BookItem {
91 title,
92 price,
93 availability,
94 rating,
95 url: response.url.to_string(),
96 });
97 } else {
98 // Book list page
99 let book_selector = Selector::parse("article.product_pod h3 a").unwrap();
100 let next_page_selector = Selector::parse("li.next a").unwrap();
101
102 let book_links = html.select(&book_selector);
103 let next_page_link = html.select(&next_page_selector);
104
105 for link in book_links.chain(next_page_link) {
106 if let Some(href) = link.value().attr("href") {
107 let mut url = response.url.join(href)?;
108 url.set_fragment(None);
109 output.add_request(Request::new(url.clone(), self.name(), "parse"));
110 }
111 }
112 }
113
114 Ok(output)
115 }
Trait Implementations§
Source§impl<I: Clone> Clone for CrawlOutput<I>
impl<I: Clone> Clone for CrawlOutput<I>
Source§fn clone(&self) -> CrawlOutput<I>
fn clone(&self) -> CrawlOutput<I>
Returns a duplicate of the value. Read more
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl<I: Debug> Debug for CrawlOutput<I>
impl<I: Debug> Debug for CrawlOutput<I>
Auto Trait Implementations§
impl<I> Freeze for CrawlOutput<I>
impl<I> RefUnwindSafe for CrawlOutput<I> where
I: RefUnwindSafe,
impl<I> Send for CrawlOutput<I> where
I: Send,
impl<I> Sync for CrawlOutput<I> where
I: Sync,
impl<I> Unpin for CrawlOutput<I> where
I: Unpin,
impl<I> UnwindSafe for CrawlOutput<I> where
I: UnwindSafe,
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more