url_scraper/
lib.rs

//! Simple library for quickly fetching a list of URLs from a webpage.
//!
//! # Example
//! ```rust,no_run
//! extern crate url_scraper;
//! use url_scraper::UrlScraper;
//!
//! let scraper = UrlScraper::new("http://phoronix.com/").unwrap();
//! for (text, url) in scraper.into_iter() {
//!     println!("{}: {}", text, url);
//! }
//! ```

extern crate reqwest;
extern crate scraper;

use reqwest::{Client, Url};
use scraper::{Html, html::Select, Selector};
use std::fmt;

/// Stores the HTML document in memory.
pub struct UrlScraper {
    url: Url,
    html: Html,
    selector: Selector,
}

impl UrlScraper {
    /// Constructs a new scraper from a given URL.
    pub fn new(url: &str) -> Result<Self, Error> {
        let client = Client::new();
        Self::new_with_client(url, &client)
    }

    /// Use an existing `reqwest::Client` to make a request.
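    ///
    /// # Example
    ///
    /// A sketch of sharing one `Client` across several scrapers; the second
    /// URL is illustrative:
    ///
    /// ```rust,no_run
    /// extern crate reqwest;
    /// extern crate url_scraper;
    /// use url_scraper::UrlScraper;
    ///
    /// let client = reqwest::Client::new();
    /// let news = UrlScraper::new_with_client("http://phoronix.com/", &client).unwrap();
    /// let forums = UrlScraper::new_with_client("http://phoronix.com/forums/", &client).unwrap();
    /// ```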
    pub fn new_with_client(url: &str, client: &Client) -> Result<Self, Error> {
        let url = Url::parse(url)?;
        // Fetch the page and buffer the entire response body.
        let mut resp = client.get(url.clone()).send()?;
        let html = resp.text()?;

        Ok(Self {
            url,
            html: Html::parse_document(&html),
            selector: Selector::parse("a").expect("failed to create <a> selector"),
        })
    }

    /// If the HTML has already been fetched in advance, this can be used to
    /// parse it directly without making a request.
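    ///
    /// # Example
    ///
    /// A minimal sketch with inline HTML; the base URL is only used to
    /// resolve relative `href`s:
    ///
    /// ```rust
    /// extern crate url_scraper;
    /// use url_scraper::UrlScraper;
    ///
    /// let html = r#"<a href="/page">Page</a>"#;
    /// let scraper = UrlScraper::new_with_html("http://phoronix.com/", html).unwrap();
    /// for (text, url) in scraper.into_iter() {
    ///     println!("{}: {}", text, url);
    /// }
    /// ```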
    pub fn new_with_html(url: &str, html: &str) -> Result<Self, Error> {
        Ok(Self {
            url: Url::parse(url)?,
            html: Html::parse_document(html),
            selector: Selector::parse("a").expect("failed to create <a> selector"),
        })
    }

    /// Returns an iterator over the `(text, URL)` pairs in the document.
    ///
    /// Despite its name, this borrows the scraper rather than consuming it.
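    ///
    /// # Example
    ///
    /// The usual iterator adaptors apply; a sketch that keeps only HTTPS
    /// links (the URL is illustrative):
    ///
    /// ```rust,no_run
    /// extern crate url_scraper;
    /// use url_scraper::UrlScraper;
    ///
    /// let scraper = UrlScraper::new("http://phoronix.com/").unwrap();
    /// let secure: Vec<_> = scraper.into_iter()
    ///     .filter(|&(_, ref url)| url.scheme() == "https")
    ///     .collect();
    /// ```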
    pub fn into_iter<'a>(&'a self) -> UrlIter<'a, 'a> {
        UrlIter {
            url: &self.url,
            data: self.html.select(&self.selector)
        }
    }
}

/// An iterator that returns `(String, Url)` pairs per iteration.
pub struct UrlIter<'a, 'b> {
    url: &'a Url,
    data: Select<'a, 'b>
}

impl<'a, 'b> Iterator for UrlIter<'a, 'b> {
    type Item = (String, Url);

    fn next(&mut self) -> Option<Self::Item> {
        for element in &mut self.data {
            if let Some(url) = element.value().attr("href") {
                // Skip hrefs that merely modify the current page's query string.
                if !url.starts_with('?') {
                    // Resolve relative hrefs against the base URL; links that
                    // fail to parse are silently skipped.
                    if let Ok(url) = self.url.join(url) {
                        return Some((element.inner_html(), url));
                    }
                }
            }
        }

        None
    }
}

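/// Describes the ways in which a scrape can fail: either the URL could not
/// be parsed, or the HTTP request itself failed.
///
/// A minimal sketch of matching on the variants (the URL is illustrative):
///
/// ```rust,no_run
/// extern crate url_scraper;
/// use url_scraper::{Error, UrlScraper};
///
/// match UrlScraper::new("http://phoronix.com/") {
///     Ok(_scraper) => { /* use the scraper */ }
///     Err(Error::UrlParsing { why }) => eprintln!("invalid URL: {}", why),
///     Err(Error::Request { why }) => eprintln!("request failed: {}", why),
/// }
/// ```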
#[derive(Debug)]
pub enum Error {
    /// The supplied URL could not be parsed.
    UrlParsing { why: reqwest::UrlError },
    /// The request failed, or the response body could not be read.
    Request { why: reqwest::Error }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Error::UrlParsing { ref why } => write!(f, "failed to parse URL: {}", why),
            Error::Request { ref why } => write!(f, "failure in request: {}", why),
        }
    }
}

impl From<reqwest::UrlError> for Error {
    fn from(why: reqwest::UrlError) -> Error {
        Error::UrlParsing { why }
    }
}

impl From<reqwest::Error> for Error {
    fn from(why: reqwest::Error) -> Error {
        Error::Request { why }
    }
}