1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
extern crate html5ever;
extern crate curl;
extern crate serde_json;

mod http;
pub mod html;
mod opengraph;
mod parser;
mod schema_org;

use http::HTTP;
use html::HTML;

use std::io;
use std::str;
use std::time::Duration;

/// A fetched web page: the outcome of the HTTP transfer together with the
/// document parsed from its body.
pub struct Webpage {
    /// Info about the HTTP transfer.
    pub http: HTTP,
    /// Info from the parsed HTML doc.
    pub html: HTML,
}

/// Settings handed to [`Webpage::from_url`], which forwards them to
/// `HTTP::fetch`.
///
/// `from_url` takes the options by value, so the type is `Clone` to let a
/// caller reuse one configuration for several fetches, and `Debug` so a
/// configuration can be logged or inspected.
///
/// NOTE(review): the precise effect of each field is decided in the `http`
/// module, which is not visible here; the field notes below are inferred from
/// the field names and should be confirmed against that module.
#[derive(Debug, Clone)]
pub struct WebpageOptions {
    /// Presumably permits insecure (unverified) connections when `true`.
    pub allow_insecure: bool,
    /// Presumably makes the transfer follow HTTP redirects when `true`.
    pub follow_location: bool,
    /// Presumably the upper bound on redirects followed.
    pub max_redirections: u32,
    /// Time limit applied to the transfer (exact scope: see `http` module).
    pub timeout: Duration,
    /// Presumably the value sent as the `User-Agent` request header.
    pub useragent: String,
}

impl Default for WebpageOptions {
    fn default() -> Self {
        Self {
            allow_insecure: false,
            follow_location: true,
            max_redirections: 5,
            timeout: Duration::from_secs(10),
            useragent: "Webpage - Rust crate - https://crates.io/crates/webpage".to_string(),
        }
    }
}

impl Webpage {
    /// Fetches `url` and parses the response body as HTML.
    ///
    /// The transfer is performed by `HTTP::fetch` using `options`. The body
    /// and url recorded on the transfer are then passed to
    /// `HTML::from_string` to build the parsed document.
    ///
    /// # Errors
    ///
    /// Returns the `io::Error` produced by either the fetch or the HTML
    /// parsing step; parsing is not attempted if the fetch fails.
    pub fn from_url(url: &str, options: WebpageOptions) -> Result<Self, io::Error> {
        let transfer = HTTP::fetch(url, options)?;

        // Copies are required: the parser takes ownership of its inputs,
        // while the transfer itself is kept whole in the returned value.
        let body = transfer.body.clone();
        let source_url = transfer.url.clone();
        let document = HTML::from_string(body, Some(source_url))?;

        Ok(Self {
            http: transfer,
            html: document,
        })
    }
}