//! _Small library to fetch info about a web page: title, description, language, HTTP info, links, RSS feeds, Opengraph, Schema.org, and more_
//!
//! ## Usage
//!
//! ```rust
//! use webpage::{Webpage, WebpageOptions};
//!
//! let info = Webpage::from_url("http://example.org", WebpageOptions::default())
//!     .expect("Could not read from URL");
//!
//! // the HTTP transfer info
//! let http = info.http;
//!
//! // assert_eq!(http.ip, "54.192.129.71".to_string());
//! assert!(http.headers[0].starts_with("HTTP"));
//! assert!(http.body.starts_with("<!doctype html>"));
//! assert_eq!(http.url, "http://example.org/".to_string()); // effective url
//! assert_eq!(http.content_type, "text/html; charset=UTF-8".to_string());
//!
//! // the parsed HTML info
//! let html = info.html;
//!
//! assert_eq!(html.title, Some("Example Domain".to_string()));
//! assert_eq!(html.description, None);
//! assert_eq!(html.links.len(), 1);
//! assert_eq!(html.opengraph.og_type, "website".to_string());
//! ```
//!
//! You can also get HTML info about local data:
//!
//! ```rust
//! use webpage::HTML;
//! let html = HTML::from_file("index.html", None);
//! // or let html = HTML::from_string(input, None);
//! ```
//!
//! ## Options
//!
//! The following configurations are available:
//! ```rust
//! pub struct WebpageOptions {
//!     allow_insecure: bool,
//!     follow_location: bool,
//!     max_redirections: u32,
//!     timeout: std::time::Duration,
//!     useragent: String,
//!     headers: Vec<String>,
//! }
//! ```
//!
//! ```rust
//! use webpage::{Webpage, WebpageOptions};
//!
//! let mut options = WebpageOptions::default();
//! options.allow_insecure = true;
//! let info = Webpage::from_url("https://example.org", options).expect("Halp, could not fetch");
//! ```

mod html;
pub use html::{Link, HTML};

#[cfg(feature = "curl")]
mod http;
#[cfg(feature = "curl")]
pub use http::HTTP;

mod opengraph;
pub use opengraph::{Opengraph, OpengraphObject};

mod schema_org;
pub use schema_org::SchemaOrg;

mod parser;

#[cfg(feature = "curl")]
use std::time::Duration;

#[cfg(feature = "serde")]
#[macro_use]
extern crate serde;

/// All gathered info for a webpage
#[derive(Debug)]
#[cfg(feature = "curl")]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct Webpage {
    /// info about the HTTP transfer
    pub http: HTTP,
    /// info from the parsed HTML doc
    pub html: HTML,
}

/// Settings that control how a webpage is fetched.
///
/// Defaults (shown in brackets per field) are provided via [`Default`].
#[cfg(feature = "curl")]
#[derive(Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct WebpageOptions {
    /// Allow fetching over invalid and/or self signed HTTPS connections \[false\]
    pub allow_insecure: bool,
    /// Follow HTTP redirects \[true\]
    pub follow_location: bool,
    /// Max number of redirects to follow \[5\]
    pub max_redirections: u32,
    /// Timeout for the HTTP request \[10 secs\]
    pub timeout: Duration,
    /// User agent string used for the request \[webpage-rs - <https://crates.io/crates/webpage>\]
    pub useragent: String,
    /// Custom HTTP headers to send with the request
    pub headers: Vec<String>,
}

#[cfg(feature = "curl")]
impl Default for WebpageOptions {
    /// The documented default fetch configuration: secure connections
    /// only, follow up to 5 redirects, 10-second timeout, crate user
    /// agent, and no extra headers.
    fn default() -> Self {
        WebpageOptions {
            timeout: Duration::from_secs(10),
            max_redirections: 5,
            follow_location: true,
            allow_insecure: false,
            useragent: String::from("webpage-rs - https://crates.io/crates/webpage"),
            headers: Vec::new(),
        }
    }
}

#[cfg(feature = "curl")]
impl Webpage {
    /// Fetch the page at `url` with the given `options`, then parse the
    /// returned body into HTML info.
    ///
    /// ## Errors
    ///
    /// Returns `std::io::Error` when the HTTP transfer or the HTML
    /// parsing fails.
    ///
    /// ## Examples
    /// ```
    /// use webpage::{Webpage, WebpageOptions};
    ///
    /// let info = Webpage::from_url("http://example.org", WebpageOptions::default());
    /// assert!(info.is_ok())
    /// ```
    pub fn from_url(url: &str, options: WebpageOptions) -> Result<Self, std::io::Error> {
        let http = HTTP::fetch(url, options)?;

        // The body and effective URL are stored in `http` *and* consumed by
        // the parser, so the clones here are required.
        let html = HTML::from_string(http.body.clone(), Some(http.url.clone()))?;

        Ok(Webpage { http, html })
    }
}