//! _Small library to fetch info about a web page: title, description, language, HTTP info, links, RSS feeds, Opengraph, Schema.org, and more_
//!
//! ## Usage
//!
//! ```rust
//! use webpage::{Webpage, WebpageOptions};
//!
//! let info = Webpage::from_url("http://example.org", WebpageOptions::default())
//! .expect("Could not read from URL");
//!
//! // the HTTP transfer info
//! let http = info.http;
//!
//! // assert_eq!(http.ip, "54.192.129.71".to_string());
//! assert!(http.headers[0].starts_with("HTTP"));
//! assert!(http.body.starts_with("<!doctype html>"));
//! assert_eq!(http.url, "http://example.org/".to_string()); // effective url
//! assert_eq!(http.content_type, "text/html; charset=UTF-8".to_string());
//!
//! // the parsed HTML info
//! let html = info.html;
//!
//! assert_eq!(html.title, Some("Example Domain".to_string()));
//! assert_eq!(html.description, None);
//! assert_eq!(html.links.len(), 1);
//! assert_eq!(html.opengraph.og_type, "website".to_string());
//! ```
//!
//! You can also get HTML info about local data:
//!
//! ```rust
//! use webpage::HTML;
//! let html = HTML::from_file("index.html", None);
//! // or let html = HTML::from_string(input, None);
//! ```
//!
//! ## Options
//!
//! The following configurations are available:
//! ```rust
//! pub struct WebpageOptions {
//!     pub allow_insecure: bool,
//!     pub follow_location: bool,
//!     pub max_redirections: u32,
//!     pub timeout: std::time::Duration,
//!     pub useragent: String,
//!     pub headers: Vec<String>,
//! }
//! ```
//!
//! ```rust
//! use webpage::{Webpage, WebpageOptions};
//!
//! let mut options = WebpageOptions::default();
//! options.allow_insecure = true;
//! let info = Webpage::from_url("https://example.org", options).expect("Halp, could not fetch");
//! ```
// HTML document info; `Link` and `HTML` are re-exported at the crate root.
mod html;
pub use html::{Link, HTML};
// The HTTP module and its `HTTP` re-export are compiled only when the
// `curl` feature is enabled.
#[cfg(feature = "curl")]
mod http;
#[cfg(feature = "curl")]
pub use http::HTTP;
// Opengraph types, re-exported at the crate root.
mod opengraph;
pub use opengraph::{Opengraph, OpengraphObject};
// Schema.org type, re-exported at the crate root.
mod schema_org;
pub use schema_org::SchemaOrg;
// Private module: nothing from it is re-exported here.
mod parser;
// `Duration` is only needed by the `curl`-gated `WebpageOptions::timeout` field.
#[cfg(feature = "curl")]
use std::time::Duration;
// With the `serde` feature enabled, `#[macro_use]` brings serde's macros into
// scope for the `derive(Serialize, Deserialize)` attributes used below.
#[cfg(feature = "serde")]
#[macro_use]
extern crate serde;
/// Everything collected about a single web page.
///
/// Bundles the result of the HTTP transfer with the information extracted
/// from the HTML document. Only compiled when the `curl` feature is enabled.
#[cfg(feature = "curl")]
#[derive(Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct Webpage {
    /// Details of the HTTP transfer
    pub http: HTTP,

    /// Details extracted from the parsed HTML document
    pub html: HTML,
}
/// Configuration options for fetching a webpage.
///
/// Only compiled when the `curl` feature is enabled. The value in square
/// brackets after each field description is the default.
///
/// The struct is `#[non_exhaustive]`, so code outside this crate cannot build
/// it with a struct literal: start from `WebpageOptions::default()` and adjust
/// the public fields. It derives `Clone` so that one configuration can be
/// reused for several fetches, since `Webpage::from_url` takes its options by
/// value.
#[derive(Debug, Clone)]
#[cfg(feature = "curl")]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct WebpageOptions {
    /// Allow fetching over invalid and/or self signed HTTPS connections \[false\]
    pub allow_insecure: bool,
    /// Follow HTTP redirects \[true\]
    pub follow_location: bool,
    /// Max number of redirects to follow \[5\]
    pub max_redirections: u32,
    /// Timeout for the HTTP request \[10 secs\]
    pub timeout: Duration,
    /// User agent string used for the request \[webpage-rs - <https://crates.io/crates/webpage>\]
    pub useragent: String,
    /// Custom HTTP headers to send with the request \[none\]
    pub headers: Vec<String>,
}
#[cfg(feature = "curl")]
impl Default for WebpageOptions {
    /// Builds the stock configuration: secure connections only, redirects
    /// followed up to 5 hops, a 10 second timeout, the crate's own user
    /// agent string, and no extra request headers.
    fn default() -> Self {
        let timeout = Duration::from_secs(10);
        let useragent = String::from("webpage-rs - https://crates.io/crates/webpage");

        WebpageOptions {
            allow_insecure: false,
            follow_location: true,
            max_redirections: 5,
            timeout,
            useragent,
            headers: Vec::new(),
        }
    }
}
#[cfg(feature = "curl")]
impl Webpage {
    /// Retrieve the page at `url` and gather the HTML info from its body.
    ///
    /// The request is made through `HTTP::fetch` with the supplied `options`.
    /// The body of the transfer is then handed to `HTML::from_string`,
    /// together with the URL recorded on the transfer.
    ///
    /// ## Errors
    /// Returns the `std::io::Error` produced by either step: the fetch or the
    /// HTML parsing.
    ///
    /// ## Examples
    /// ```
    /// use webpage::{Webpage, WebpageOptions};
    ///
    /// let info = Webpage::from_url("http://example.org", WebpageOptions::default());
    /// assert!(info.is_ok())
    /// ```
    pub fn from_url(url: &str, options: WebpageOptions) -> Result<Self, std::io::Error> {
        let transfer = HTTP::fetch(url, options)?;

        // The transfer is stored in the result as well, so the parser gets
        // its own copies of the body and URL.
        let body = transfer.body.clone();
        let page_url = transfer.url.clone();
        let document = HTML::from_string(body, Some(page_url))?;

        Ok(Webpage {
            http: transfer,
            html: document,
        })
    }
}