webpage_info/
lib.rs

1//! # webpage-info
2//!
3//! A modern Rust library to extract metadata from web pages: title, description,
4//! OpenGraph, Schema.org, links, and more.
5//!
6//! ## Features
7//!
8//! - Parse HTML from strings, files, or URLs
9//! - Extract common metadata (title, description, language)
10//! - Parse OpenGraph protocol data
11//! - Parse Schema.org JSON-LD structured data
12//! - Extract all links from the document
13//! - Async HTTP client with configurable options
14//!
15//! ## Quick Start
16//!
17//! ```rust,no_run
18//! use webpage_info::WebpageInfo;
19//!
20//! #[tokio::main]
21//! async fn main() -> webpage_info::Result<()> {
22//!     // Fetch and parse a webpage
23//!     let info = WebpageInfo::fetch("https://example.org").await?;
24//!
25//!     println!("Title: {:?}", info.html.title);
26//!     println!("Description: {:?}", info.html.description);
27//!     println!("Links: {}", info.html.links.len());
28//!
29//!     Ok(())
30//! }
31//! ```
32//!
33//! ## Parsing Local HTML
34//!
35//! ```rust
36//! use webpage_info::HtmlInfo;
37//!
38//! let html = "<html><head><title>Hello</title></head><body>World</body></html>";
39//! let info = HtmlInfo::from_string(html, None).unwrap();
40//! assert_eq!(info.title, Some("Hello".to_string()));
41//! ```
42//!
43//! ## Custom HTTP Options
44//!
45//! ```rust,no_run
46//! use std::time::Duration;
47//! use webpage_info::{WebpageInfo, HttpOptions};
48//!
49//! #[tokio::main]
50//! async fn main() -> webpage_info::Result<()> {
51//!     let options = HttpOptions::new()
52//!         .timeout(Duration::from_secs(60))
53//!         .user_agent("MyBot/1.0")
54//!         .allow_insecure(true);
55//!
56//!     let info = WebpageInfo::fetch_with_options("https://example.org", options).await?;
57//!     Ok(())
58//! }
59//! ```
60//!
61//! ## Without HTTP (parsing only)
62//!
63//! If you don't need HTTP fetching, disable the default `http` feature:
64//!
65//! ```toml
66//! [dependencies]
67//! webpage-info = { version = "1.0", default-features = false }
68//! ```
69
70mod error;
71mod html;
72mod opengraph;
73mod schema_org;
74
75#[cfg(feature = "http")]
76mod http;
77
78pub use error::{Error, Result};
79pub use html::{HtmlInfo, Link};
80pub use opengraph::{Opengraph, OpengraphMedia};
81pub use schema_org::SchemaOrg;
82
83#[cfg(feature = "http")]
84pub use http::{HttpInfo, HttpOptions};
85
86use serde::{Deserialize, Serialize};
87
/// Complete webpage information including HTTP and HTML data.
///
/// Produced by [`WebpageInfo::fetch`] / [`WebpageInfo::fetch_with_options`];
/// pairs the raw HTTP transfer details with the metadata parsed out of the
/// response body. Only available with the default `http` feature enabled.
#[cfg(feature = "http")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebpageInfo {
    /// HTTP transfer information (status code, content type, final URL, body).
    pub http: HttpInfo,

    /// Parsed HTML information (title, description, links, OpenGraph, Schema.org).
    pub html: HtmlInfo,
}
98
#[cfg(feature = "http")]
impl WebpageInfo {
    /// Fetch a webpage from a URL with default options.
    ///
    /// Convenience wrapper around [`WebpageInfo::fetch_with_options`] using
    /// [`HttpOptions::default`].
    ///
    /// # Errors
    ///
    /// Returns an error if the HTTP request fails, the response's
    /// `Content-Type` is neither HTML nor XML, or the body cannot be parsed.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use webpage_info::WebpageInfo;
    ///
    /// #[tokio::main]
    /// async fn main() -> webpage_info::Result<()> {
    ///     let info = WebpageInfo::fetch("https://example.org").await?;
    ///     println!("Title: {:?}", info.html.title);
    ///     Ok(())
    /// }
    /// ```
    pub async fn fetch(url: &str) -> Result<Self> {
        Self::fetch_with_options(url, HttpOptions::default()).await
    }

    /// Fetch a webpage from a URL with custom HTTP options.
    ///
    /// Performs the request via the internal `http` module, rejects responses
    /// whose `Content-Type` is neither HTML nor XML, then parses the body
    /// (resolving relative links against the final, post-redirect URL).
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidContentType`] for non-HTML/XML responses, or
    /// whatever error the HTTP transfer / HTML parsing produced.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use std::time::Duration;
    /// use webpage_info::{WebpageInfo, HttpOptions};
    ///
    /// #[tokio::main]
    /// async fn main() -> webpage_info::Result<()> {
    ///     let options = HttpOptions::new()
    ///         .timeout(Duration::from_secs(60))
    ///         .user_agent("CustomBot/1.0");
    ///
    ///     let info = WebpageInfo::fetch_with_options("https://example.org", options).await?;
    ///     println!("Status: {}", info.http.status_code);
    ///     Ok(())
    /// }
    /// ```
    pub async fn fetch_with_options(url: &str, options: HttpOptions) -> Result<Self> {
        let http_info = http::fetch(url, &options).await?;

        // Validate that the response looks like HTML/XML before parsing.
        // Media types are case-insensitive (RFC 9110 §8.3.1), so match against
        // a lowercased copy — servers may legitimately send `Text/HTML` or
        // `application/XHTML+XML`. The original casing is preserved in the
        // error value for diagnostics.
        if let Some(ref ct) = http_info.content_type {
            let normalized = ct.to_ascii_lowercase();
            if !normalized.contains("html") && !normalized.contains("xml") {
                return Err(Error::InvalidContentType(ct.clone()));
            }
        }

        // Parse relative to the final URL so redirects resolve links correctly.
        let html = HtmlInfo::from_string(&http_info.body, Some(&http_info.url))?;

        Ok(Self {
            http: http_info,
            html,
        })
    }
}
156}