webpage_info/lib.rs
//! # webpage-info
//!
//! A modern Rust library to extract metadata from web pages: title, description,
//! OpenGraph, Schema.org, links, and more.
//!
//! ## Features
//!
//! - Parse HTML from strings, files, or URLs
//! - Extract common metadata (title, description, language)
//! - Parse OpenGraph protocol data
//! - Parse Schema.org JSON-LD structured data
//! - Extract all links from the document
//! - Async HTTP client with configurable options
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use webpage_info::WebpageInfo;
//!
//! #[tokio::main]
//! async fn main() -> webpage_info::Result<()> {
//!     // Fetch and parse a webpage
//!     let info = WebpageInfo::fetch("https://example.org").await?;
//!
//!     println!("Title: {:?}", info.html.title);
//!     println!("Description: {:?}", info.html.description);
//!     println!("Links: {}", info.html.links.len());
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Parsing Local HTML
//!
//! ```rust
//! use webpage_info::HtmlInfo;
//!
//! let html = "<html><head><title>Hello</title></head><body>World</body></html>";
//! let info = HtmlInfo::from_string(html, None).unwrap();
//! assert_eq!(info.title, Some("Hello".to_string()));
//! ```
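//!
//! The same parsed result also carries the OpenGraph and Schema.org data listed
//! under Features. The sketch below is only illustrative: the `opengraph` and
//! `schema_org` field names are assumptions rather than confirmed API, so the
//! block is marked `ignore`.
//!
//! ```rust,ignore
//! use webpage_info::HtmlInfo;
//!
//! let html = r#"<html><head>
//!     <title>Hello</title>
//!     <meta property="og:title" content="Hello, OpenGraph" />
//! </head><body>World</body></html>"#;
//!
//! let info = HtmlInfo::from_string(html, None).unwrap();
//! // Field names below are assumed for illustration only.
//! println!("OpenGraph: {:?}", info.opengraph);
//! println!("Schema.org: {:?}", info.schema_org);
//! ```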
//!
//! ## Custom HTTP Options
//!
//! ```rust,no_run
//! use std::time::Duration;
//! use webpage_info::{WebpageInfo, HttpOptions};
//!
//! #[tokio::main]
//! async fn main() -> webpage_info::Result<()> {
//!     let options = HttpOptions::new()
//!         .timeout(Duration::from_secs(60))
//!         .user_agent("MyBot/1.0")
//!         .allow_insecure(true);
//!
//!     let info = WebpageInfo::fetch_with_options("https://example.org", options).await?;
//!     Ok(())
//! }
//! ```
//!
//! ## Without HTTP (parsing only)
//!
//! If you don't need HTTP fetching, disable the default `http` feature:
//!
//! ```toml
//! [dependencies]
//! webpage-info = { version = "1.0", default-features = false }
//! ```

mod error;
mod html;
mod opengraph;
mod schema_org;

#[cfg(feature = "http")]
mod http;

pub use error::{Error, Result};
pub use html::{HtmlInfo, Link};
pub use opengraph::{Opengraph, OpengraphMedia};
pub use schema_org::SchemaOrg;

#[cfg(feature = "http")]
pub use http::{HttpInfo, HttpOptions};

use serde::{Deserialize, Serialize};

/// Complete webpage information including HTTP and HTML data.
#[cfg(feature = "http")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebpageInfo {
    /// HTTP transfer information
    pub http: HttpInfo,

    /// Parsed HTML information
    pub html: HtmlInfo,
}

#[cfg(feature = "http")]
impl WebpageInfo {
    /// Fetch a webpage from a URL with default options.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use webpage_info::WebpageInfo;
    ///
    /// #[tokio::main]
    /// async fn main() -> webpage_info::Result<()> {
    ///     let info = WebpageInfo::fetch("https://example.org").await?;
    ///     println!("Title: {:?}", info.html.title);
    ///     Ok(())
    /// }
    /// ```
    pub async fn fetch(url: &str) -> Result<Self> {
        Self::fetch_with_options(url, HttpOptions::default()).await
    }

    /// Fetch a webpage from a URL with custom HTTP options.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use std::time::Duration;
    /// use webpage_info::{WebpageInfo, HttpOptions};
    ///
    /// #[tokio::main]
    /// async fn main() -> webpage_info::Result<()> {
    ///     let options = HttpOptions::new()
    ///         .timeout(Duration::from_secs(60))
    ///         .user_agent("CustomBot/1.0");
    ///
    ///     let info = WebpageInfo::fetch_with_options("https://example.org", options).await?;
    ///     println!("Status: {}", info.http.status_code);
    ///     Ok(())
    /// }
    /// ```
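    ///
    /// Responses whose `Content-Type` contains neither `html` nor `xml` are
    /// rejected with [`Error::InvalidContentType`]. A minimal sketch of handling
    /// that case (it assumes `Error` implements `Display`):
    ///
    /// ```rust,no_run
    /// use webpage_info::{WebpageInfo, HttpOptions, Error};
    ///
    /// #[tokio::main]
    /// async fn main() {
    ///     match WebpageInfo::fetch_with_options("https://example.org/feed.json", HttpOptions::new()).await {
    ///         Ok(info) => println!("Title: {:?}", info.html.title),
    ///         Err(Error::InvalidContentType(ct)) => eprintln!("not an HTML page: {ct}"),
    ///         Err(err) => eprintln!("fetch failed: {err}"),
    ///     }
    /// }
    /// ```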
    pub async fn fetch_with_options(url: &str, options: HttpOptions) -> Result<Self> {
        let http_info = http::fetch(url, &options).await?;

        // Validate content type is HTML-ish
        if let Some(ref ct) = http_info.content_type
            && !ct.contains("html")
            && !ct.contains("xml")
        {
            return Err(Error::InvalidContentType(ct.clone()));
        }

        let html = HtmlInfo::from_string(&http_info.body, Some(&http_info.url))?;

        Ok(Self {
            http: http_info,
            html,
        })
    }
}