webpage_info/html.rs

//! HTML document parsing and metadata extraction

use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::Path;
use std::sync::OnceLock;

use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use url::Url;

use crate::error::Result;
use crate::opengraph::Opengraph;
use crate::schema_org::SchemaOrg;

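// MIME types recognized as feeds in <link rel="alternate"> discovery
// ("application/json" is presumably included for JSON Feed documents).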
const FEED_MIME_TYPES: &[&str] = &[
    "application/atom+xml",
    "application/rss+xml",
    "application/json",
    "application/xml",
    "text/xml",
];

// Security limits to prevent DoS via resource exhaustion
const MAX_LINKS: usize = 10_000;
const MAX_SCHEMA_ORG_ITEMS: usize = 100;
const MAX_TEXT_CONTENT_LEN: usize = 1_000_000; // 1 MB of text

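// Cached CSS selectors: each is compiled once on first use via OnceLock.
// The `unwrap` calls cannot fail because the selector strings are hard-coded and valid.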
fn title_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse("title").unwrap())
}

fn html_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse("html").unwrap())
}

fn meta_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse("meta").unwrap())
}

fn canonical_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse(r#"link[rel="canonical"]"#).unwrap())
}

fn feed_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse(r#"link[rel="alternate"]"#).unwrap())
}

fn body_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse("body").unwrap())
}

fn exclude_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse("script, style, noscript").unwrap())
}

fn link_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse("a[href]").unwrap())
}

fn schema_org_selector() -> &'static Selector {
    static SELECTOR: OnceLock<Selector> = OnceLock::new();
    SELECTOR.get_or_init(|| Selector::parse(r#"script[type="application/ld+json"]"#).unwrap())
}

/// Parsed HTML document information.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct HtmlInfo {
    /// Document title from `<title>` tag
    pub title: Option<String>,

    /// Meta description
    pub description: Option<String>,

    /// Canonical URL from `<link rel="canonical">`
    pub canonical_url: Option<String>,

    /// Feed URL from the first `<link rel="alternate">` whose `type` is a recognized
    /// feed MIME type (RSS, Atom, JSON, or generic XML)
    pub feed_url: Option<String>,

    /// Document language from `<html lang="...">`
    pub language: Option<String>,

    /// Text content extracted from the body (markup stripped; script, style, and
    /// noscript contents excluded; capped at `MAX_TEXT_CONTENT_LEN` bytes)
    pub text_content: String,

    /// All meta tags as key-value pairs
    pub meta: HashMap<String, String>,

    /// OpenGraph metadata
    pub opengraph: Opengraph,

    /// Schema.org structured data (JSON-LD)
    pub schema_org: Vec<SchemaOrg>,

    /// All links found in the document
    pub links: Vec<Link>,
}

/// A link found in the HTML document.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct Link {
    /// The URL of the link (resolved if base URL provided)
    pub url: String,

    /// The anchor text of the link
    pub text: String,

    /// The rel attribute if present
    pub rel: Option<String>,
}

impl HtmlInfo {
    /// Parse HTML from a string.
    ///
    /// # Arguments
    /// * `html` - The HTML content to parse
    /// * `base_url` - Optional base URL for resolving relative links
    ///
    /// # Example
    /// ```
    /// use webpage_info::HtmlInfo;
    ///
    /// let html = "<html><head><title>Hello</title></head><body>World</body></html>";
    /// let info = HtmlInfo::from_string(html, None).unwrap();
    /// assert_eq!(info.title, Some("Hello".to_string()));
    /// ```
    pub fn from_string(html: &str, base_url: Option<&str>) -> Result<Self> {
        let base = base_url.and_then(|u| Url::parse(u).ok());
        let document = Html::parse_document(html);
        Ok(Self::extract(&document, base.as_ref()))
    }

    /// Parse HTML from a file.
    ///
    /// # Arguments
    /// * `path` - Path to the HTML file
    /// * `base_url` - Optional base URL for resolving relative links
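    ///
    /// # Example
    /// A minimal sketch (`no_run`: it assumes a readable `page.html` on disk):
    /// ```no_run
    /// use webpage_info::HtmlInfo;
    ///
    /// let info = HtmlInfo::from_file("page.html", Some("https://example.com/")).unwrap();
    /// println!("title: {:?}", info.title);
    /// ```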
    pub fn from_file(path: impl AsRef<Path>, base_url: Option<&str>) -> Result<Self> {
        let content = fs::read_to_string(path)?;
        Self::from_string(&content, base_url)
    }

    /// Extract all information from a parsed HTML document.
    fn extract(document: &Html, base_url: Option<&Url>) -> Self {
        let mut info = Self {
            title: Self::extract_title(document),
            language: Self::extract_language(document),
            canonical_url: Self::extract_canonical(document),
            feed_url: Self::extract_feed(document),
            text_content: Self::extract_text_content(document),
            links: Self::extract_links(document, base_url),
            schema_org: Self::extract_schema_org(document),
            ..Default::default()
        };

        // Extract meta tags (sets description, meta, and opengraph)
        info.extract_meta_tags(document);

        info
    }

    fn extract_title(document: &Html) -> Option<String> {
        document
            .select(title_selector())
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string())
            .filter(|s| !s.is_empty())
    }

    fn extract_language(document: &Html) -> Option<String> {
        document
            .select(html_selector())
            .next()
            .and_then(|el| el.value().attr("lang"))
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty())
    }

    fn extract_meta_tags(&mut self, document: &Html) {
        for element in document.select(meta_selector()) {
            let el = element.value();

            // Get content value
            let content = match el.attr("content") {
                Some(c) => c.trim().to_string(),
                None => {
                    // Handle charset meta tag
                    if let Some(charset) = el.attr("charset") {
                        self.meta.insert("charset".to_string(), charset.to_string());
                    }
                    continue;
                }
            };

            // Get property/name
            let property = el
                .attr("property")
                .or_else(|| el.attr("name"))
                .or_else(|| el.attr("http-equiv"));

            if let Some(prop) = property {
                let prop = prop.trim().to_string();
                self.meta.insert(prop.clone(), content.clone());

                // Handle OpenGraph
                if let Some(og_prop) = prop.strip_prefix("og:") {
                    self.opengraph.extend(og_prop, content.clone());
                }

                // Handle description
                if prop == "description" {
                    self.description = Some(content);
                }
            }
        }
    }

    fn extract_canonical(document: &Html) -> Option<String> {
        document
            .select(canonical_selector())
            .next()
            .and_then(|el| el.value().attr("href"))
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty())
    }

    fn extract_feed(document: &Html) -> Option<String> {
        for element in document.select(feed_selector()) {
            let el = element.value();
            if let Some(link_type) = el.attr("type")
                && FEED_MIME_TYPES.contains(&link_type)
            {
                return el.attr("href").map(|s| s.trim().to_string());
            }
        }
        None
    }

    fn extract_text_content(document: &Html) -> String {
        let Some(body) = document.select(body_selector()).next() else {
            return String::new();
        };

        // Pre-collect excluded node IDs for O(1) lookup instead of O(n) per text node
        let excluded_ids: HashSet<_> = document
            .select(exclude_selector())
            .map(|el| el.id())
            .collect();

        let mut text = String::with_capacity(4096); // Pre-allocate reasonable size

        for node in body.descendants() {
            // Stop if we've reached the size limit
            if text.len() >= MAX_TEXT_CONTENT_LEN {
                break;
            }

            if let Some(text_node) = node.value().as_text() {
                // Check if any ancestor is excluded (O(depth) instead of O(depth * n))
                let is_excluded = node.ancestors().any(|a| excluded_ids.contains(&a.id()));

                if !is_excluded {
                    let trimmed = text_node.trim();
                    if !trimmed.is_empty() {
                        if !text.is_empty() {
                            text.push(' ');
                        }
                        // Limit how much we add to stay within bounds
                        let remaining = MAX_TEXT_CONTENT_LEN.saturating_sub(text.len());
                        if trimmed.len() <= remaining {
                            text.push_str(trimmed);
                        } else {
                            // Back up to a char boundary so slicing can't panic on multi-byte UTF-8
                            let mut end = remaining;
                            while end > 0 && !trimmed.is_char_boundary(end) {
                                end -= 1;
                            }
                            text.push_str(&trimmed[..end]);
                            break;
                        }
                    }
                }
            }
        }

        text
    }

    fn extract_links(document: &Html, base_url: Option<&Url>) -> Vec<Link> {
        document
            .select(link_selector())
            .filter_map(|element| {
                let href = element.value().attr("href")?;
                let href = href.trim();

                // Skip empty and javascript: links
                if href.is_empty() || href.starts_with("javascript:") {
                    return None;
                }

                let url = if let Some(base) = base_url {
                    base.join(href)
                        .map(|u| u.to_string())
                        .unwrap_or_else(|_| href.to_string())
                } else {
                    href.to_string()
                };

                let text = element.text().collect::<String>().trim().to_string();
                let rel = element.value().attr("rel").map(|s| s.to_string());

                Some(Link { url, text, rel })
            })
            .take(MAX_LINKS)
            .collect()
    }

    fn extract_schema_org(document: &Html) -> Vec<SchemaOrg> {
        document
            .select(schema_org_selector())
            .flat_map(|element| {
                let content = element.text().collect::<String>();
                SchemaOrg::parse(&content)
            })
            .take(MAX_SCHEMA_ORG_ITEMS)
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_parsing() {
        let html = r#"
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <title>Test Page</title>
                <meta name="description" content="A test page">
                <meta property="og:title" content="OG Title">
                <meta property="og:type" content="article">
                <link rel="canonical" href="https://example.com/test">
            </head>
            <body>
                <p>Hello World</p>
                <a href="/about">About Us</a>
            </body>
            </html>
        "#;

        let info = HtmlInfo::from_string(html, Some("https://example.com/")).unwrap();

        assert_eq!(info.title, Some("Test Page".to_string()));
        assert_eq!(info.description, Some("A test page".to_string()));
        assert_eq!(info.language, Some("en".to_string()));
        assert_eq!(
            info.canonical_url,
            Some("https://example.com/test".to_string())
        );
        assert_eq!(info.opengraph.title, Some("OG Title".to_string()));
        assert_eq!(info.opengraph.og_type, Some("article".to_string()));
        assert!(info.text_content.contains("Hello World"));
        assert_eq!(info.links.len(), 1);
        assert_eq!(info.links[0].url, "https://example.com/about");
        assert_eq!(info.links[0].text, "About Us");
    }

    #[test]
    fn test_feed_extraction() {
        let html = r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml" href="/feed.xml">
            </head>
            </html>
        "#;

        let info = HtmlInfo::from_string(html, None).unwrap();
        assert_eq!(info.feed_url, Some("/feed.xml".to_string()));
    }

    #[test]
    fn test_schema_org_extraction() {
        let html = r#"
            <html>
            <head>
                <script type="application/ld+json">
                {"@type": "Article", "headline": "Test Article"}
                </script>
            </head>
            </html>
        "#;

        let info = HtmlInfo::from_string(html, None).unwrap();
        assert_eq!(info.schema_org.len(), 1);
        assert_eq!(info.schema_org[0].schema_type, "Article");
    }

    #[test]
    fn test_text_excludes_scripts() {
        let html = r#"
            <html>
            <body>
                <p>Visible text</p>
                <script>console.log('hidden');</script>
                <style>.hidden { display: none; }</style>
                <p>More visible</p>
            </body>
            </html>
        "#;

        let info = HtmlInfo::from_string(html, None).unwrap();
        assert!(info.text_content.contains("Visible text"));
        assert!(info.text_content.contains("More visible"));
        assert!(!info.text_content.contains("console.log"));
        assert!(!info.text_content.contains(".hidden"));
    }
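
    // Additional test sketches for behavior implied by the code above: relative hrefs
    // stay verbatim when no base URL is supplied, javascript: links are skipped, and a
    // bare <meta charset> is recorded under the "charset" key.
    #[test]
    fn test_links_without_base_url() {
        let html = r#"
            <html>
            <body>
                <a href="/about">About</a>
                <a href="javascript:void(0)">Ignored</a>
            </body>
            </html>
        "#;

        let info = HtmlInfo::from_string(html, None).unwrap();
        assert_eq!(info.links.len(), 1);
        assert_eq!(info.links[0].url, "/about");
        assert_eq!(info.links[0].text, "About");
    }

    #[test]
    fn test_charset_meta() {
        let html = r#"<html><head><meta charset="utf-8"></head><body></body></html>"#;

        let info = HtmlInfo::from_string(html, None).unwrap();
        assert_eq!(info.meta.get("charset"), Some(&"utf-8".to_string()));
    }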
}