halldyll_core/parse/
metadata.rs

//! Metadata extraction from HTML documents
2
3use scraper::{Html, Selector};
4
/// Stateless extractor that pulls page-level metadata out of an HTML document.
///
/// The type carries no data, so `Default` is derived instead of hand-written
/// and the value can be freely copied.
#[derive(Debug, Clone, Copy, Default)]
pub struct MetadataExtractor;
13
14impl MetadataExtractor {
15    /// New extractor
16    pub fn new() -> Self {
17        Self
18    }
19
20    /// Extract all metadata
21    pub fn extract(&self, html: &str) -> PageMetadata {
22        let document = Html::parse_document(html);
23        
24        PageMetadata {
25            title: self.extract_title(&document),
26            description: self.extract_meta(&document, "description"),
27            author: self.extract_meta(&document, "author"),
28            keywords: self.extract_keywords(&document),
29            published_time: self.extract_time(&document, "article:published_time")
30                .or_else(|| self.extract_time(&document, "datePublished")),
31            modified_time: self.extract_time(&document, "article:modified_time")
32                .or_else(|| self.extract_time(&document, "dateModified")),
33            robots: self.extract_meta(&document, "robots"),
34            viewport: self.extract_meta(&document, "viewport"),
35            charset: self.extract_charset(&document),
36            canonical: self.extract_canonical(&document),
37            language: self.extract_language(&document),
38        }
39    }
40
41    /// Extract the title
42    fn extract_title(&self, document: &Html) -> Option<String> {
43        let selector = Selector::parse("title").ok()?;
44        document
45            .select(&selector)
46            .next()
47            .map(|el| el.text().collect::<Vec<_>>().join("").trim().to_string())
48    }
49
50    /// Extract a meta tag by name
51    fn extract_meta(&self, document: &Html, name: &str) -> Option<String> {
52        let selector = Selector::parse(&format!(r#"meta[name="{}"]"#, name)).ok()?;
53        document
54            .select(&selector)
55            .next()
56            .and_then(|el| el.value().attr("content").map(String::from))
57    }
58
59    /// Extract keywords
60    fn extract_keywords(&self, document: &Html) -> Vec<String> {
61        self.extract_meta(document, "keywords")
62            .map(|s| {
63                s.split(',')
64                    .map(|k| k.trim().to_string())
65                    .filter(|k| !k.is_empty())
66                    .collect()
67            })
68            .unwrap_or_default()
69    }
70
71    /// Extract a date/time
72    fn extract_time(&self, document: &Html, property: &str) -> Option<String> {
73        // Essayer property (OpenGraph style)
74        let prop_selector = Selector::parse(&format!(r#"meta[property="{}"]"#, property)).ok();
75        if let Some(sel) = prop_selector {
76            if let Some(el) = document.select(&sel).next() {
77                if let Some(content) = el.value().attr("content") {
78                    return Some(content.to_string());
79                }
80            }
81        }
82
83        // Essayer itemprop (Schema.org style)
84        let itemprop_selector = Selector::parse(&format!(r#"[itemprop="{}"]"#, property)).ok();
85        if let Some(sel) = itemprop_selector {
86            if let Some(el) = document.select(&sel).next() {
87                // Vérifier datetime attribute ou content
88                if let Some(dt) = el.value().attr("datetime") {
89                    return Some(dt.to_string());
90                }
91                if let Some(content) = el.value().attr("content") {
92                    return Some(content.to_string());
93                }
94            }
95        }
96
97        None
98    }
99
100    /// Extract the charset
101    fn extract_charset(&self, document: &Html) -> Option<String> {
102        // <meta charset="...">
103        let charset_selector = Selector::parse("meta[charset]").ok()?;
104        if let Some(el) = document.select(&charset_selector).next() {
105            if let Some(charset) = el.value().attr("charset") {
106                return Some(charset.to_string());
107            }
108        }
109
110        // <meta http-equiv="Content-Type" content="text/html; charset=...">
111        let content_type_selector = Selector::parse(r#"meta[http-equiv="Content-Type"]"#).ok()?;
112        if let Some(el) = document.select(&content_type_selector).next() {
113            if let Some(content) = el.value().attr("content") {
114                if let Some(pos) = content.to_lowercase().find("charset=") {
115                    let charset: String = content[pos + 8..]
116                        .chars()
117                        .take_while(|&c| c != ';' && c != ' ' && c != '"')
118                        .collect();
119                    return Some(charset);
120                }
121            }
122        }
123
124        None
125    }
126
127    /// Extract the canonical link
128    fn extract_canonical(&self, document: &Html) -> Option<String> {
129        let selector = Selector::parse(r#"link[rel="canonical"]"#).ok()?;
130        document
131            .select(&selector)
132            .next()
133            .and_then(|el| el.value().attr("href").map(String::from))
134    }
135
136    /// Extract the language
137    fn extract_language(&self, document: &Html) -> Option<String> {
138        // Attribut lang sur <html>
139        let html_selector = Selector::parse("html").ok()?;
140        if let Some(html) = document.select(&html_selector).next() {
141            if let Some(lang) = html.value().attr("lang") {
142                return Some(lang.to_string());
143            }
144        }
145
146        // Meta language
147        self.extract_meta(document, "language")
148    }
149}
150
/// Metadata collected from a single HTML page.
///
/// Every field holds the value as it appears in the markup; nothing is parsed
/// or validated. Optional fields are `None` when the corresponding
/// declaration is absent. `PartialEq`/`Eq` are derived so two results can be
/// compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageMetadata {
    /// Trimmed text of the `<title>` element
    pub title: Option<String>,
    /// `content` of the `description` meta tag
    pub description: Option<String>,
    /// `content` of the `author` meta tag
    pub author: Option<String>,
    /// Comma-separated entries of the `keywords` meta tag, trimmed, with
    /// empty entries removed
    pub keywords: Vec<String>,
    /// Publication date (`article:published_time` or `datePublished`)
    pub published_time: Option<String>,
    /// Modification date (`article:modified_time` or `dateModified`)
    pub modified_time: Option<String>,
    /// `content` of the `robots` meta tag
    pub robots: Option<String>,
    /// `content` of the `viewport` meta tag
    pub viewport: Option<String>,
    /// Character encoding declared via `<meta charset>` or a `Content-Type`
    /// meta tag
    pub charset: Option<String>,
    /// `href` of the canonical `<link>` element
    pub canonical: Option<String>,
    /// `lang` attribute of `<html>`, or the `language` meta tag
    pub language: Option<String>,
}