halldyll_core/parse/
metadata.rs1use scraper::{Html, Selector};
4
/// Stateless extractor of page-level metadata from an HTML document.
///
/// The type carries no data, so it is free to copy and can be created with
/// either [`MetadataExtractor::new`] or `Default::default()`.
#[derive(Debug, Clone, Copy)]
pub struct MetadataExtractor;
7
8impl Default for MetadataExtractor {
9 fn default() -> Self {
10 Self
11 }
12}
13
14impl MetadataExtractor {
15 pub fn new() -> Self {
17 Self
18 }
19
20 pub fn extract(&self, html: &str) -> PageMetadata {
22 let document = Html::parse_document(html);
23
24 PageMetadata {
25 title: self.extract_title(&document),
26 description: self.extract_meta(&document, "description"),
27 author: self.extract_meta(&document, "author"),
28 keywords: self.extract_keywords(&document),
29 published_time: self.extract_time(&document, "article:published_time")
30 .or_else(|| self.extract_time(&document, "datePublished")),
31 modified_time: self.extract_time(&document, "article:modified_time")
32 .or_else(|| self.extract_time(&document, "dateModified")),
33 robots: self.extract_meta(&document, "robots"),
34 viewport: self.extract_meta(&document, "viewport"),
35 charset: self.extract_charset(&document),
36 canonical: self.extract_canonical(&document),
37 language: self.extract_language(&document),
38 }
39 }
40
41 fn extract_title(&self, document: &Html) -> Option<String> {
43 let selector = Selector::parse("title").ok()?;
44 document
45 .select(&selector)
46 .next()
47 .map(|el| el.text().collect::<Vec<_>>().join("").trim().to_string())
48 }
49
50 fn extract_meta(&self, document: &Html, name: &str) -> Option<String> {
52 let selector = Selector::parse(&format!(r#"meta[name="{}"]"#, name)).ok()?;
53 document
54 .select(&selector)
55 .next()
56 .and_then(|el| el.value().attr("content").map(String::from))
57 }
58
59 fn extract_keywords(&self, document: &Html) -> Vec<String> {
61 self.extract_meta(document, "keywords")
62 .map(|s| {
63 s.split(',')
64 .map(|k| k.trim().to_string())
65 .filter(|k| !k.is_empty())
66 .collect()
67 })
68 .unwrap_or_default()
69 }
70
71 fn extract_time(&self, document: &Html, property: &str) -> Option<String> {
73 let prop_selector = Selector::parse(&format!(r#"meta[property="{}"]"#, property)).ok();
75 if let Some(sel) = prop_selector {
76 if let Some(el) = document.select(&sel).next() {
77 if let Some(content) = el.value().attr("content") {
78 return Some(content.to_string());
79 }
80 }
81 }
82
83 let itemprop_selector = Selector::parse(&format!(r#"[itemprop="{}"]"#, property)).ok();
85 if let Some(sel) = itemprop_selector {
86 if let Some(el) = document.select(&sel).next() {
87 if let Some(dt) = el.value().attr("datetime") {
89 return Some(dt.to_string());
90 }
91 if let Some(content) = el.value().attr("content") {
92 return Some(content.to_string());
93 }
94 }
95 }
96
97 None
98 }
99
100 fn extract_charset(&self, document: &Html) -> Option<String> {
102 let charset_selector = Selector::parse("meta[charset]").ok()?;
104 if let Some(el) = document.select(&charset_selector).next() {
105 if let Some(charset) = el.value().attr("charset") {
106 return Some(charset.to_string());
107 }
108 }
109
110 let content_type_selector = Selector::parse(r#"meta[http-equiv="Content-Type"]"#).ok()?;
112 if let Some(el) = document.select(&content_type_selector).next() {
113 if let Some(content) = el.value().attr("content") {
114 if let Some(pos) = content.to_lowercase().find("charset=") {
115 let charset: String = content[pos + 8..]
116 .chars()
117 .take_while(|&c| c != ';' && c != ' ' && c != '"')
118 .collect();
119 return Some(charset);
120 }
121 }
122 }
123
124 None
125 }
126
127 fn extract_canonical(&self, document: &Html) -> Option<String> {
129 let selector = Selector::parse(r#"link[rel="canonical"]"#).ok()?;
130 document
131 .select(&selector)
132 .next()
133 .and_then(|el| el.value().attr("href").map(String::from))
134 }
135
136 fn extract_language(&self, document: &Html) -> Option<String> {
138 let html_selector = Selector::parse("html").ok()?;
140 if let Some(html) = document.select(&html_selector).next() {
141 if let Some(lang) = html.value().attr("lang") {
142 return Some(lang.to_string());
143 }
144 }
145
146 self.extract_meta(document, "language")
148 }
149}
150
/// Page-level metadata collected from an HTML document.
///
/// All values are stored as they appear in the document; timestamps and URLs
/// are not parsed or normalised.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageMetadata {
    /// Trimmed text of the `<title>` element.
    pub title: Option<String>,
    /// `content` of `<meta name="description">`.
    pub description: Option<String>,
    /// `content` of `<meta name="author">`.
    pub author: Option<String>,
    /// Comma-separated entries of `<meta name="keywords">`, trimmed, with
    /// empty entries removed.
    pub keywords: Vec<String>,
    /// Publication timestamp from `article:published_time` or `datePublished`.
    pub published_time: Option<String>,
    /// Modification timestamp from `article:modified_time` or `dateModified`.
    pub modified_time: Option<String>,
    /// `content` of `<meta name="robots">`.
    pub robots: Option<String>,
    /// `content` of `<meta name="viewport">`.
    pub viewport: Option<String>,
    /// Character encoding declared via `<meta charset>` or the `Content-Type`
    /// `http-equiv` meta tag.
    pub charset: Option<String>,
    /// `href` of `<link rel="canonical">`.
    pub canonical: Option<String>,
    /// `lang` attribute of `<html>`, or the `language` meta tag.
    pub language: Option<String>,
}