reasonkit_web/extraction/
metadata.rs1use crate::browser::PageHandle;
7use crate::error::{ExtractionError, Result};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use tracing::{debug, info, instrument};
11
/// Metadata collected from a single web page.
///
/// Every field is optional or may be empty; `Default` yields a value with no
/// data at all.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Document title, if any.
    pub title: Option<String>,
    /// Content of the `description` meta tag.
    pub description: Option<String>,
    /// `href` of the page's `<link rel="canonical">` element.
    pub canonical: Option<String>,
    /// `lang` attribute of the document's root element.
    pub language: Option<String>,
    /// Content of the `author` meta tag.
    pub author: Option<String>,
    /// Entries of the `keywords` meta tag, split on commas and trimmed.
    pub keywords: Vec<String>,
    /// Values taken from `og:*` meta tags.
    pub open_graph: OpenGraphData,
    /// Values taken from `twitter:*` meta tags.
    pub twitter_card: TwitterCardData,
    /// `href` of the page's icon link (`rel="icon"` or `rel="shortcut icon"`).
    pub favicon: Option<String>,
    /// Every meta tag that has both a name (or property) and content,
    /// keyed by that name.
    pub meta_tags: HashMap<String, String>,
    /// Parsed contents of the page's `application/ld+json` script blocks.
    pub json_ld: Vec<serde_json::Value>,
}
38
/// Open Graph values read from a page's `og:*` meta tags.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OpenGraphData {
    /// `og:title`
    pub title: Option<String>,
    /// `og:description`
    pub description: Option<String>,
    /// `og:image`
    pub image: Option<String>,
    /// `og:url`
    pub url: Option<String>,
    /// `og:type` (named `og_type` because `type` is a Rust keyword).
    pub og_type: Option<String>,
    /// `og:site_name`
    pub site_name: Option<String>,
    /// `og:locale`
    pub locale: Option<String>,
}
57
/// Twitter Card values read from a page's `twitter:*` meta tags.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TwitterCardData {
    /// `twitter:card`
    pub card: Option<String>,
    /// `twitter:title`
    pub title: Option<String>,
    /// `twitter:description`
    pub description: Option<String>,
    /// `twitter:image`
    pub image: Option<String>,
    /// `twitter:site`
    pub site: Option<String>,
    /// `twitter:creator`
    pub creator: Option<String>,
}
74
/// Stateless namespace for page-metadata extraction and for the helpers that
/// pick the preferred title, description, and image from a [`PageMetadata`].
pub struct MetadataExtractor;
77
78impl MetadataExtractor {
79 #[instrument(skip(page))]
81 pub async fn extract(page: &PageHandle) -> Result<PageMetadata> {
82 info!("Extracting page metadata");
83
84 let script = r#"
85 (() => {
86 const result = {
87 title: document.title,
88 description: null,
89 canonical: null,
90 language: document.documentElement.lang || null,
91 author: null,
92 keywords: [],
93 openGraph: {},
94 twitterCard: {},
95 favicon: null,
96 metaTags: {},
97 jsonLd: []
98 };
99
100 // Extract meta tags
101 document.querySelectorAll('meta').forEach(meta => {
102 const name = meta.getAttribute('name') || meta.getAttribute('property');
103 const content = meta.getAttribute('content');
104
105 if (!name || !content) return;
106
107 result.metaTags[name] = content;
108
109 // Standard meta
110 if (name === 'description') result.description = content;
111 if (name === 'author') result.author = content;
112 if (name === 'keywords') {
113 result.keywords = content.split(',').map(k => k.trim()).filter(k => k);
114 }
115
116 // Open Graph
117 if (name.startsWith('og:')) {
118 const key = name.replace('og:', '');
119 result.openGraph[key] = content;
120 }
121
122 // Twitter Card
123 if (name.startsWith('twitter:')) {
124 const key = name.replace('twitter:', '');
125 result.twitterCard[key] = content;
126 }
127 });
128
129 // Canonical URL
130 const canonical = document.querySelector('link[rel="canonical"]');
131 if (canonical) {
132 result.canonical = canonical.getAttribute('href');
133 }
134
135 // Favicon
136 const favicon = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]');
137 if (favicon) {
138 result.favicon = favicon.getAttribute('href');
139 }
140
141 // JSON-LD
142 document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
143 try {
144 const data = JSON.parse(script.textContent);
145 result.jsonLd.push(data);
146 } catch (e) {}
147 });
148
149 return result;
150 })()
151 "#;
152
153 let result: serde_json::Value = page
154 .page
155 .evaluate(script)
156 .await
157 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
158 .into_value()
159 .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
160
161 let og = &result["openGraph"];
162 let tw = &result["twitterCard"];
163
164 let metadata = PageMetadata {
165 title: result["title"].as_str().map(String::from),
166 description: result["description"].as_str().map(String::from),
167 canonical: result["canonical"].as_str().map(String::from),
168 language: result["language"].as_str().map(String::from),
169 author: result["author"].as_str().map(String::from),
170 keywords: result["keywords"]
171 .as_array()
172 .map(|arr| {
173 arr.iter()
174 .filter_map(|v| v.as_str().map(String::from))
175 .collect()
176 })
177 .unwrap_or_default(),
178 open_graph: OpenGraphData {
179 title: og["title"].as_str().map(String::from),
180 description: og["description"].as_str().map(String::from),
181 image: og["image"].as_str().map(String::from),
182 url: og["url"].as_str().map(String::from),
183 og_type: og["type"].as_str().map(String::from),
184 site_name: og["site_name"].as_str().map(String::from),
185 locale: og["locale"].as_str().map(String::from),
186 },
187 twitter_card: TwitterCardData {
188 card: tw["card"].as_str().map(String::from),
189 title: tw["title"].as_str().map(String::from),
190 description: tw["description"].as_str().map(String::from),
191 image: tw["image"].as_str().map(String::from),
192 site: tw["site"].as_str().map(String::from),
193 creator: tw["creator"].as_str().map(String::from),
194 },
195 favicon: result["favicon"].as_str().map(String::from),
196 meta_tags: result["metaTags"]
197 .as_object()
198 .map(|obj| {
199 obj.iter()
200 .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
201 .collect()
202 })
203 .unwrap_or_default(),
204 json_ld: result["jsonLd"].as_array().cloned().unwrap_or_default(),
205 };
206
207 debug!(
208 "Extracted metadata: title={:?}, description={:?}",
209 metadata.title, metadata.description
210 );
211
212 Ok(metadata)
213 }
214
215 pub fn best_title(metadata: &PageMetadata) -> Option<String> {
217 metadata
218 .open_graph
219 .title
220 .clone()
221 .or_else(|| metadata.twitter_card.title.clone())
222 .or_else(|| metadata.title.clone())
223 }
224
225 pub fn best_description(metadata: &PageMetadata) -> Option<String> {
227 metadata
228 .open_graph
229 .description
230 .clone()
231 .or_else(|| metadata.twitter_card.description.clone())
232 .or_else(|| metadata.description.clone())
233 }
234
235 pub fn best_image(metadata: &PageMetadata) -> Option<String> {
237 metadata
238 .open_graph
239 .image
240 .clone()
241 .or_else(|| metadata.twitter_card.image.clone())
242 }
243}
244
#[cfg(test)]
mod tests {
    use super::*;

    /// A defaulted `PageMetadata` carries no data.
    #[test]
    fn test_page_metadata_default() {
        let meta = PageMetadata::default();
        assert!(meta.title.is_none());
        assert!(meta.keywords.is_empty());
        assert!(meta.json_ld.is_empty());
    }

    /// Open Graph wins over the document title, which is the last fallback.
    #[test]
    fn test_best_title() {
        let mut meta = PageMetadata {
            title: Some("Page Title".to_string()),
            ..Default::default()
        };
        meta.open_graph.title = Some("OG Title".to_string());

        assert_eq!(
            MetadataExtractor::best_title(&meta),
            Some("OG Title".to_string())
        );

        meta.open_graph.title = None;
        assert_eq!(
            MetadataExtractor::best_title(&meta),
            Some("Page Title".to_string())
        );
    }

    /// The Twitter Card title sits between Open Graph and the document title.
    #[test]
    fn test_best_title_twitter_fallback() {
        let mut meta = PageMetadata {
            title: Some("Page Title".to_string()),
            ..Default::default()
        };
        meta.twitter_card.title = Some("Twitter Title".to_string());

        assert_eq!(
            MetadataExtractor::best_title(&meta),
            Some("Twitter Title".to_string())
        );

        assert_eq!(MetadataExtractor::best_title(&PageMetadata::default()), None);
    }

    /// Descriptions follow the same Open Graph -> Twitter -> page order.
    #[test]
    fn test_best_description() {
        let mut meta = PageMetadata {
            description: Some("Page Desc".to_string()),
            ..Default::default()
        };
        assert_eq!(
            MetadataExtractor::best_description(&meta),
            Some("Page Desc".to_string())
        );

        meta.twitter_card.description = Some("Twitter Desc".to_string());
        assert_eq!(
            MetadataExtractor::best_description(&meta),
            Some("Twitter Desc".to_string())
        );

        meta.open_graph.description = Some("OG Desc".to_string());
        assert_eq!(
            MetadataExtractor::best_description(&meta),
            Some("OG Desc".to_string())
        );
    }

    /// Images fall back from Open Graph to Twitter Card, then to nothing.
    #[test]
    fn test_best_image() {
        let mut meta = PageMetadata::default();
        assert_eq!(MetadataExtractor::best_image(&meta), None);

        meta.twitter_card.image = Some("https://example.com/tw.jpg".to_string());
        assert_eq!(
            MetadataExtractor::best_image(&meta),
            Some("https://example.com/tw.jpg".to_string())
        );

        meta.open_graph.image = Some("https://example.com/og.jpg".to_string());
        assert_eq!(
            MetadataExtractor::best_image(&meta),
            Some("https://example.com/og.jpg".to_string())
        );
    }

    #[test]
    fn test_open_graph_data() {
        let og = OpenGraphData {
            title: Some("Title".to_string()),
            description: Some("Desc".to_string()),
            image: Some("https://example.com/img.jpg".to_string()),
            url: Some("https://example.com".to_string()),
            og_type: Some("article".to_string()),
            site_name: Some("Example".to_string()),
            locale: Some("en_US".to_string()),
        };

        assert_eq!(og.og_type, Some("article".to_string()));
    }

    #[test]
    fn test_twitter_card_data() {
        let tw = TwitterCardData {
            card: Some("summary_large_image".to_string()),
            title: Some("Title".to_string()),
            description: Some("Desc".to_string()),
            image: Some("https://example.com/img.jpg".to_string()),
            site: Some("@example".to_string()),
            creator: Some("@author".to_string()),
        };

        assert_eq!(tw.card, Some("summary_large_image".to_string()));
    }
}