reasonkit_web/extraction/
metadata.rs

//! Page metadata extraction
//!
//! This module extracts page metadata including title, description,
//! Open Graph data, Twitter Card data, and JSON-LD structured data.
5
6use crate::browser::PageHandle;
7use crate::error::{ExtractionError, Result};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use tracing::{debug, info, instrument};
11
12/// Extracted page metadata
13#[derive(Debug, Clone, Default, Serialize, Deserialize)]
14pub struct PageMetadata {
15    /// Page title
16    pub title: Option<String>,
17    /// Meta description
18    pub description: Option<String>,
19    /// Canonical URL
20    pub canonical: Option<String>,
21    /// Language
22    pub language: Option<String>,
23    /// Author
24    pub author: Option<String>,
25    /// Keywords
26    pub keywords: Vec<String>,
27    /// Open Graph metadata
28    pub open_graph: OpenGraphData,
29    /// Twitter Card metadata
30    pub twitter_card: TwitterCardData,
31    /// Favicon URL
32    pub favicon: Option<String>,
33    /// All meta tags
34    pub meta_tags: HashMap<String, String>,
35    /// JSON-LD structured data
36    pub json_ld: Vec<serde_json::Value>,
37}
38
39/// Open Graph metadata
40#[derive(Debug, Clone, Default, Serialize, Deserialize)]
41pub struct OpenGraphData {
42    /// og:title
43    pub title: Option<String>,
44    /// og:description
45    pub description: Option<String>,
46    /// og:image
47    pub image: Option<String>,
48    /// og:url
49    pub url: Option<String>,
50    /// og:type
51    pub og_type: Option<String>,
52    /// og:site_name
53    pub site_name: Option<String>,
54    /// og:locale
55    pub locale: Option<String>,
56}
57
58/// Twitter Card metadata
59#[derive(Debug, Clone, Default, Serialize, Deserialize)]
60pub struct TwitterCardData {
61    /// twitter:card
62    pub card: Option<String>,
63    /// twitter:title
64    pub title: Option<String>,
65    /// twitter:description
66    pub description: Option<String>,
67    /// twitter:image
68    pub image: Option<String>,
69    /// twitter:site
70    pub site: Option<String>,
71    /// twitter:creator
72    pub creator: Option<String>,
73}
74
/// Metadata extraction functionality.
///
/// This is a field-less marker type: it carries no state and only serves as
/// the namespace for the extraction routines. It implements the common
/// traits (`Debug`, `Clone`, `Copy`, `Default`) so it can be embedded in
/// other derived types without friction.
#[derive(Debug, Clone, Copy, Default)]
pub struct MetadataExtractor;
77
78impl MetadataExtractor {
79    /// Extract all metadata from the page
80    #[instrument(skip(page))]
81    pub async fn extract(page: &PageHandle) -> Result<PageMetadata> {
82        info!("Extracting page metadata");
83
84        let script = r#"
85            (() => {
86                const result = {
87                    title: document.title,
88                    description: null,
89                    canonical: null,
90                    language: document.documentElement.lang || null,
91                    author: null,
92                    keywords: [],
93                    openGraph: {},
94                    twitterCard: {},
95                    favicon: null,
96                    metaTags: {},
97                    jsonLd: []
98                };
99
100                // Extract meta tags
101                document.querySelectorAll('meta').forEach(meta => {
102                    const name = meta.getAttribute('name') || meta.getAttribute('property');
103                    const content = meta.getAttribute('content');
104
105                    if (!name || !content) return;
106
107                    result.metaTags[name] = content;
108
109                    // Standard meta
110                    if (name === 'description') result.description = content;
111                    if (name === 'author') result.author = content;
112                    if (name === 'keywords') {
113                        result.keywords = content.split(',').map(k => k.trim()).filter(k => k);
114                    }
115
116                    // Open Graph
117                    if (name.startsWith('og:')) {
118                        const key = name.replace('og:', '');
119                        result.openGraph[key] = content;
120                    }
121
122                    // Twitter Card
123                    if (name.startsWith('twitter:')) {
124                        const key = name.replace('twitter:', '');
125                        result.twitterCard[key] = content;
126                    }
127                });
128
129                // Canonical URL
130                const canonical = document.querySelector('link[rel="canonical"]');
131                if (canonical) {
132                    result.canonical = canonical.getAttribute('href');
133                }
134
135                // Favicon
136                const favicon = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]');
137                if (favicon) {
138                    result.favicon = favicon.getAttribute('href');
139                }
140
141                // JSON-LD
142                document.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
143                    try {
144                        const data = JSON.parse(script.textContent);
145                        result.jsonLd.push(data);
146                    } catch (e) {}
147                });
148
149                return result;
150            })()
151        "#;
152
153        let result: serde_json::Value = page
154            .page
155            .evaluate(script)
156            .await
157            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?
158            .into_value()
159            .map_err(|e| ExtractionError::ExtractionFailed(e.to_string()))?;
160
161        let og = &result["openGraph"];
162        let tw = &result["twitterCard"];
163
164        let metadata = PageMetadata {
165            title: result["title"].as_str().map(String::from),
166            description: result["description"].as_str().map(String::from),
167            canonical: result["canonical"].as_str().map(String::from),
168            language: result["language"].as_str().map(String::from),
169            author: result["author"].as_str().map(String::from),
170            keywords: result["keywords"]
171                .as_array()
172                .map(|arr| {
173                    arr.iter()
174                        .filter_map(|v| v.as_str().map(String::from))
175                        .collect()
176                })
177                .unwrap_or_default(),
178            open_graph: OpenGraphData {
179                title: og["title"].as_str().map(String::from),
180                description: og["description"].as_str().map(String::from),
181                image: og["image"].as_str().map(String::from),
182                url: og["url"].as_str().map(String::from),
183                og_type: og["type"].as_str().map(String::from),
184                site_name: og["site_name"].as_str().map(String::from),
185                locale: og["locale"].as_str().map(String::from),
186            },
187            twitter_card: TwitterCardData {
188                card: tw["card"].as_str().map(String::from),
189                title: tw["title"].as_str().map(String::from),
190                description: tw["description"].as_str().map(String::from),
191                image: tw["image"].as_str().map(String::from),
192                site: tw["site"].as_str().map(String::from),
193                creator: tw["creator"].as_str().map(String::from),
194            },
195            favicon: result["favicon"].as_str().map(String::from),
196            meta_tags: result["metaTags"]
197                .as_object()
198                .map(|obj| {
199                    obj.iter()
200                        .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
201                        .collect()
202                })
203                .unwrap_or_default(),
204            json_ld: result["jsonLd"].as_array().cloned().unwrap_or_default(),
205        };
206
207        debug!(
208            "Extracted metadata: title={:?}, description={:?}",
209            metadata.title, metadata.description
210        );
211
212        Ok(metadata)
213    }
214
215    /// Get the best title from available sources
216    pub fn best_title(metadata: &PageMetadata) -> Option<String> {
217        metadata
218            .open_graph
219            .title
220            .clone()
221            .or_else(|| metadata.twitter_card.title.clone())
222            .or_else(|| metadata.title.clone())
223    }
224
225    /// Get the best description from available sources
226    pub fn best_description(metadata: &PageMetadata) -> Option<String> {
227        metadata
228            .open_graph
229            .description
230            .clone()
231            .or_else(|| metadata.twitter_card.description.clone())
232            .or_else(|| metadata.description.clone())
233    }
234
235    /// Get the best image from available sources
236    pub fn best_image(metadata: &PageMetadata) -> Option<String> {
237        metadata
238            .open_graph
239            .image
240            .clone()
241            .or_else(|| metadata.twitter_card.image.clone())
242    }
243}
244
#[cfg(test)]
mod tests {
    use super::*;

    /// A default `PageMetadata` carries no data at all.
    #[test]
    fn test_page_metadata_default() {
        let meta = PageMetadata::default();
        assert!(meta.title.is_none());
        assert!(meta.keywords.is_empty());
        assert!(meta.meta_tags.is_empty());
        assert!(meta.json_ld.is_empty());
        assert!(meta.open_graph.title.is_none());
        assert!(meta.twitter_card.title.is_none());
    }

    /// Title preference: Open Graph, then Twitter Card, then document title.
    #[test]
    fn test_best_title() {
        let mut meta = PageMetadata {
            title: Some("Page Title".to_string()),
            ..Default::default()
        };
        meta.open_graph.title = Some("OG Title".to_string());
        meta.twitter_card.title = Some("Twitter Title".to_string());

        // OG title should take precedence
        assert_eq!(
            MetadataExtractor::best_title(&meta),
            Some("OG Title".to_string())
        );

        // Without OG, fall back to the Twitter Card title
        meta.open_graph.title = None;
        assert_eq!(
            MetadataExtractor::best_title(&meta),
            Some("Twitter Title".to_string())
        );

        // Without OG and Twitter, use page title
        meta.twitter_card.title = None;
        assert_eq!(
            MetadataExtractor::best_title(&meta),
            Some("Page Title".to_string())
        );

        // Nothing available at all
        assert_eq!(
            MetadataExtractor::best_title(&PageMetadata::default()),
            None
        );
    }

    /// Description preference: Open Graph, then Twitter Card, then meta tag.
    #[test]
    fn test_best_description() {
        let mut meta = PageMetadata {
            description: Some("Meta Desc".to_string()),
            ..Default::default()
        };
        meta.open_graph.description = Some("OG Desc".to_string());
        meta.twitter_card.description = Some("Twitter Desc".to_string());

        assert_eq!(
            MetadataExtractor::best_description(&meta),
            Some("OG Desc".to_string())
        );

        meta.open_graph.description = None;
        assert_eq!(
            MetadataExtractor::best_description(&meta),
            Some("Twitter Desc".to_string())
        );

        meta.twitter_card.description = None;
        assert_eq!(
            MetadataExtractor::best_description(&meta),
            Some("Meta Desc".to_string())
        );

        assert_eq!(
            MetadataExtractor::best_description(&PageMetadata::default()),
            None
        );
    }

    /// Image preference: Open Graph, then Twitter Card.
    #[test]
    fn test_best_image() {
        let mut meta = PageMetadata::default();
        assert_eq!(MetadataExtractor::best_image(&meta), None);

        meta.twitter_card.image = Some("https://example.com/tw.jpg".to_string());
        assert_eq!(
            MetadataExtractor::best_image(&meta),
            Some("https://example.com/tw.jpg".to_string())
        );

        meta.open_graph.image = Some("https://example.com/og.jpg".to_string());
        assert_eq!(
            MetadataExtractor::best_image(&meta),
            Some("https://example.com/og.jpg".to_string())
        );
    }

    #[test]
    fn test_open_graph_data() {
        let og = OpenGraphData {
            title: Some("Title".to_string()),
            description: Some("Desc".to_string()),
            image: Some("https://example.com/img.jpg".to_string()),
            url: Some("https://example.com".to_string()),
            og_type: Some("article".to_string()),
            site_name: Some("Example".to_string()),
            locale: Some("en_US".to_string()),
        };

        assert_eq!(og.title.as_deref(), Some("Title"));
        assert_eq!(og.description.as_deref(), Some("Desc"));
        assert_eq!(og.image.as_deref(), Some("https://example.com/img.jpg"));
        assert_eq!(og.url.as_deref(), Some("https://example.com"));
        assert_eq!(og.og_type, Some("article".to_string()));
        assert_eq!(og.site_name.as_deref(), Some("Example"));
        assert_eq!(og.locale.as_deref(), Some("en_US"));
    }

    #[test]
    fn test_twitter_card_data() {
        let tw = TwitterCardData {
            card: Some("summary_large_image".to_string()),
            title: Some("Title".to_string()),
            description: Some("Desc".to_string()),
            image: Some("https://example.com/img.jpg".to_string()),
            site: Some("@example".to_string()),
            creator: Some("@author".to_string()),
        };

        assert_eq!(tw.card, Some("summary_large_image".to_string()));
        assert_eq!(tw.title.as_deref(), Some("Title"));
        assert_eq!(tw.description.as_deref(), Some("Desc"));
        assert_eq!(tw.image.as_deref(), Some("https://example.com/img.jpg"));
        assert_eq!(tw.site.as_deref(), Some("@example"));
        assert_eq!(tw.creator.as_deref(), Some("@author"));
    }
}