essence/format/
metadata.rs

1use crate::{
2    engines::{detection::RenderingDetector, RawScrapeResult},
3    error::Result,
4    format::advanced_extraction::AdvancedExtractor,
5    types::{Metadata, ScrapeRequest},
6};
7use scraper::{Html, Selector};
8
9/// Extract metadata from raw scrape result
10pub fn extract_metadata(raw: &RawScrapeResult, _request: &ScrapeRequest) -> Result<Metadata> {
11    let document = Html::parse_document(&raw.html);
12
13    // Try advanced extraction for article content
14    let (word_count, reading_time, excerpt, detected_language) =
15        if let Ok(article) = AdvancedExtractor::extract_article(&raw.html, &raw.url) {
16            (
17                Some(article.word_count),
18                Some(article.reading_time),
19                article.excerpt,
20                article.language,
21            )
22        } else {
23            // Fallback to basic extraction if Readability fails
24            let text = document.root_element().text().collect::<String>();
25            let word_count = AdvancedExtractor::count_words(&text);
26            (
27                Some(word_count),
28                Some(AdvancedExtractor::estimate_reading_time(word_count)),
29                AdvancedExtractor::generate_excerpt(&text),
30                AdvancedExtractor::detect_language(&text),
31            )
32        };
33
34    // Perform JS detection for metadata
35    let detection = RenderingDetector::needs_javascript(&raw.html, &raw.url);
36
37    Ok(Metadata {
38        title: extract_title(&document),
39        description: extract_description(&document),
40        language: extract_language(&document).or(detected_language),
41        keywords: extract_keywords(&document),
42        robots: extract_robots(&document),
43        og_title: extract_og_tag(&document, "og:title"),
44        og_description: extract_og_tag(&document, "og:description"),
45        og_url: extract_og_tag(&document, "og:url"),
46        og_image: extract_og_tag(&document, "og:image"),
47        url: Some(raw.url.clone()),
48        source_url: Some(raw.url.clone()),
49        status_code: raw.status_code,
50        content_type: raw.content_type.clone(),
51        canonical_url: extract_canonical_url(&document),
52        word_count,
53        reading_time,
54        excerpt,
55        detected_frameworks: if detection.detected_frameworks.is_empty() {
56            None
57        } else {
58            Some(detection.detected_frameworks)
59        },
60        detection_reason: Some(detection.reason),
61        content_script_ratio: Some(detection.content_script_ratio),
62    })
63}
64
65/// Extract page title from <title> tag
66fn extract_title(document: &Html) -> Option<String> {
67    let selector = Selector::parse("title").ok()?;
68    document
69        .select(&selector)
70        .next()
71        .map(|el| el.text().collect::<String>().trim().to_string())
72        .filter(|s| !s.is_empty())
73}
74
75/// Extract description from meta tag, with fallbacks to OG, Twitter, and first paragraph
76fn extract_description(document: &Html) -> Option<String> {
77    extract_meta_content(document, "name", "description")
78        .or_else(|| extract_meta_content(document, "property", "description"))
79        .or_else(|| extract_meta_content(document, "property", "og:description"))
80        .or_else(|| extract_meta_content(document, "name", "twitter:description"))
81        .or_else(|| extract_first_paragraph(document))
82}
83
84/// Extract a description from the first meaningful text block of the page.
85/// Used as a last resort when no meta description is available.
86/// Checks p, div, font, td, li elements in priority order.
87fn extract_first_paragraph(document: &Html) -> Option<String> {
88    // Try multiple selectors in order of semantic priority
89    let selectors = ["p", "div", "font", "td", "li"];
90
91    for sel_str in &selectors {
92        if let Ok(selector) = Selector::parse(sel_str) {
93            for el in document.select(&selector) {
94                let text = el.text().collect::<String>().trim().to_string();
95                // Only use elements with substantial text (> 80 chars)
96                // and that look like content (not navigation/UI)
97                if text.len() > 80 && !looks_like_navigation(&text) {
98                    // Truncate to ~200 chars at word boundary
99                    let desc = if text.len() > 200 {
100                        match text[..200].rfind(' ') {
101                            Some(pos) => format!("{}...", &text[..pos]),
102                            None => format!("{}...", &text[..200]),
103                        }
104                    } else {
105                        text
106                    };
107                    return Some(desc);
108                }
109            }
110        }
111    }
112    None
113}
114
/// Heuristic: does `text` read like navigation/UI chrome rather than page content?
///
/// True when the lowercased text begins with `skip to`, `menu` or `search`, or when the
/// text is shorter than 150 bytes yet contains more than five newline characters.
fn looks_like_navigation(text: &str) -> bool {
    const NAV_PREFIXES: [&str; 3] = ["skip to", "menu", "search"];

    let lowered = text.to_lowercase();
    if NAV_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(*prefix))
    {
        return true;
    }

    // A short block split across many lines is treated as a list of nav items.
    let newline_count = text.matches('\n').count();
    text.len() < 150 && newline_count > 5
}
124
125/// Extract language from <html lang="...">
126fn extract_language(document: &Html) -> Option<String> {
127    let selector = Selector::parse("html").ok()?;
128    document
129        .select(&selector)
130        .next()
131        .and_then(|el| el.value().attr("lang"))
132        .map(|s| s.to_string())
133}
134
135/// Extract keywords from meta tag
136fn extract_keywords(document: &Html) -> Option<String> {
137    extract_meta_content(document, "name", "keywords")
138}
139
140/// Extract robots from meta tag
141fn extract_robots(document: &Html) -> Option<String> {
142    extract_meta_content(document, "name", "robots")
143}
144
145/// Extract Open Graph tag
146fn extract_og_tag(document: &Html, property: &str) -> Option<String> {
147    extract_meta_content(document, "property", property)
148}
149
150/// Extract canonical URL from <link rel="canonical">
151fn extract_canonical_url(document: &Html) -> Option<String> {
152    let selector = Selector::parse("link[rel='canonical']").ok()?;
153    document
154        .select(&selector)
155        .next()
156        .and_then(|el| el.value().attr("href"))
157        .map(|s| s.to_string())
158}
159
160/// Generic meta tag extractor
161fn extract_meta_content(document: &Html, attr_name: &str, attr_value: &str) -> Option<String> {
162    let selector_str = format!("meta[{}='{}']", attr_name, attr_value);
163    let selector = Selector::parse(&selector_str).ok()?;
164
165    document
166        .select(&selector)
167        .next()
168        .and_then(|el| el.value().attr("content"))
169        .map(|s| s.trim().to_string())
170        .filter(|s| !s.is_empty())
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse an HTML snippet into a document for the extractors under test.
    fn doc(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_title() {
        let page = doc("<html><head><title>Test Page</title></head></html>");
        assert_eq!(extract_title(&page).as_deref(), Some("Test Page"));
    }

    #[test]
    fn test_extract_description() {
        let page = doc(
            r#"<html><head><meta name="description" content="Test description"></head></html>"#,
        );
        assert_eq!(
            extract_description(&page).as_deref(),
            Some("Test description")
        );
    }

    #[test]
    fn test_extract_description_og_fallback() {
        // With no standard description present, og:description is used.
        let page = doc(
            r#"<html><head><meta property="og:description" content="OG desc"></head></html>"#,
        );
        assert_eq!(extract_description(&page).as_deref(), Some("OG desc"));
    }

    #[test]
    fn test_extract_description_twitter_fallback() {
        // With neither a standard nor an OG description, twitter:description is used.
        let page = doc(
            r#"<html><head><meta name="twitter:description" content="Twitter desc"></head></html>"#,
        );
        assert_eq!(extract_description(&page).as_deref(), Some("Twitter desc"));
    }

    #[test]
    fn test_extract_description_prefers_standard() {
        // The standard description outranks og:description when both exist.
        let page = doc(
            r#"<html><head>
            <meta name="description" content="Standard desc">
            <meta property="og:description" content="OG desc">
        </head></html>"#,
        );
        assert_eq!(extract_description(&page).as_deref(), Some("Standard desc"));
    }

    #[test]
    fn test_extract_og_tags() {
        let page = doc(
            r#"
            <html>
                <head>
                    <meta property="og:title" content="OG Title">
                    <meta property="og:description" content="OG Description">
                    <meta property="og:image" content="https://example.com/image.jpg">
                </head>
            </html>
        "#,
        );

        let expectations = [
            ("og:title", "OG Title"),
            ("og:description", "OG Description"),
            ("og:image", "https://example.com/image.jpg"),
        ];
        for (property, expected) in expectations {
            assert_eq!(extract_og_tag(&page, property).as_deref(), Some(expected));
        }
    }

    #[test]
    fn test_extract_canonical_url() {
        let page = doc(
            r#"<html><head><link rel="canonical" href="https://example.com/canonical"></head></html>"#,
        );
        assert_eq!(
            extract_canonical_url(&page).as_deref(),
            Some("https://example.com/canonical")
        );
    }

    #[test]
    fn test_extract_language() {
        let page = doc(r#"<html lang="en-US"><head></head></html>"#);
        assert_eq!(extract_language(&page).as_deref(), Some("en-US"));
    }
}
essence/format/metadata.rs

essence/format/
metadata.rs