essence/format/
metadata.rs1use crate::{
2 engines::{detection::RenderingDetector, RawScrapeResult},
3 error::Result,
4 format::advanced_extraction::AdvancedExtractor,
5 types::{Metadata, ScrapeRequest},
6};
7use scraper::{Html, Selector};
8
9pub fn extract_metadata(raw: &RawScrapeResult, _request: &ScrapeRequest) -> Result<Metadata> {
11 let document = Html::parse_document(&raw.html);
12
13 let (word_count, reading_time, excerpt, detected_language) =
15 if let Ok(article) = AdvancedExtractor::extract_article(&raw.html, &raw.url) {
16 (
17 Some(article.word_count),
18 Some(article.reading_time),
19 article.excerpt,
20 article.language,
21 )
22 } else {
23 let text = document.root_element().text().collect::<String>();
25 let word_count = AdvancedExtractor::count_words(&text);
26 (
27 Some(word_count),
28 Some(AdvancedExtractor::estimate_reading_time(word_count)),
29 AdvancedExtractor::generate_excerpt(&text),
30 AdvancedExtractor::detect_language(&text),
31 )
32 };
33
34 let detection = RenderingDetector::needs_javascript(&raw.html, &raw.url);
36
37 Ok(Metadata {
38 title: extract_title(&document),
39 description: extract_description(&document),
40 language: extract_language(&document).or(detected_language),
41 keywords: extract_keywords(&document),
42 robots: extract_robots(&document),
43 og_title: extract_og_tag(&document, "og:title"),
44 og_description: extract_og_tag(&document, "og:description"),
45 og_url: extract_og_tag(&document, "og:url"),
46 og_image: extract_og_tag(&document, "og:image"),
47 url: Some(raw.url.clone()),
48 source_url: Some(raw.url.clone()),
49 status_code: raw.status_code,
50 content_type: raw.content_type.clone(),
51 canonical_url: extract_canonical_url(&document),
52 word_count,
53 reading_time,
54 excerpt,
55 detected_frameworks: if detection.detected_frameworks.is_empty() {
56 None
57 } else {
58 Some(detection.detected_frameworks)
59 },
60 detection_reason: Some(detection.reason),
61 content_script_ratio: Some(detection.content_script_ratio),
62 })
63}
64
65fn extract_title(document: &Html) -> Option<String> {
67 let selector = Selector::parse("title").ok()?;
68 document
69 .select(&selector)
70 .next()
71 .map(|el| el.text().collect::<String>().trim().to_string())
72 .filter(|s| !s.is_empty())
73}
74
75fn extract_description(document: &Html) -> Option<String> {
77 extract_meta_content(document, "name", "description")
78 .or_else(|| extract_meta_content(document, "property", "description"))
79 .or_else(|| extract_meta_content(document, "property", "og:description"))
80 .or_else(|| extract_meta_content(document, "name", "twitter:description"))
81 .or_else(|| extract_first_paragraph(document))
82}
83
84fn extract_first_paragraph(document: &Html) -> Option<String> {
88 let selectors = ["p", "div", "font", "td", "li"];
90
91 for sel_str in &selectors {
92 if let Ok(selector) = Selector::parse(sel_str) {
93 for el in document.select(&selector) {
94 let text = el.text().collect::<String>().trim().to_string();
95 if text.len() > 80 && !looks_like_navigation(&text) {
98 let desc = if text.len() > 200 {
100 match text[..200].rfind(' ') {
101 Some(pos) => format!("{}...", &text[..pos]),
102 None => format!("{}...", &text[..200]),
103 }
104 } else {
105 text
106 };
107 return Some(desc);
108 }
109 }
110 }
111 }
112 None
113}
114
/// Heuristically decides whether `text` is navigation chrome rather than
/// page content.
///
/// Text is flagged when, compared case-insensitively, it begins with
/// `skip to`, `menu` or `search`, or when it is shorter than 150 bytes yet
/// contains more than five line breaks (many short lines, as in a link list).
fn looks_like_navigation(text: &str) -> bool {
    const NAV_PREFIXES: [&str; 3] = ["skip to", "menu", "search"];

    let lowered = text.to_lowercase();
    if NAV_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(prefix))
    {
        return true;
    }

    let line_breaks = text.matches('\n').count();
    text.len() < 150 && line_breaks > 5
}
124
125fn extract_language(document: &Html) -> Option<String> {
127 let selector = Selector::parse("html").ok()?;
128 document
129 .select(&selector)
130 .next()
131 .and_then(|el| el.value().attr("lang"))
132 .map(|s| s.to_string())
133}
134
135fn extract_keywords(document: &Html) -> Option<String> {
137 extract_meta_content(document, "name", "keywords")
138}
139
140fn extract_robots(document: &Html) -> Option<String> {
142 extract_meta_content(document, "name", "robots")
143}
144
145fn extract_og_tag(document: &Html, property: &str) -> Option<String> {
147 extract_meta_content(document, "property", property)
148}
149
150fn extract_canonical_url(document: &Html) -> Option<String> {
152 let selector = Selector::parse("link[rel='canonical']").ok()?;
153 document
154 .select(&selector)
155 .next()
156 .and_then(|el| el.value().attr("href"))
157 .map(|s| s.to_string())
158}
159
160fn extract_meta_content(document: &Html, attr_name: &str, attr_value: &str) -> Option<String> {
162 let selector_str = format!("meta[{}='{}']", attr_name, attr_value);
163 let selector = Selector::parse(&selector_str).ok()?;
164
165 document
166 .select(&selector)
167 .next()
168 .and_then(|el| el.value().attr("content"))
169 .map(|s| s.trim().to_string())
170 .filter(|s| !s.is_empty())
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses an HTML snippet into a document for the extractors under test.
    fn parse(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_title() {
        let doc = parse("<html><head><title>Test Page</title></head></html>");
        assert_eq!(extract_title(&doc).as_deref(), Some("Test Page"));
    }

    #[test]
    fn test_extract_description() {
        let doc = parse(
            r#"<html><head><meta name="description" content="Test description"></head></html>"#,
        );
        assert_eq!(extract_description(&doc).as_deref(), Some("Test description"));
    }

    #[test]
    fn test_extract_description_og_fallback() {
        let doc = parse(
            r#"<html><head><meta property="og:description" content="OG desc"></head></html>"#,
        );
        assert_eq!(extract_description(&doc).as_deref(), Some("OG desc"));
    }

    #[test]
    fn test_extract_description_twitter_fallback() {
        let doc = parse(
            r#"<html><head><meta name="twitter:description" content="Twitter desc"></head></html>"#,
        );
        assert_eq!(extract_description(&doc).as_deref(), Some("Twitter desc"));
    }

    #[test]
    fn test_extract_description_prefers_standard() {
        // Both tags are present; the standard description must win.
        let doc = parse(
            r#"<html><head>
            <meta name="description" content="Standard desc">
            <meta property="og:description" content="OG desc">
            </head></html>"#,
        );
        assert_eq!(extract_description(&doc).as_deref(), Some("Standard desc"));
    }

    #[test]
    fn test_extract_og_tags() {
        let doc = parse(
            r#"
            <html>
                <head>
                    <meta property="og:title" content="OG Title">
                    <meta property="og:description" content="OG Description">
                    <meta property="og:image" content="https://example.com/image.jpg">
                </head>
            </html>
        "#,
        );

        let expectations = [
            ("og:title", "OG Title"),
            ("og:description", "OG Description"),
            ("og:image", "https://example.com/image.jpg"),
        ];
        for (property, expected) in expectations.iter() {
            assert_eq!(extract_og_tag(&doc, property).as_deref(), Some(*expected));
        }
    }

    #[test]
    fn test_extract_canonical_url() {
        let doc = parse(
            r#"<html><head><link rel="canonical" href="https://example.com/canonical"></head></html>"#,
        );
        assert_eq!(
            extract_canonical_url(&doc).as_deref(),
            Some("https://example.com/canonical")
        );
    }

    #[test]
    fn test_extract_language() {
        let doc = parse(r#"<html lang="en-US"><head></head></html>"#);
        assert_eq!(extract_language(&doc).as_deref(), Some("en-US"));
    }
}