1use std::collections::{HashMap, HashSet};
4use std::fs;
5use std::path::Path;
6use std::sync::OnceLock;
7
8use scraper::{Html, Selector};
9use serde::{Deserialize, Serialize};
10use url::Url;
11
12use crate::error::Result;
13use crate::opengraph::Opengraph;
14use crate::schema_org::SchemaOrg;
15
/// MIME types that identify a `<link rel="alternate">` element as a feed.
///
/// `application/feed+json` is the registered JSON Feed media type; plain
/// `application/json` is kept as well for feeds served with a generic type.
const FEED_MIME_TYPES: &[&str] = &[
    "application/atom+xml",
    "application/rss+xml",
    "application/feed+json",
    "application/json",
    "application/xml",
    "text/xml",
];
23
24const MAX_LINKS: usize = 10_000;
26const MAX_SCHEMA_ORG_ITEMS: usize = 100;
27const MAX_TEXT_CONTENT_LEN: usize = 1_000_000; fn title_selector() -> &'static Selector {
30 static SELECTOR: OnceLock<Selector> = OnceLock::new();
31 SELECTOR.get_or_init(|| Selector::parse("title").unwrap())
32}
33
34fn html_selector() -> &'static Selector {
35 static SELECTOR: OnceLock<Selector> = OnceLock::new();
36 SELECTOR.get_or_init(|| Selector::parse("html").unwrap())
37}
38
39fn meta_selector() -> &'static Selector {
40 static SELECTOR: OnceLock<Selector> = OnceLock::new();
41 SELECTOR.get_or_init(|| Selector::parse("meta").unwrap())
42}
43
44fn canonical_selector() -> &'static Selector {
45 static SELECTOR: OnceLock<Selector> = OnceLock::new();
46 SELECTOR.get_or_init(|| Selector::parse(r#"link[rel="canonical"]"#).unwrap())
47}
48
49fn feed_selector() -> &'static Selector {
50 static SELECTOR: OnceLock<Selector> = OnceLock::new();
51 SELECTOR.get_or_init(|| Selector::parse(r#"link[rel="alternate"]"#).unwrap())
52}
53
54fn body_selector() -> &'static Selector {
55 static SELECTOR: OnceLock<Selector> = OnceLock::new();
56 SELECTOR.get_or_init(|| Selector::parse("body").unwrap())
57}
58
59fn exclude_selector() -> &'static Selector {
60 static SELECTOR: OnceLock<Selector> = OnceLock::new();
61 SELECTOR.get_or_init(|| Selector::parse("script, style, noscript").unwrap())
62}
63
64fn link_selector() -> &'static Selector {
65 static SELECTOR: OnceLock<Selector> = OnceLock::new();
66 SELECTOR.get_or_init(|| Selector::parse("a[href]").unwrap())
67}
68
69fn schema_org_selector() -> &'static Selector {
70 static SELECTOR: OnceLock<Selector> = OnceLock::new();
71 SELECTOR.get_or_init(|| Selector::parse(r#"script[type="application/ld+json"]"#).unwrap())
72}
73
/// Structured metadata extracted from one HTML document.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct HtmlInfo {
    /// Trimmed text of the first `<title>` element; `None` when absent or empty.
    pub title: Option<String>,

    /// Value of the `description` meta tag, when present.
    pub description: Option<String>,

    /// `href` of the first `<link rel="canonical">`, when present and non-empty.
    pub canonical_url: Option<String>,

    /// `href` of the first `<link rel="alternate">` with a feed MIME type.
    pub feed_url: Option<String>,

    /// Trimmed `lang` attribute of the root `<html>` element, when present.
    pub language: Option<String>,

    /// Visible body text with `<script>`/`<style>`/`<noscript>` content
    /// excluded; capped at `MAX_TEXT_CONTENT_LEN` bytes.
    pub text_content: String,

    /// All meta tag content values, keyed by `property`, `name`, or
    /// `http-equiv` (plus a synthetic `charset` entry for `<meta charset>`).
    pub meta: HashMap<String, String>,

    /// OpenGraph metadata assembled from `og:*` meta tags.
    pub opengraph: Opengraph,

    /// Items parsed from `application/ld+json` script blocks.
    pub schema_org: Vec<SchemaOrg>,

    /// Anchor links found in the document (at most `MAX_LINKS`).
    pub links: Vec<Link>,
}
107
/// A single `<a href>` link extracted from the document.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct Link {
    /// Link target; resolved against the base URL when one was supplied
    /// and resolution succeeded, otherwise the raw (trimmed) `href`.
    pub url: String,

    /// Trimmed text content of the anchor element.
    pub text: String,

    /// Value of the anchor's `rel` attribute, when present.
    pub rel: Option<String>,
}
120
121impl HtmlInfo {
122 pub fn from_string(html: &str, base_url: Option<&str>) -> Result<Self> {
137 let base = base_url.and_then(|u| Url::parse(u).ok());
138 let document = Html::parse_document(html);
139 Ok(Self::extract(&document, base.as_ref()))
140 }
141
142 pub fn from_file(path: impl AsRef<Path>, base_url: Option<&str>) -> Result<Self> {
148 let content = fs::read_to_string(path)?;
149 Self::from_string(&content, base_url)
150 }
151
152 fn extract(document: &Html, base_url: Option<&Url>) -> Self {
154 let mut info = Self {
155 title: Self::extract_title(document),
156 language: Self::extract_language(document),
157 canonical_url: Self::extract_canonical(document),
158 feed_url: Self::extract_feed(document),
159 text_content: Self::extract_text_content(document),
160 links: Self::extract_links(document, base_url),
161 schema_org: Self::extract_schema_org(document),
162 ..Default::default()
163 };
164
165 info.extract_meta_tags(document);
167
168 info
169 }
170
171 fn extract_title(document: &Html) -> Option<String> {
172 document
173 .select(title_selector())
174 .next()
175 .map(|el| el.text().collect::<String>().trim().to_string())
176 .filter(|s| !s.is_empty())
177 }
178
179 fn extract_language(document: &Html) -> Option<String> {
180 document
181 .select(html_selector())
182 .next()
183 .and_then(|el| el.value().attr("lang"))
184 .map(|s| s.trim().to_string())
185 .filter(|s| !s.is_empty())
186 }
187
188 fn extract_meta_tags(&mut self, document: &Html) {
189 for element in document.select(meta_selector()) {
190 let el = element.value();
191
192 let content = match el.attr("content") {
194 Some(c) => c.trim().to_string(),
195 None => {
196 if let Some(charset) = el.attr("charset") {
198 self.meta.insert("charset".to_string(), charset.to_string());
199 }
200 continue;
201 }
202 };
203
204 let property = el
206 .attr("property")
207 .or_else(|| el.attr("name"))
208 .or_else(|| el.attr("http-equiv"));
209
210 if let Some(prop) = property {
211 let prop = prop.trim().to_string();
212 self.meta.insert(prop.clone(), content.clone());
213
214 if let Some(og_prop) = prop.strip_prefix("og:") {
216 self.opengraph.extend(og_prop, content.clone());
217 }
218
219 if prop == "description" {
221 self.description = Some(content);
222 }
223 }
224 }
225 }
226
227 fn extract_canonical(document: &Html) -> Option<String> {
228 document
229 .select(canonical_selector())
230 .next()
231 .and_then(|el| el.value().attr("href"))
232 .map(|s| s.trim().to_string())
233 .filter(|s| !s.is_empty())
234 }
235
236 fn extract_feed(document: &Html) -> Option<String> {
237 for element in document.select(feed_selector()) {
238 let el = element.value();
239 if let Some(link_type) = el.attr("type")
240 && FEED_MIME_TYPES.contains(&link_type)
241 {
242 return el.attr("href").map(|s| s.trim().to_string());
243 }
244 }
245 None
246 }
247
248 fn extract_text_content(document: &Html) -> String {
249 let Some(body) = document.select(body_selector()).next() else {
250 return String::new();
251 };
252
253 let excluded_ids: HashSet<_> = document
255 .select(exclude_selector())
256 .map(|el| el.id())
257 .collect();
258
259 let mut text = String::with_capacity(4096); for node in body.descendants() {
262 if text.len() >= MAX_TEXT_CONTENT_LEN {
264 break;
265 }
266
267 if let Some(text_node) = node.value().as_text() {
268 let is_excluded = node.ancestors().any(|a| excluded_ids.contains(&a.id()));
270
271 if !is_excluded {
272 let trimmed = text_node.trim();
273 if !trimmed.is_empty() {
274 if !text.is_empty() {
275 text.push(' ');
276 }
277 let remaining = MAX_TEXT_CONTENT_LEN.saturating_sub(text.len());
279 if trimmed.len() <= remaining {
280 text.push_str(trimmed);
281 } else {
282 text.push_str(&trimmed[..remaining]);
283 break;
284 }
285 }
286 }
287 }
288 }
289
290 text
291 }
292
293 fn extract_links(document: &Html, base_url: Option<&Url>) -> Vec<Link> {
294 document
295 .select(link_selector())
296 .filter_map(|element| {
297 let href = element.value().attr("href")?;
298 let href = href.trim();
299
300 if href.is_empty() || href.starts_with("javascript:") {
302 return None;
303 }
304
305 let url = if let Some(base) = base_url {
306 base.join(href)
307 .map(|u| u.to_string())
308 .unwrap_or_else(|_| href.to_string())
309 } else {
310 href.to_string()
311 };
312
313 let text = element.text().collect::<String>().trim().to_string();
314 let rel = element.value().attr("rel").map(|s| s.to_string());
315
316 Some(Link { url, text, rel })
317 })
318 .take(MAX_LINKS)
319 .collect()
320 }
321
322 fn extract_schema_org(document: &Html) -> Vec<SchemaOrg> {
323 document
324 .select(schema_org_selector())
325 .flat_map(|element| {
326 let content = element.text().collect::<String>();
327 SchemaOrg::parse(&content)
328 })
329 .take(MAX_SCHEMA_ORG_ITEMS)
330 .collect()
331 }
332}
333
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end check: title, description, language, canonical URL,
    /// OpenGraph fields, visible text, and link resolution.
    #[test]
    fn test_basic_parsing() {
        let html = r#"
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <title>Test Page</title>
                <meta name="description" content="A test page">
                <meta property="og:title" content="OG Title">
                <meta property="og:type" content="article">
                <link rel="canonical" href="https://example.com/test">
            </head>
            <body>
                <p>Hello World</p>
                <a href="/about">About Us</a>
            </body>
            </html>
        "#;

        let page = HtmlInfo::from_string(html, Some("https://example.com/")).unwrap();

        assert_eq!(page.title, Some("Test Page".to_string()));
        assert_eq!(page.description, Some("A test page".to_string()));
        assert_eq!(page.language, Some("en".to_string()));
        assert_eq!(
            page.canonical_url,
            Some("https://example.com/test".to_string())
        );
        assert_eq!(page.opengraph.title, Some("OG Title".to_string()));
        assert_eq!(page.opengraph.og_type, Some("article".to_string()));
        assert!(page.text_content.contains("Hello World"));
        assert_eq!(page.links.len(), 1);
        assert_eq!(page.links[0].url, "https://example.com/about");
        assert_eq!(page.links[0].text, "About Us");
    }

    /// A typed `rel="alternate"` link should be reported as the feed URL,
    /// kept relative when no base URL is given.
    #[test]
    fn test_feed_extraction() {
        let html = r#"
            <html>
            <head>
                <link rel="alternate" type="application/rss+xml" href="/feed.xml">
            </head>
            </html>
        "#;

        let page = HtmlInfo::from_string(html, None).unwrap();
        assert_eq!(page.feed_url, Some("/feed.xml".to_string()));
    }

    /// JSON-LD script blocks should surface as schema.org items.
    #[test]
    fn test_schema_org_extraction() {
        let html = r#"
            <html>
            <head>
                <script type="application/ld+json">
                {"@type": "Article", "headline": "Test Article"}
                </script>
            </head>
            </html>
        "#;

        let page = HtmlInfo::from_string(html, None).unwrap();
        assert_eq!(page.schema_org.len(), 1);
        assert_eq!(page.schema_org[0].schema_type, "Article");
    }

    /// Script and style contents must never leak into the visible text.
    #[test]
    fn test_text_excludes_scripts() {
        let html = r#"
            <html>
            <body>
                <p>Visible text</p>
                <script>console.log('hidden');</script>
                <style>.hidden { display: none; }</style>
                <p>More visible</p>
            </body>
            </html>
        "#;

        let page = HtmlInfo::from_string(html, None).unwrap();
        assert!(page.text_content.contains("Visible text"));
        assert!(page.text_content.contains("More visible"));
        assert!(!page.text_content.contains("console.log"));
        assert!(!page.text_content.contains(".hidden"));
    }
}