1use scraper::Html;
7use std::time::Instant;
8use url::Url;
9
10use crate::content::{
11 extract_headings, extract_paragraphs, extract_lists,
12 extract_tables, extract_code_blocks, extract_quotes, extract_images,
13};
14use crate::links::extract_links;
15use crate::metadata::{extract_metadata, extract_structured_data};
16use crate::text::extract_text;
17use crate::types::{
18 ParsedContent, PageMetadata, TextContent, Heading, Link, Image,
19 ListContent, TableContent, CodeBlock, Quote, StructuredData,
20 ParseStats, ParserConfig, ParserResult,
21};
22
23#[derive(Debug, Clone)]
46pub struct HtmlParser {
47 config: ParserConfig,
48}
49
50impl HtmlParser {
51 pub fn new() -> Self {
53 Self {
54 config: ParserConfig::default(),
55 }
56 }
57
58 pub fn with_config(config: ParserConfig) -> Self {
60 Self { config }
61 }
62
63 pub fn with_base_url(url: &str) -> ParserResult<Self> {
65 let parsed_url = Url::parse(url)?;
66 Ok(Self {
67 config: ParserConfig {
68 base_url: Some(parsed_url),
69 ..Default::default()
70 },
71 })
72 }
73
74 pub fn set_base_url(&mut self, url: &str) -> ParserResult<()> {
76 self.config.base_url = Some(Url::parse(url)?);
77 Ok(())
78 }
79
80 pub fn config(&self) -> &ParserConfig {
82 &self.config
83 }
84
85 pub fn config_mut(&mut self) -> &mut ParserConfig {
87 &mut self.config
88 }
89
90 pub fn parse(&self, html: &str) -> ParserResult<ParsedContent> {
96 let start = Instant::now();
97 let html_size = html.len();
98
99 let document = Html::parse_document(html);
101
102 let mut stats = ParseStats {
104 html_size,
105 ..Default::default()
106 };
107
108 stats.node_count = document.tree.nodes().count();
110
111 let metadata = extract_metadata(&document, self.config.base_url.as_ref())?;
113 let text = extract_text(&document, &self.config)?;
114 let headings = extract_headings(&document)?;
115 let paragraphs = extract_paragraphs(&document, &self.config)?;
116
117 let links = if self.config.extract_links {
118 extract_links(&document, &self.config)?
119 } else {
120 Vec::new()
121 };
122
123 let images = if self.config.extract_images {
124 extract_images(&document, self.config.base_url.as_ref())?
125 } else {
126 Vec::new()
127 };
128
129 let lists = extract_lists(&document)?;
130
131 let tables = if self.config.extract_tables {
132 extract_tables(&document)?
133 } else {
134 Vec::new()
135 };
136
137 let code_blocks = if self.config.extract_code_blocks {
138 extract_code_blocks(&document)?
139 } else {
140 Vec::new()
141 };
142
143 let quotes = extract_quotes(&document)?;
144
145 let structured_data = if self.config.extract_structured_data {
146 extract_structured_data(&document)
147 } else {
148 Vec::new()
149 };
150
151 stats.parse_time_us = start.elapsed().as_micros() as u64;
153
154 Ok(ParsedContent {
155 metadata,
156 text,
157 headings,
158 paragraphs,
159 links,
160 images,
161 lists,
162 tables,
163 code_blocks,
164 quotes,
165 structured_data,
166 stats,
167 })
168 }
169
170 pub fn parse_fragment(&self, html: &str) -> ParserResult<ParsedContent> {
172 let start = Instant::now();
173
174 let wrapped = format!("<body>{}</body>", html);
176 let document = Html::parse_fragment(&wrapped);
177
178 let mut stats = ParseStats {
179 html_size: html.len(),
180 node_count: document.tree.nodes().count(),
181 ..Default::default()
182 };
183
184 let text = extract_text(&document, &self.config)?;
185 let headings = extract_headings(&document)?;
186 let paragraphs = extract_paragraphs(&document, &self.config)?;
187 let links = extract_links(&document, &self.config)?;
188 let images = extract_images(&document, self.config.base_url.as_ref())?;
189 let lists = extract_lists(&document)?;
190 let tables = extract_tables(&document)?;
191 let code_blocks = extract_code_blocks(&document)?;
192 let quotes = extract_quotes(&document)?;
193
194 stats.parse_time_us = start.elapsed().as_micros() as u64;
195
196 Ok(ParsedContent {
197 metadata: PageMetadata::default(),
198 text,
199 headings,
200 paragraphs,
201 links,
202 images,
203 lists,
204 tables,
205 code_blocks,
206 quotes,
207 structured_data: Vec::new(),
208 stats,
209 })
210 }
211
212 pub fn extract_metadata(&self, html: &str) -> ParserResult<PageMetadata> {
218 let document = Html::parse_document(html);
219 extract_metadata(&document, self.config.base_url.as_ref())
220 }
221
222 pub fn extract_text(&self, html: &str) -> ParserResult<TextContent> {
224 let document = Html::parse_document(html);
225 extract_text(&document, &self.config)
226 }
227
228 pub fn extract_headings(&self, html: &str) -> ParserResult<Vec<Heading>> {
230 let document = Html::parse_document(html);
231 extract_headings(&document)
232 }
233
234 pub fn extract_links(&self, html: &str) -> ParserResult<Vec<Link>> {
236 let document = Html::parse_document(html);
237 extract_links(&document, &self.config)
238 }
239
240 pub fn extract_images(&self, html: &str) -> ParserResult<Vec<Image>> {
242 let document = Html::parse_document(html);
243 extract_images(&document, self.config.base_url.as_ref())
244 }
245
246 pub fn extract_lists(&self, html: &str) -> ParserResult<Vec<ListContent>> {
248 let document = Html::parse_document(html);
249 extract_lists(&document)
250 }
251
252 pub fn extract_tables(&self, html: &str) -> ParserResult<Vec<TableContent>> {
254 let document = Html::parse_document(html);
255 extract_tables(&document)
256 }
257
258 pub fn extract_code_blocks(&self, html: &str) -> ParserResult<Vec<CodeBlock>> {
260 let document = Html::parse_document(html);
261 extract_code_blocks(&document)
262 }
263
264 pub fn extract_quotes(&self, html: &str) -> ParserResult<Vec<Quote>> {
266 let document = Html::parse_document(html);
267 extract_quotes(&document)
268 }
269
270 pub fn extract_structured_data(&self, html: &str) -> Vec<StructuredData> {
272 let document = Html::parse_document(html);
273 extract_structured_data(&document)
274 }
275
276 pub fn resolve_url(&self, href: &str) -> Option<String> {
282 let trimmed = href.trim();
283
284 if trimmed.is_empty() {
285 return None;
286 }
287
288 if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
290 return Some(trimmed.to_string());
291 }
292
293 if trimmed.starts_with("//") {
295 return Some(format!("https:{}", trimmed));
296 }
297
298 self.config.base_url.as_ref()
300 .and_then(|base| base.join(trimmed).ok())
301 .map(|u| u.to_string())
302 }
303
304 pub fn has_base_url(&self) -> bool {
306 self.config.base_url.is_some()
307 }
308
309 pub fn base_url(&self) -> Option<&Url> {
311 self.config.base_url.as_ref()
312 }
313}
314
315impl Default for HtmlParser {
316 fn default() -> Self {
317 Self::new()
318 }
319}
320
321pub fn parse(html: &str) -> ParserResult<ParsedContent> {
327 HtmlParser::new().parse(html)
328}
329
330pub fn parse_with_url(html: &str, base_url: &str) -> ParserResult<ParsedContent> {
332 HtmlParser::with_base_url(base_url)?.parse(html)
333}
334
335pub fn get_metadata(html: &str) -> ParserResult<PageMetadata> {
337 HtmlParser::new().extract_metadata(html)
338}
339
340pub fn get_text(html: &str) -> ParserResult<TextContent> {
342 HtmlParser::new().extract_text(html)
343}
344
345pub fn get_links(html: &str) -> ParserResult<Vec<Link>> {
347 HtmlParser::new().extract_links(html)
348}
349
#[cfg(test)]
mod tests {
    use super::*;

    /// Small but complete document exercising every extractor used below.
    const SAMPLE_HTML: &str = r#"
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Test Page</title>
            <meta name="description" content="A test page for parsing">
            <meta property="og:title" content="OG Test Page">
            <link rel="canonical" href="https://example.com/test">
        </head>
        <body>
            <header><nav>Navigation</nav></header>
            <main>
                <article>
                    <h1>Main Title</h1>
                    <p>This is the first paragraph of the article content.</p>
                    <h2>Section One</h2>
                    <p>Another paragraph with more detailed information.</p>
                    <ul>
                        <li>Item 1</li>
                        <li>Item 2</li>
                    </ul>
                    <a href="/internal">Internal Link</a>
                    <a href="https://external.com" rel="nofollow">External Link</a>
                    <img src="/image.jpg" alt="Test Image">
                    <pre><code class="language-rust">fn main() {}</code></pre>
                </article>
            </main>
            <footer>Footer content</footer>
        </body>
        </html>
    "#;

    #[test]
    fn test_parser_new() {
        let parser = HtmlParser::new();
        assert!(!parser.has_base_url());
    }

    #[test]
    fn test_parser_with_base_url() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        assert!(parser.has_base_url());
        assert_eq!(parser.base_url().unwrap().host_str(), Some("example.com"));
    }

    #[test]
    fn test_parser_set_base_url() {
        let mut parser = HtmlParser::new();
        parser.set_base_url("https://example.com").unwrap();
        assert!(parser.has_base_url());
    }

    #[test]
    fn test_full_parse() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        let result = parser.parse(SAMPLE_HTML).unwrap();

        // Document-level metadata.
        assert_eq!(result.metadata.title, Some("Test Page".to_string()));
        assert_eq!(
            result.metadata.description,
            Some("A test page for parsing".to_string())
        );
        assert!(result.metadata.opengraph.is_present());

        // Body content.
        assert!(!result.headings.is_empty());
        assert!(!result.paragraphs.is_empty());
        assert!(!result.lists.is_empty());
        assert!(!result.links.is_empty());
        assert!(!result.images.is_empty());
        assert!(!result.code_blocks.is_empty());

        // Statistics.
        assert!(result.stats.html_size > 0);
        assert!(result.stats.parse_time_us > 0);
    }

    #[test]
    fn test_extract_metadata_only() {
        let parser = HtmlParser::new();
        let metadata = parser.extract_metadata(SAMPLE_HTML).unwrap();

        assert_eq!(metadata.title, Some("Test Page".to_string()));
        assert_eq!(metadata.language, Some("en".to_string()));
    }

    #[test]
    fn test_extract_text_only() {
        let parser = HtmlParser::new();
        let text = parser.extract_text(SAMPLE_HTML).unwrap();

        assert!(text.word_count > 0);
        assert!(text.cleaned_text.contains("Main Title"));
    }

    #[test]
    fn test_extract_links_only() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        let links = parser.extract_links(SAMPLE_HTML).unwrap();

        assert_eq!(links.len(), 2);

        let internal = links.iter().find(|l| l.href == "/internal").unwrap();
        assert_eq!(
            internal.url,
            Some("https://example.com/internal".to_string())
        );

        let external = links
            .iter()
            .find(|l| l.href == "https://external.com")
            .unwrap();
        assert!(external.is_nofollow);
    }

    #[test]
    fn test_extract_images_only() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        let images = parser.extract_images(SAMPLE_HTML).unwrap();

        assert_eq!(images.len(), 1);
        assert_eq!(images[0].alt, "Test Image");
        assert_eq!(
            images[0].url,
            Some("https://example.com/image.jpg".to_string())
        );
    }

    #[test]
    fn test_parse_fragment() {
        let fragment = "<p>Hello <strong>world</strong></p>";
        let parser = HtmlParser::new();
        let result = parser.parse_fragment(fragment).unwrap();

        // Fragments never carry document-level structured data.
        assert!(result.structured_data.is_empty());
        // The reported size is that of the caller's input, not of the
        // internally wrapped markup.
        assert_eq!(result.stats.html_size, fragment.len());
        // The parsed tree always contains at least its root node.
        assert!(result.stats.node_count > 0);
    }

    #[test]
    fn test_resolve_url() {
        let parser = HtmlParser::with_base_url("https://example.com/dir/").unwrap();

        assert_eq!(
            parser.resolve_url("page.html"),
            Some("https://example.com/dir/page.html".to_string())
        );

        assert_eq!(
            parser.resolve_url("/absolute"),
            Some("https://example.com/absolute".to_string())
        );

        assert_eq!(
            parser.resolve_url("https://other.com"),
            Some("https://other.com".to_string())
        );
    }

    #[test]
    fn test_resolve_url_edge_cases() {
        let parser = HtmlParser::with_base_url("https://example.com/dir/").unwrap();

        // Blank input resolves to nothing.
        assert_eq!(parser.resolve_url(""), None);
        assert_eq!(parser.resolve_url("   "), None);

        // Protocol-relative reference against an https base.
        assert_eq!(
            parser.resolve_url("//cdn.example.com/app.js"),
            Some("https://cdn.example.com/app.js".to_string())
        );

        // A relative reference cannot be resolved without a base URL.
        assert_eq!(HtmlParser::new().resolve_url("page.html"), None);
    }

    #[test]
    fn test_convenience_parse() {
        let result = parse(SAMPLE_HTML).unwrap();
        assert!(result.metadata.title.is_some());
    }

    #[test]
    fn test_convenience_parse_with_url() {
        let result = parse_with_url(SAMPLE_HTML, "https://example.com").unwrap();
        assert!(result.metadata.title.is_some());
    }

    #[test]
    fn test_convenience_get_metadata() {
        let metadata = get_metadata(SAMPLE_HTML).unwrap();
        assert_eq!(metadata.title, Some("Test Page".to_string()));
    }

    #[test]
    fn test_convenience_get_text() {
        let text = get_text(SAMPLE_HTML).unwrap();
        assert!(text.word_count > 0);
    }

    #[test]
    fn test_convenience_get_links() {
        let links = get_links(SAMPLE_HTML).unwrap();
        assert!(!links.is_empty());
    }

    #[test]
    fn test_parser_with_minimal_config() {
        let config = ParserConfig::minimal();
        let parser = HtmlParser::with_config(config);
        let result = parser.parse(SAMPLE_HTML).unwrap();

        // Always-on extraction still produces output.
        assert!(result.metadata.title.is_some());
        assert!(result.text.word_count > 0);

        // Optional extraction is skipped by the minimal configuration.
        assert!(result.images.is_empty());
        assert!(result.tables.is_empty());
    }

    #[test]
    fn test_malformed_html() {
        let parser = HtmlParser::new();
        let result = parser.parse("<p>Unclosed paragraph <div>Mixed</p></div>");

        // Malformed markup must be tolerated rather than reported as an error.
        assert!(result.is_ok());
    }

    #[test]
    fn test_empty_html() {
        let parser = HtmlParser::new();
        let result = parser.parse("").unwrap();

        assert!(result.metadata.title.is_none());
        assert!(result.text.is_empty());
    }

    #[test]
    fn test_parser_default() {
        let parser = HtmlParser::default();
        assert!(!parser.has_base_url());
    }

    #[test]
    fn test_config_mutation() {
        let mut parser = HtmlParser::new();
        parser.config_mut().extract_images = false;

        let result = parser.parse(SAMPLE_HTML).unwrap();
        assert!(result.images.is_empty());
    }
}