// halldyll_parser/parser.rs

1//! Main HTML parser API for halldyll-parser
2//!
3//! This module provides the primary `HtmlParser` struct that orchestrates
4//! all parsing operations and provides a clean, unified API.
5
6use scraper::Html;
7use std::time::Instant;
8use url::Url;
9
10use crate::content::{
11    extract_headings, extract_paragraphs, extract_lists,
12    extract_tables, extract_code_blocks, extract_quotes, extract_images,
13};
14use crate::links::extract_links;
15use crate::metadata::{extract_metadata, extract_structured_data};
16use crate::text::extract_text;
17use crate::types::{
18    ParsedContent, PageMetadata, TextContent, Heading, Link, Image,
19    ListContent, TableContent, CodeBlock, Quote, StructuredData,
20    ParseStats, ParserConfig, ParserResult,
21};
22
23// ============================================================================
24// HTML PARSER
25// ============================================================================
26
/// Main HTML parser
///
/// Orchestrates all extraction passes (metadata, text, headings, paragraphs,
/// links, images, lists, tables, code blocks, quotes, structured data)
/// behind a single configurable entry point. Behavior is controlled by the
/// wrapped [`ParserConfig`].
/// 
/// # Example
/// ```rust
/// use halldyll_parser::HtmlParser;
/// 
/// let html = r#"
///     <html>
///     <head><title>Test</title></head>
///     <body><p>Hello world</p></body>
///     </html>
/// "#;
/// 
/// let parser = HtmlParser::new();
/// let result = parser.parse(html).unwrap();
/// 
/// println!("Title: {:?}", result.metadata.title);
/// ```
#[derive(Debug, Clone)]
pub struct HtmlParser {
    // Parser settings (feature toggles, optional base URL for URL resolution).
    config: ParserConfig,
}
49
50impl HtmlParser {
51    /// Create a new parser with default configuration
52    pub fn new() -> Self {
53        Self {
54            config: ParserConfig::default(),
55        }
56    }
57
58    /// Create a parser with custom configuration
59    pub fn with_config(config: ParserConfig) -> Self {
60        Self { config }
61    }
62
63    /// Create a parser with a base URL
64    pub fn with_base_url(url: &str) -> ParserResult<Self> {
65        let parsed_url = Url::parse(url)?;
66        Ok(Self {
67            config: ParserConfig {
68                base_url: Some(parsed_url),
69                ..Default::default()
70            },
71        })
72    }
73
74    /// Set the base URL for resolving relative URLs
75    pub fn set_base_url(&mut self, url: &str) -> ParserResult<()> {
76        self.config.base_url = Some(Url::parse(url)?);
77        Ok(())
78    }
79
80    /// Get the current configuration
81    pub fn config(&self) -> &ParserConfig {
82        &self.config
83    }
84
85    /// Get mutable configuration
86    pub fn config_mut(&mut self) -> &mut ParserConfig {
87        &mut self.config
88    }
89
90    // ========================================================================
91    // MAIN PARSE METHODS
92    // ========================================================================
93
94    /// Parse HTML and extract all content
95    pub fn parse(&self, html: &str) -> ParserResult<ParsedContent> {
96        let start = Instant::now();
97        let html_size = html.len();
98        
99        // Parse HTML document
100        let document = Html::parse_document(html);
101        
102        // Initialize stats
103        let mut stats = ParseStats {
104            html_size,
105            ..Default::default()
106        };
107
108        // Count nodes
109        stats.node_count = document.tree.nodes().count();
110        
111        // Extract all content
112        let metadata = extract_metadata(&document, self.config.base_url.as_ref())?;
113        let text = extract_text(&document, &self.config)?;
114        let headings = extract_headings(&document)?;
115        let paragraphs = extract_paragraphs(&document, &self.config)?;
116        
117        let links = if self.config.extract_links {
118            extract_links(&document, &self.config)?
119        } else {
120            Vec::new()
121        };
122        
123        let images = if self.config.extract_images {
124            extract_images(&document, self.config.base_url.as_ref())?
125        } else {
126            Vec::new()
127        };
128        
129        let lists = extract_lists(&document)?;
130        
131        let tables = if self.config.extract_tables {
132            extract_tables(&document)?
133        } else {
134            Vec::new()
135        };
136        
137        let code_blocks = if self.config.extract_code_blocks {
138            extract_code_blocks(&document)?
139        } else {
140            Vec::new()
141        };
142        
143        let quotes = extract_quotes(&document)?;
144        
145        let structured_data = if self.config.extract_structured_data {
146            extract_structured_data(&document)
147        } else {
148            Vec::new()
149        };
150        
151        // Finalize stats
152        stats.parse_time_us = start.elapsed().as_micros() as u64;
153        
154        Ok(ParsedContent {
155            metadata,
156            text,
157            headings,
158            paragraphs,
159            links,
160            images,
161            lists,
162            tables,
163            code_blocks,
164            quotes,
165            structured_data,
166            stats,
167        })
168    }
169
170    /// Parse HTML fragment (not a full document)
171    pub fn parse_fragment(&self, html: &str) -> ParserResult<ParsedContent> {
172        let start = Instant::now();
173        
174        // Wrap in body for consistent parsing
175        let wrapped = format!("<body>{}</body>", html);
176        let document = Html::parse_fragment(&wrapped);
177        
178        let mut stats = ParseStats {
179            html_size: html.len(),
180            node_count: document.tree.nodes().count(),
181            ..Default::default()
182        };
183
184        let text = extract_text(&document, &self.config)?;
185        let headings = extract_headings(&document)?;
186        let paragraphs = extract_paragraphs(&document, &self.config)?;
187        let links = extract_links(&document, &self.config)?;
188        let images = extract_images(&document, self.config.base_url.as_ref())?;
189        let lists = extract_lists(&document)?;
190        let tables = extract_tables(&document)?;
191        let code_blocks = extract_code_blocks(&document)?;
192        let quotes = extract_quotes(&document)?;
193        
194        stats.parse_time_us = start.elapsed().as_micros() as u64;
195        
196        Ok(ParsedContent {
197            metadata: PageMetadata::default(),
198            text,
199            headings,
200            paragraphs,
201            links,
202            images,
203            lists,
204            tables,
205            code_blocks,
206            quotes,
207            structured_data: Vec::new(),
208            stats,
209        })
210    }
211
212    // ========================================================================
213    // INDIVIDUAL EXTRACTION METHODS
214    // ========================================================================
215
216    /// Extract only metadata
217    pub fn extract_metadata(&self, html: &str) -> ParserResult<PageMetadata> {
218        let document = Html::parse_document(html);
219        extract_metadata(&document, self.config.base_url.as_ref())
220    }
221
222    /// Extract only text content
223    pub fn extract_text(&self, html: &str) -> ParserResult<TextContent> {
224        let document = Html::parse_document(html);
225        extract_text(&document, &self.config)
226    }
227
228    /// Extract only headings
229    pub fn extract_headings(&self, html: &str) -> ParserResult<Vec<Heading>> {
230        let document = Html::parse_document(html);
231        extract_headings(&document)
232    }
233
234    /// Extract only links
235    pub fn extract_links(&self, html: &str) -> ParserResult<Vec<Link>> {
236        let document = Html::parse_document(html);
237        extract_links(&document, &self.config)
238    }
239
240    /// Extract only images
241    pub fn extract_images(&self, html: &str) -> ParserResult<Vec<Image>> {
242        let document = Html::parse_document(html);
243        extract_images(&document, self.config.base_url.as_ref())
244    }
245
246    /// Extract only lists
247    pub fn extract_lists(&self, html: &str) -> ParserResult<Vec<ListContent>> {
248        let document = Html::parse_document(html);
249        extract_lists(&document)
250    }
251
252    /// Extract only tables
253    pub fn extract_tables(&self, html: &str) -> ParserResult<Vec<TableContent>> {
254        let document = Html::parse_document(html);
255        extract_tables(&document)
256    }
257
258    /// Extract only code blocks
259    pub fn extract_code_blocks(&self, html: &str) -> ParserResult<Vec<CodeBlock>> {
260        let document = Html::parse_document(html);
261        extract_code_blocks(&document)
262    }
263
264    /// Extract only quotes
265    pub fn extract_quotes(&self, html: &str) -> ParserResult<Vec<Quote>> {
266        let document = Html::parse_document(html);
267        extract_quotes(&document)
268    }
269
270    /// Extract only structured data
271    pub fn extract_structured_data(&self, html: &str) -> Vec<StructuredData> {
272        let document = Html::parse_document(html);
273        extract_structured_data(&document)
274    }
275
276    // ========================================================================
277    // UTILITY METHODS
278    // ========================================================================
279
280    /// Resolve a relative URL to absolute using the parser's base URL
281    pub fn resolve_url(&self, href: &str) -> Option<String> {
282        let trimmed = href.trim();
283        
284        if trimmed.is_empty() {
285            return None;
286        }
287        
288        // Already absolute
289        if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
290            return Some(trimmed.to_string());
291        }
292        
293        // Protocol-relative
294        if trimmed.starts_with("//") {
295            return Some(format!("https:{}", trimmed));
296        }
297        
298        // Resolve relative
299        self.config.base_url.as_ref()
300            .and_then(|base| base.join(trimmed).ok())
301            .map(|u| u.to_string())
302    }
303
304    /// Check if the parser has a base URL configured
305    pub fn has_base_url(&self) -> bool {
306        self.config.base_url.is_some()
307    }
308
309    /// Get the base URL if configured
310    pub fn base_url(&self) -> Option<&Url> {
311        self.config.base_url.as_ref()
312    }
313}
314
315impl Default for HtmlParser {
316    fn default() -> Self {
317        Self::new()
318    }
319}
320
321// ============================================================================
322// CONVENIENCE FUNCTIONS
323// ============================================================================
324
325/// Parse HTML with default settings (convenience function)
326pub fn parse(html: &str) -> ParserResult<ParsedContent> {
327    HtmlParser::new().parse(html)
328}
329
330/// Parse HTML with a base URL (convenience function)
331pub fn parse_with_url(html: &str, base_url: &str) -> ParserResult<ParsedContent> {
332    HtmlParser::with_base_url(base_url)?.parse(html)
333}
334
335/// Quick metadata extraction (convenience function)
336pub fn get_metadata(html: &str) -> ParserResult<PageMetadata> {
337    HtmlParser::new().extract_metadata(html)
338}
339
340/// Quick text extraction (convenience function)
341pub fn get_text(html: &str) -> ParserResult<TextContent> {
342    HtmlParser::new().extract_text(html)
343}
344
345/// Quick link extraction (convenience function)
346pub fn get_links(html: &str) -> ParserResult<Vec<Link>> {
347    HtmlParser::new().extract_links(html)
348}
349
350// ============================================================================
351// TESTS
352// ============================================================================
353
#[cfg(test)]
mod tests {
    use super::*;

    // Shared fixture: a well-formed document exercising metadata (title,
    // description, OpenGraph, canonical), headings, paragraphs, a list,
    // internal/external links, an image, and a fenced code block.
    const SAMPLE_HTML: &str = r#"
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Test Page</title>
            <meta name="description" content="A test page for parsing">
            <meta property="og:title" content="OG Test Page">
            <link rel="canonical" href="https://example.com/test">
        </head>
        <body>
            <header><nav>Navigation</nav></header>
            <main>
                <article>
                    <h1>Main Title</h1>
                    <p>This is the first paragraph of the article content.</p>
                    <h2>Section One</h2>
                    <p>Another paragraph with more detailed information.</p>
                    <ul>
                        <li>Item 1</li>
                        <li>Item 2</li>
                    </ul>
                    <a href="/internal">Internal Link</a>
                    <a href="https://external.com" rel="nofollow">External Link</a>
                    <img src="/image.jpg" alt="Test Image">
                    <pre><code class="language-rust">fn main() {}</code></pre>
                </article>
            </main>
            <footer>Footer content</footer>
        </body>
        </html>
    "#;

    // A fresh parser has no base URL configured.
    #[test]
    fn test_parser_new() {
        let parser = HtmlParser::new();
        assert!(!parser.has_base_url());
    }

    // with_base_url stores the parsed URL and exposes it via base_url().
    #[test]
    fn test_parser_with_base_url() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        assert!(parser.has_base_url());
        assert_eq!(parser.base_url().unwrap().host_str(), Some("example.com"));
    }

    // set_base_url configures the base URL after construction.
    #[test]
    fn test_parser_set_base_url() {
        let mut parser = HtmlParser::new();
        parser.set_base_url("https://example.com").unwrap();
        assert!(parser.has_base_url());
    }

    // End-to-end parse: every extraction category plus stats are populated.
    #[test]
    fn test_full_parse() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        let result = parser.parse(SAMPLE_HTML).unwrap();
        
        // Metadata
        assert_eq!(result.metadata.title, Some("Test Page".to_string()));
        assert_eq!(result.metadata.description, Some("A test page for parsing".to_string()));
        assert!(result.metadata.opengraph.is_present());
        
        // Content
        assert!(!result.headings.is_empty());
        assert!(!result.paragraphs.is_empty());
        assert!(!result.lists.is_empty());
        assert!(!result.links.is_empty());
        assert!(!result.images.is_empty());
        assert!(!result.code_blocks.is_empty());
        
        // Stats
        assert!(result.stats.html_size > 0);
        assert!(result.stats.parse_time_us > 0);
    }

    // Metadata-only extraction picks up <title> and the html lang attribute.
    #[test]
    fn test_extract_metadata_only() {
        let parser = HtmlParser::new();
        let metadata = parser.extract_metadata(SAMPLE_HTML).unwrap();
        
        assert_eq!(metadata.title, Some("Test Page".to_string()));
        assert_eq!(metadata.language, Some("en".to_string()));
    }

    // Text-only extraction counts words and includes heading text.
    #[test]
    fn test_extract_text_only() {
        let parser = HtmlParser::new();
        let text = parser.extract_text(SAMPLE_HTML).unwrap();
        
        assert!(text.word_count > 0);
        assert!(text.cleaned_text.contains("Main Title"));
    }

    // Link extraction resolves relative hrefs and detects rel="nofollow".
    #[test]
    fn test_extract_links_only() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        let links = parser.extract_links(SAMPLE_HTML).unwrap();
        
        assert_eq!(links.len(), 2);
        
        // Check internal link
        let internal = links.iter().find(|l| l.href == "/internal").unwrap();
        assert_eq!(internal.url, Some("https://example.com/internal".to_string()));
        
        // Check external link
        let external = links.iter().find(|l| l.href == "https://external.com").unwrap();
        assert!(external.is_nofollow);
    }

    // Image extraction captures alt text and resolves the src against base.
    #[test]
    fn test_extract_images_only() {
        let parser = HtmlParser::with_base_url("https://example.com").unwrap();
        let images = parser.extract_images(SAMPLE_HTML).unwrap();
        
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].alt, "Test Image");
        assert_eq!(images[0].url, Some("https://example.com/image.jpg".to_string()));
    }

    // Fragment parsing must not error on body-less snippets.
    #[test]
    fn test_parse_fragment() {
        let parser = HtmlParser::new();
        let result = parser.parse_fragment("<p>Hello <strong>world</strong></p>").unwrap();
        
        // Fragment parsing - just verify it parses without error
        // Text extraction may be empty for fragments without body
        let _ = result.paragraphs; // Use result to validate parsing works
    }

    // resolve_url: relative path, root-relative path, and already-absolute URL.
    #[test]
    fn test_resolve_url() {
        let parser = HtmlParser::with_base_url("https://example.com/dir/").unwrap();
        
        assert_eq!(
            parser.resolve_url("page.html"),
            Some("https://example.com/dir/page.html".to_string())
        );
        
        assert_eq!(
            parser.resolve_url("/absolute"),
            Some("https://example.com/absolute".to_string())
        );
        
        assert_eq!(
            parser.resolve_url("https://other.com"),
            Some("https://other.com".to_string())
        );
    }

    // Free-function wrapper around HtmlParser::new().parse().
    #[test]
    fn test_convenience_parse() {
        let result = parse(SAMPLE_HTML).unwrap();
        assert!(result.metadata.title.is_some());
    }

    // Free-function wrapper that also sets the base URL.
    #[test]
    fn test_convenience_parse_with_url() {
        let result = parse_with_url(SAMPLE_HTML, "https://example.com").unwrap();
        assert!(result.metadata.title.is_some());
    }

    #[test]
    fn test_convenience_get_metadata() {
        let metadata = get_metadata(SAMPLE_HTML).unwrap();
        assert_eq!(metadata.title, Some("Test Page".to_string()));
    }

    #[test]
    fn test_convenience_get_text() {
        let text = get_text(SAMPLE_HTML).unwrap();
        assert!(text.word_count > 0);
    }

    #[test]
    fn test_convenience_get_links() {
        let links = get_links(SAMPLE_HTML).unwrap();
        assert!(!links.is_empty());
    }

    // Minimal config keeps metadata/text but disables optional categories.
    #[test]
    fn test_parser_with_minimal_config() {
        let config = ParserConfig::minimal();
        let parser = HtmlParser::with_config(config);
        let result = parser.parse(SAMPLE_HTML).unwrap();
        
        // Should still extract metadata and text
        assert!(result.metadata.title.is_some());
        assert!(result.text.word_count > 0);
        
        // But not images/tables/etc
        assert!(result.images.is_empty());
        assert!(result.tables.is_empty());
    }

    // html5ever recovers from malformed markup instead of erroring.
    #[test]
    fn test_malformed_html() {
        let parser = HtmlParser::new();
        let result = parser.parse("<p>Unclosed paragraph <div>Mixed</p></div>");
        
        // Should still parse without error
        assert!(result.is_ok());
    }

    // Empty input yields empty (but valid) parsed content.
    #[test]
    fn test_empty_html() {
        let parser = HtmlParser::new();
        let result = parser.parse("").unwrap();
        
        assert!(result.metadata.title.is_none());
        assert!(result.text.is_empty());
    }

    // Default impl mirrors HtmlParser::new().
    #[test]
    fn test_parser_default() {
        let parser = HtmlParser::default();
        assert!(!parser.has_base_url());
    }

    // config_mut allows toggling extraction flags after construction.
    #[test]
    fn test_config_mutation() {
        let mut parser = HtmlParser::new();
        parser.config_mut().extract_images = false;
        
        let result = parser.parse(SAMPLE_HTML).unwrap();
        assert!(result.images.is_empty());
    }
}