halldyll_parser/
content.rs

1//! Content extraction for halldyll-parser
2//!
3//! This module handles extraction of structured content:
4//! - Headings (h1-h6)
5//! - Paragraphs
6//! - Lists (ordered, unordered, definition)
7//! - Tables
8//! - Code blocks
9//! - Blockquotes
10//! - Images
11
12use scraper::{Html, ElementRef};
13use url::Url;
14
15use crate::selector::{SELECTORS, try_parse_selector, heading_selector};
16use crate::types::{
17    Heading, Image, ImageLoading, ListContent, ListType, ListItem,
18    TableContent, TableRow, TableCell, CodeBlock, Quote,
19    ParserConfig, ParserResult,
20};
21
22// ============================================================================
23// HEADINGS
24// ============================================================================
25
26/// Extract all headings from the document
27pub fn extract_headings(document: &Html) -> ParserResult<Vec<Heading>> {
28    let mut headings = Vec::new();
29    
30    for level in 1..=6 {
31        let selector = heading_selector(level);
32        
33        for element in document.select(selector) {
34            let text = element.text().collect::<String>().trim().to_string();
35            
36            if text.is_empty() {
37                continue;
38            }
39            
40            let mut heading = Heading::new(level, &text);
41            
42            // Get ID if present
43            if let Some(id) = element.value().attr("id") {
44                heading.id = Some(id.to_string());
45            }
46            
47            // Get classes
48            heading.classes = element.value().classes()
49                .map(|c| c.to_string())
50                .collect();
51            
52            headings.push(heading);
53        }
54    }
55    
56    Ok(headings)
57}
58
59/// Get the main heading (first h1)
60pub fn get_main_heading(document: &Html) -> Option<String> {
61    document.select(&SELECTORS.h1)
62        .next()
63        .map(|el| el.text().collect::<String>().trim().to_string())
64        .filter(|s| !s.is_empty())
65}
66
67/// Build document outline from headings
68pub fn build_outline(headings: &[Heading]) -> Vec<OutlineItem> {
69    let mut outline = Vec::new();
70    let mut stack: Vec<(u8, usize)> = Vec::new(); // (level, index)
71    
72    for heading in headings {
73        let item = OutlineItem {
74            level: heading.level,
75            text: heading.text.clone(),
76            id: heading.id.clone(),
77            children: Vec::new(),
78        };
79        
80        // Pop items with same or higher level
81        while let Some((level, _)) = stack.last() {
82            if *level >= heading.level {
83                stack.pop();
84            } else {
85                break;
86            }
87        }
88        
89        outline.push(item);
90        stack.push((heading.level, outline.len() - 1));
91    }
92    
93    outline
94}
95
/// Document outline item
///
/// One entry of the outline returned by `build_outline`; each entry mirrors
/// a single heading.
#[derive(Debug, Clone)]
pub struct OutlineItem {
    /// Heading level, copied from the source heading.
    pub level: u8,
    /// Heading text, copied from the source heading.
    pub text: String,
    /// Identifier of the source heading, when it has one.
    pub id: Option<String>,
    /// Child outline items. `build_outline` creates this vector empty.
    pub children: Vec<OutlineItem>,
}
104
105// ============================================================================
106// PARAGRAPHS
107// ============================================================================
108
109/// Extract all paragraphs from the document
110pub fn extract_paragraphs(document: &Html, config: &ParserConfig) -> ParserResult<Vec<String>> {
111    let mut paragraphs = Vec::new();
112    
113    for element in document.select(&SELECTORS.p) {
114        let text = element.text().collect::<String>().trim().to_string();
115        
116        // Filter by minimum length
117        if text.len() >= config.min_paragraph_length {
118            paragraphs.push(text);
119        }
120    }
121    
122    Ok(paragraphs)
123}
124
125// ============================================================================
126// LISTS
127// ============================================================================
128
129/// Extract all lists from the document
130pub fn extract_lists(document: &Html) -> ParserResult<Vec<ListContent>> {
131    let mut lists = Vec::new();
132    
133    // Ordered lists
134    for ol in document.select(&SELECTORS.ol) {
135        if let Some(list) = extract_list(&ol, ListType::Ordered) {
136            lists.push(list);
137        }
138    }
139    
140    // Unordered lists
141    for ul in document.select(&SELECTORS.ul) {
142        if let Some(list) = extract_list(&ul, ListType::Unordered) {
143            lists.push(list);
144        }
145    }
146    
147    // Definition lists
148    for dl in document.select(&SELECTORS.dl) {
149        if let Some(list) = extract_definition_list(&dl) {
150            lists.push(list);
151        }
152    }
153    
154    Ok(lists)
155}
156
157/// Extract a single list
158fn extract_list(element: &ElementRef, list_type: ListType) -> Option<ListContent> {
159    let mut list = ListContent::new(list_type);
160    
161    // Only process direct li children
162    for child in element.children() {
163        if let Some(li) = ElementRef::wrap(child) {
164            if li.value().name() == "li" {
165                let item = extract_list_item(&li);
166                list.add_item(item);
167            }
168        }
169    }
170    
171    if list.is_empty() {
172        None
173    } else {
174        Some(list)
175    }
176}
177
178/// Extract a list item (with potential nested list)
179fn extract_list_item(element: &ElementRef) -> ListItem {
180    // Get text content (excluding nested lists)
181    let mut text = String::new();
182    let mut nested: Option<ListContent> = None;
183    
184    for child in element.children() {
185        match child.value() {
186            scraper::Node::Text(t) => {
187                text.push_str(t.text.trim());
188            }
189            scraper::Node::Element(el) => {
190                if let Some(child_el) = ElementRef::wrap(child) {
191                    match el.name() {
192                        "ul" => {
193                            nested = extract_list(&child_el, ListType::Unordered);
194                        }
195                        "ol" => {
196                            nested = extract_list(&child_el, ListType::Ordered);
197                        }
198                        _ => {
199                            // Get text from inline elements
200                            text.push_str(&child_el.text().collect::<String>());
201                        }
202                    }
203                }
204            }
205            _ => {}
206        }
207    }
208    
209    if let Some(nested_list) = nested {
210        ListItem::with_nested(text.trim(), nested_list)
211    } else {
212        ListItem::new(text.trim())
213    }
214}
215
216/// Extract a definition list
217fn extract_definition_list(element: &ElementRef) -> Option<ListContent> {
218    let mut list = ListContent::new(ListType::Definition);
219    
220    let mut current_term: Option<String> = None;
221    
222    for child in element.children() {
223        if let Some(el) = ElementRef::wrap(child) {
224            match el.value().name() {
225                "dt" => {
226                    current_term = Some(el.text().collect::<String>().trim().to_string());
227                }
228                "dd" => {
229                    let definition = el.text().collect::<String>().trim().to_string();
230                    let item_text = if let Some(term) = current_term.take() {
231                        format!("{}: {}", term, definition)
232                    } else {
233                        definition
234                    };
235                    list.add_item(ListItem::new(item_text));
236                }
237                _ => {}
238            }
239        }
240    }
241    
242    if list.is_empty() {
243        None
244    } else {
245        Some(list)
246    }
247}
248
249// ============================================================================
250// TABLES
251// ============================================================================
252
253/// Extract all tables from the document
254pub fn extract_tables(document: &Html) -> ParserResult<Vec<TableContent>> {
255    let mut tables = Vec::new();
256    
257    for table_el in document.select(&SELECTORS.table) {
258        if let Some(table) = extract_table(&table_el) {
259            tables.push(table);
260        }
261    }
262    
263    Ok(tables)
264}
265
266/// Extract a single table
267fn extract_table(element: &ElementRef) -> Option<TableContent> {
268    let mut table = TableContent::new();
269    
270    // Caption
271    if let Some(caption) = element.select(&SELECTORS.caption).next() {
272        table.caption = Some(caption.text().collect::<String>().trim().to_string());
273    }
274    
275    // Summary attribute
276    table.summary = element.value().attr("summary").map(|s| s.to_string());
277    
278    // Headers (from thead or th elements)
279    if let Some(thead) = element.select(&SELECTORS.thead).next() {
280        for tr in thead.select(&SELECTORS.tr) {
281            let row = extract_table_row(&tr, true);
282            if !row.cells.is_empty() {
283                table.headers.push(row);
284            }
285        }
286    } else {
287        // Look for th in first row
288        if let Some(first_tr) = element.select(&SELECTORS.tr).next() {
289            let cells: Vec<_> = first_tr.select(&SELECTORS.th).collect();
290            if !cells.is_empty() {
291                let row = extract_table_row(&first_tr, true);
292                table.headers.push(row);
293            }
294        }
295    }
296    
297    // Body rows
298    let tbody_selector = &SELECTORS.tbody;
299    let rows_to_process: Vec<ElementRef> = if let Some(tbody) = element.select(tbody_selector).next() {
300        tbody.select(&SELECTORS.tr).collect()
301    } else {
302        // Skip header row if we extracted it
303        let all_rows: Vec<_> = element.select(&SELECTORS.tr).collect();
304        if !table.headers.is_empty() && !all_rows.is_empty() {
305            all_rows.into_iter().skip(1).collect()
306        } else {
307            all_rows
308        }
309    };
310    
311    for tr in rows_to_process {
312        let row = extract_table_row(&tr, false);
313        if !row.cells.is_empty() {
314            // Update column count
315            if row.cells.len() > table.column_count {
316                table.column_count = row.cells.len();
317            }
318            table.rows.push(row);
319        }
320    }
321    
322    if table.is_empty() {
323        None
324    } else {
325        Some(table)
326    }
327}
328
329/// Extract a table row
330fn extract_table_row(element: &ElementRef, is_header: bool) -> TableRow {
331    let mut cells = Vec::new();
332    
333    // Get both th and td cells
334    for child in element.children() {
335        if let Some(cell_el) = ElementRef::wrap(child) {
336            let tag = cell_el.value().name();
337            if tag == "th" || tag == "td" {
338                let cell = extract_table_cell(&cell_el, tag == "th");
339                cells.push(cell);
340            }
341        }
342    }
343    
344    TableRow {
345        cells,
346        is_header_row: is_header,
347    }
348}
349
350/// Extract a table cell
351fn extract_table_cell(element: &ElementRef, is_header: bool) -> TableCell {
352    let content = element.text().collect::<String>().trim().to_string();
353    
354    let colspan = element.value().attr("colspan")
355        .and_then(|s| s.parse().ok())
356        .unwrap_or(1);
357    
358    let rowspan = element.value().attr("rowspan")
359        .and_then(|s| s.parse().ok())
360        .unwrap_or(1);
361    
362    TableCell {
363        content,
364        is_header,
365        colspan,
366        rowspan,
367    }
368}
369
370// ============================================================================
371// CODE BLOCKS
372// ============================================================================
373
374/// Extract all code blocks from the document
375pub fn extract_code_blocks(document: &Html) -> ParserResult<Vec<CodeBlock>> {
376    let mut code_blocks = Vec::new();
377    let mut seen_codes: std::collections::HashSet<String> = std::collections::HashSet::new();
378    
379    // Pre > code (most common)
380    for pre in document.select(&SELECTORS.pre) {
381        let code = if let Some(code_el) = pre.select(&SELECTORS.code).next() {
382            extract_code_block(&code_el, false)
383        } else {
384            extract_code_block(&pre, false)
385        };
386        
387        // Deduplicate
388        if !code.code.trim().is_empty() && !seen_codes.contains(&code.code) {
389            seen_codes.insert(code.code.clone());
390            code_blocks.push(code);
391        }
392    }
393    
394    // Standalone code elements (inline code)
395    for code_el in document.select(&SELECTORS.code) {
396        // Skip if inside pre (already handled)
397        let in_pre = code_el.ancestors()
398            .any(|ancestor| {
399                ancestor.value().as_element()
400                    .map(|e| e.name() == "pre")
401                    .unwrap_or(false)
402            });
403        
404        if !in_pre {
405            let code = extract_code_block(&code_el, true);
406            if !code.code.trim().is_empty() && !seen_codes.contains(&code.code) {
407                seen_codes.insert(code.code.clone());
408                code_blocks.push(code);
409            }
410        }
411    }
412    
413    Ok(code_blocks)
414}
415
416/// Extract a single code block
417fn extract_code_block(element: &ElementRef, is_inline: bool) -> CodeBlock {
418    let code = element.text().collect::<String>();
419    
420    // Detect language from class
421    let language = element.value().classes()
422        .find(|c| {
423            c.starts_with("language-") || 
424            c.starts_with("lang-") || 
425            c.starts_with("hljs-") ||
426            is_known_language(c)
427        })
428        .map(|c| {
429            c.trim_start_matches("language-")
430             .trim_start_matches("lang-")
431             .trim_start_matches("hljs-")
432             .to_string()
433        });
434    
435    // Check for data-language attribute
436    let language = language.or_else(|| {
437        element.value().attr("data-language")
438            .or_else(|| element.value().attr("data-lang"))
439            .map(|s| s.to_string())
440    });
441    
442    let mut block = CodeBlock::new(&code);
443    block.language = language;
444    block.is_inline = is_inline;
445    
446    block
447}
448
/// Tell whether `class` names one of the recognised programming languages.
///
/// The comparison is case-insensitive: the class is lowercased before being
/// looked up in the fixed list below.
fn is_known_language(class: &str) -> bool {
    const KNOWN_LANGUAGES: &[&str] = &[
        "rust", "python", "javascript", "typescript", "java", "c", "cpp",
        "csharp", "go", "ruby", "php", "swift", "kotlin", "scala", "html",
        "css", "sql", "bash", "shell", "json", "yaml", "xml", "markdown",
    ];

    let normalized = class.to_lowercase();
    KNOWN_LANGUAGES.iter().any(|lang| *lang == normalized)
}
458
459// ============================================================================
460// QUOTES
461// ============================================================================
462
463/// Extract all blockquotes from the document
464pub fn extract_quotes(document: &Html) -> ParserResult<Vec<Quote>> {
465    let mut quotes = Vec::new();
466    
467    for blockquote in document.select(&SELECTORS.blockquote) {
468        let text = blockquote.text().collect::<String>().trim().to_string();
469        
470        if text.is_empty() {
471            continue;
472        }
473        
474        let mut quote = Quote::new(&text);
475        
476        // Look for cite attribute
477        quote.cite_url = blockquote.value().attr("cite").map(|s| s.to_string());
478        
479        // Look for footer/cite element for attribution
480        if let Some(sel) = try_parse_selector("footer, cite") {
481            if let Some(cite_el) = blockquote.select(&sel).next() {
482                quote.cite = Some(cite_el.text().collect::<String>().trim().to_string());
483            }
484        }
485        
486        quotes.push(quote);
487    }
488    
489    Ok(quotes)
490}
491
492// ============================================================================
493// IMAGES
494// ============================================================================
495
496/// Extract all images from the document
497pub fn extract_images(document: &Html, base_url: Option<&Url>) -> ParserResult<Vec<Image>> {
498    let mut images = Vec::new();
499    
500    for img in document.select(&SELECTORS.img) {
501        if let Some(image) = extract_image(&img, base_url) {
502            images.push(image);
503        }
504    }
505    
506    Ok(images)
507}
508
509/// Extract a single image
510fn extract_image(element: &ElementRef, base_url: Option<&Url>) -> Option<Image> {
511    let src = element.value().attr("src")
512        .or_else(|| element.value().attr("data-src"))
513        .or_else(|| element.value().attr("data-lazy-src"))?;
514    
515    let alt = element.value().attr("alt").unwrap_or("").to_string();
516    
517    let mut image = Image::new(src, &alt);
518    
519    // Resolve URL
520    image.url = resolve_image_url(src, base_url);
521    
522    // Get dimensions
523    image.width = element.value().attr("width")
524        .and_then(|s| s.trim_end_matches("px").parse().ok());
525    image.height = element.value().attr("height")
526        .and_then(|s| s.trim_end_matches("px").parse().ok());
527    
528    // Responsive images
529    image.srcset = element.value().attr("srcset").map(|s| s.to_string());
530    image.sizes = element.value().attr("sizes").map(|s| s.to_string());
531    
532    // Loading attribute
533    image.loading = match element.value().attr("loading") {
534        Some("lazy") => ImageLoading::Lazy,
535        _ => ImageLoading::Eager,
536    };
537    
538    // Title
539    image.title = element.value().attr("title").map(|s| s.to_string());
540    
541    Some(image)
542}
543
544/// Resolve image URL
545fn resolve_image_url(src: &str, base_url: Option<&Url>) -> Option<String> {
546    let trimmed = src.trim();
547    
548    if trimmed.is_empty() || trimmed.starts_with("data:") {
549        return None;
550    }
551    
552    if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
553        return Some(trimmed.to_string());
554    }
555    
556    if trimmed.starts_with("//") {
557        return Some(format!("https:{}", trimmed));
558    }
559    
560    base_url
561        .and_then(|base| base.join(trimmed).ok())
562        .map(|u| u.to_string())
563}
564
565// ============================================================================
566// TESTS
567// ============================================================================
568
// Unit tests for the extraction functions in this module. Each test parses a
// small HTML snippet and checks the extracted structure.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `html` as a complete document.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // Four headings are found; the first is the h1 with its text and id.
    #[test]
    fn test_extract_headings() {
        let doc = parse_html(r#"
            <html><body>
                <h1 id="main">Main Title</h1>
                <h2>Section 1</h2>
                <h2>Section 2</h2>
                <h3>Subsection</h3>
            </body></html>
        "#);
        
        let headings = extract_headings(&doc).unwrap();
        assert_eq!(headings.len(), 4);
        assert_eq!(headings[0].level, 1);
        assert_eq!(headings[0].text, "Main Title");
        assert_eq!(headings[0].id, Some("main".to_string()));
    }

    // The main heading is the text of the h1.
    #[test]
    fn test_get_main_heading() {
        let doc = parse_html("<html><body><h1>Main Title</h1></body></html>");
        assert_eq!(get_main_heading(&doc), Some("Main Title".to_string()));
    }

    // With the default config, the "Short" paragraph is filtered out and the
    // two longer ones are kept.
    #[test]
    fn test_extract_paragraphs() {
        let doc = parse_html(r#"
            <html><body>
                <p>This is a long enough paragraph to be included.</p>
                <p>Short</p>
                <p>Another paragraph that should be extracted.</p>
            </body></html>
        "#);
        
        let config = ParserConfig::default();
        let paragraphs = extract_paragraphs(&doc, &config).unwrap();
        assert_eq!(paragraphs.len(), 2);
    }

    // A single <ol> yields one ordered list with three items.
    #[test]
    fn test_extract_ordered_list() {
        let doc = parse_html(r#"
            <ol>
                <li>First item</li>
                <li>Second item</li>
                <li>Third item</li>
            </ol>
        "#);
        
        let lists = extract_lists(&doc).unwrap();
        assert_eq!(lists.len(), 1);
        assert_eq!(lists[0].list_type, ListType::Ordered);
        assert_eq!(lists[0].items.len(), 3);
    }

    // A <ul> inside an <li> is attached to that item as its nested list.
    #[test]
    fn test_extract_nested_list() {
        let doc = parse_html(r#"
            <ul>
                <li>Item 1
                    <ul>
                        <li>Nested 1</li>
                        <li>Nested 2</li>
                    </ul>
                </li>
                <li>Item 2</li>
            </ul>
        "#);
        
        let lists = extract_lists(&doc).unwrap();
        assert!(!lists.is_empty());
        // First item should have nested list
        assert!(lists[0].items[0].nested.is_some());
    }

    // Caption, one header row from <thead>, two body rows, two columns.
    #[test]
    fn test_extract_table() {
        let doc = parse_html(r#"
            <table>
                <caption>Test Table</caption>
                <thead>
                    <tr><th>Header 1</th><th>Header 2</th></tr>
                </thead>
                <tbody>
                    <tr><td>Cell 1</td><td>Cell 2</td></tr>
                    <tr><td>Cell 3</td><td>Cell 4</td></tr>
                </tbody>
            </table>
        "#);
        
        let tables = extract_tables(&doc).unwrap();
        assert_eq!(tables.len(), 1);
        assert_eq!(tables[0].caption, Some("Test Table".to_string()));
        assert_eq!(tables[0].headers.len(), 1);
        assert_eq!(tables[0].rows.len(), 2);
        assert_eq!(tables[0].column_count, 2);
    }

    // <pre><code class="language-rust"> yields one non-inline block whose
    // language is "rust".
    #[test]
    fn test_extract_code_block() {
        let doc = parse_html(r#"
            <pre><code class="language-rust">
                fn main() {
                    println!("Hello");
                }
            </code></pre>
        "#);
        
        let code_blocks = extract_code_blocks(&doc).unwrap();
        assert_eq!(code_blocks.len(), 1);
        assert_eq!(code_blocks[0].language, Some("rust".to_string()));
        assert!(!code_blocks[0].is_inline);
    }

    // A <code> element outside of <pre> is reported as inline code.
    #[test]
    fn test_extract_inline_code() {
        let doc = parse_html(r#"<p>Use the <code>println!</code> macro.</p>"#);
        
        let code_blocks = extract_code_blocks(&doc).unwrap();
        assert_eq!(code_blocks.len(), 1);
        assert!(code_blocks[0].is_inline);
    }

    // The quote text is captured and the `cite` attribute becomes `cite_url`.
    #[test]
    fn test_extract_quotes() {
        let doc = parse_html(r#"
            <blockquote cite="https://example.com">
                <p>This is a quote.</p>
                <footer>— Author Name</footer>
            </blockquote>
        "#);
        
        let quotes = extract_quotes(&doc).unwrap();
        assert_eq!(quotes.len(), 1);
        assert!(quotes[0].text.contains("This is a quote"));
        assert_eq!(quotes[0].cite_url, Some("https://example.com".to_string()));
    }

    // All image attributes are read and the relative `src` is resolved
    // against the base URL.
    #[test]
    fn test_extract_images() {
        let doc = parse_html(r#"
            <img src="/images/photo.jpg" 
                 alt="A photo" 
                 title="Photo title"
                 width="800" 
                 height="600"
                 loading="lazy">
        "#);
        
        let base = Url::parse("https://example.com").unwrap();
        let images = extract_images(&doc, Some(&base)).unwrap();
        
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].alt, "A photo");
        assert_eq!(images[0].title, Some("Photo title".to_string()));
        assert_eq!(images[0].width, Some(800));
        assert_eq!(images[0].height, Some(600));
        assert_eq!(images[0].loading, ImageLoading::Lazy);
        assert_eq!(images[0].url, Some("https://example.com/images/photo.jpg".to_string()));
    }

    // An image with an empty alt text is flagged as decorative.
    #[test]
    fn test_image_decorative() {
        let doc = parse_html(r#"<img src="/spacer.gif" alt="">"#);
        let images = extract_images(&doc, None).unwrap();
        assert!(images[0].is_decorative);
    }

    // The `colspan` attribute is parsed into the cell.
    #[test]
    fn test_table_with_colspan() {
        let doc = parse_html(r#"
            <table>
                <tr><td colspan="2">Spanning cell</td></tr>
                <tr><td>Cell 1</td><td>Cell 2</td></tr>
            </table>
        "#);
        
        let tables = extract_tables(&doc).unwrap();
        assert_eq!(tables[0].rows[0].cells[0].colspan, 2);
    }

    // Two <dt>/<dd> pairs yield one definition list with two items.
    #[test]
    fn test_definition_list() {
        let doc = parse_html(r#"
            <dl>
                <dt>Term 1</dt>
                <dd>Definition 1</dd>
                <dt>Term 2</dt>
                <dd>Definition 2</dd>
            </dl>
        "#);
        
        let lists = extract_lists(&doc).unwrap();
        assert_eq!(lists.len(), 1);
        assert_eq!(lists[0].list_type, ListType::Definition);
        assert_eq!(lists[0].items.len(), 2);
    }

    // The outline has one top-level entry per heading (four here).
    #[test]
    fn test_build_outline() {
        let headings = vec![
            Heading::new(1, "Main"),
            Heading::new(2, "Section 1"),
            Heading::new(3, "Subsection 1.1"),
            Heading::new(2, "Section 2"),
        ];
        
        let outline = build_outline(&headings);
        assert_eq!(outline.len(), 4);
    }

    // Language lookup ignores case and rejects unknown names.
    #[test]
    fn test_is_known_language() {
        assert!(is_known_language("rust"));
        assert!(is_known_language("Python"));
        assert!(is_known_language("JAVASCRIPT"));
        assert!(!is_known_language("unknown-lang"));
    }

    // `srcset` and `sizes` are captured and the image reports itself as
    // responsive.
    #[test]
    fn test_responsive_image() {
        let doc = parse_html(r#"
            <img src="/img.jpg" 
                 srcset="/img-320.jpg 320w, /img-640.jpg 640w"
                 sizes="(max-width: 320px) 280px, 640px"
                 alt="Responsive">
        "#);
        
        let images = extract_images(&doc, None).unwrap();
        assert!(images[0].is_responsive());
        assert!(images[0].srcset.is_some());
        assert!(images[0].sizes.is_some());
    }
}