Skip to main content

edgeparse_core/output/
html.rs

1//! HTML5 output generator.
2
3use crate::models::content::ContentElement;
4use crate::models::document::PdfDocument;
5use crate::models::table::TableTokenRow;
6use crate::EdgePdfError;
7
8/// Generate HTML5 representation of a PdfDocument.
9///
10/// # Errors
11/// Returns `EdgePdfError::OutputError` on write failures.
12pub fn to_html(doc: &PdfDocument) -> Result<String, EdgePdfError> {
13    let mut output = String::new();
14    output.push_str("<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n");
15    output.push_str("<meta charset=\"UTF-8\">\n");
16
17    if let Some(ref title) = doc.title {
18        output.push_str(&format!("<title>{}</title>\n", html_escape(title)));
19    } else {
20        output.push_str(&format!("<title>{}</title>\n", html_escape(&doc.file_name)));
21    }
22
23    output.push_str("</head>\n<body>\n");
24
25    if doc.kids.is_empty() {
26        output.push_str("<p>No content extracted.</p>\n");
27    } else {
28        for element in &doc.kids {
29            render_element(&mut output, element);
30        }
31    }
32
33    output.push_str("</body>\n</html>\n");
34    Ok(output)
35}
36
37/// Extract text from table token rows.
38fn token_rows_text(rows: &[TableTokenRow]) -> String {
39    rows.iter()
40        .flat_map(|row| row.iter())
41        .map(|token| token.base.value.as_str())
42        .collect::<Vec<_>>()
43        .join(" ")
44}
45
46fn render_element(out: &mut String, element: &ContentElement) {
47    match element {
48        ContentElement::Heading(h) => {
49            let level = h.heading_level.unwrap_or(1).clamp(1, 6);
50            let text = html_escape(&h.base.base.value());
51            out.push_str(&format!("<h{level}>{text}</h{level}>\n"));
52        }
53        ContentElement::Paragraph(p) => {
54            let text = html_escape(&p.base.value());
55            let trimmed = text.trim();
56            if !trimmed.is_empty() {
57                out.push_str(&format!("<p>{trimmed}</p>\n"));
58            }
59        }
60        ContentElement::List(list) => {
61            out.push_str("<ul>\n");
62            for item in &list.list_items {
63                let label = token_rows_text(&item.label.content);
64                let body = token_rows_text(&item.body.content);
65                out.push_str(&format!(
66                    "<li>{}{}</li>\n",
67                    html_escape(label.trim()),
68                    html_escape(body.trim())
69                ));
70            }
71            out.push_str("</ul>\n");
72        }
73        ContentElement::Image(_) => {
74            out.push_str("<img src=\"image\" alt=\"Image\">\n");
75        }
76        ContentElement::HeaderFooter(_) => {
77            // Skip headers/footers in HTML by default
78        }
79        ContentElement::TextBlock(tb) => {
80            let text = html_escape(&tb.value());
81            let trimmed = text.trim();
82            if !trimmed.is_empty() {
83                out.push_str(&format!("<p>{trimmed}</p>\n"));
84            }
85        }
86        ContentElement::TextLine(tl) => {
87            let text = html_escape(&tl.value());
88            let trimmed = text.trim();
89            if !trimmed.is_empty() {
90                out.push_str(&format!("<span>{trimmed}</span>\n"));
91            }
92        }
93        ContentElement::TextChunk(tc) => {
94            out.push_str(&html_escape(&tc.value));
95        }
96        _ => {}
97    }
98}
99
/// Escape the characters that are significant in HTML text.
///
/// Replaces `&`, `<`, `>` and `"` with `&amp;`, `&lt;`, `&gt;` and `&quot;`.
/// Every other character, including `'`, is copied through unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
107
#[cfg(test)]
mod tests {
    use super::*;

    /// `html_escape` replaces all four special characters and escapes an
    /// ampersand that is already part of the input exactly once.
    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("<script>"), "&lt;script&gt;");
        assert_eq!(html_escape("a & b"), "a &amp; b");
        assert_eq!(html_escape("say \"hi\""), "say &quot;hi&quot;");
        // Input that already looks like an entity still has its `&` escaped.
        assert_eq!(html_escape("&lt;"), "&amp;lt;");
        // Nothing to escape: text passes through untouched.
        assert_eq!(html_escape(""), "");
        assert_eq!(html_escape("plain text"), "plain text");
    }

    /// The generated page carries the fixed HTML5 skeleton.
    #[test]
    fn test_html_structure() {
        let doc = PdfDocument::new("test.pdf".to_string());
        let html = to_html(&doc).unwrap();
        assert!(html.contains("<!DOCTYPE html>"));
        assert!(html.contains("<html lang=\"en\">"));
        assert!(html.contains("</body>"));
        assert!(html.contains("</html>"));
        // The skeleton is emitted unconditionally, so pin its exact position too.
        assert!(html.starts_with("<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n"));
        assert!(html.contains("<meta charset=\"UTF-8\">\n"));
        assert!(html.ends_with("</body>\n</html>\n"));
    }

    /// An explicit document title is escaped before being placed in `<title>`.
    #[test]
    fn test_html_title() {
        let mut doc = PdfDocument::new("test.pdf".to_string());
        doc.title = Some("Test <Title>".to_string());
        let html = to_html(&doc).unwrap();
        assert!(html.contains("<title>Test &lt;Title&gt;</title>"));
    }
}