edgeparse_core/output/
html.rs1use crate::models::content::ContentElement;
4use crate::models::document::PdfDocument;
5use crate::models::table::TableTokenRow;
6use crate::EdgePdfError;
7
8pub fn to_html(doc: &PdfDocument) -> Result<String, EdgePdfError> {
13 let mut output = String::new();
14 output.push_str("<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n");
15 output.push_str("<meta charset=\"UTF-8\">\n");
16
17 if let Some(ref title) = doc.title {
18 output.push_str(&format!("<title>{}</title>\n", html_escape(title)));
19 } else {
20 output.push_str(&format!("<title>{}</title>\n", html_escape(&doc.file_name)));
21 }
22
23 output.push_str("</head>\n<body>\n");
24
25 if doc.kids.is_empty() {
26 output.push_str("<p>No content extracted.</p>\n");
27 } else {
28 for element in &doc.kids {
29 render_element(&mut output, element);
30 }
31 }
32
33 output.push_str("</body>\n</html>\n");
34 Ok(output)
35}
36
37fn token_rows_text(rows: &[TableTokenRow]) -> String {
39 rows.iter()
40 .flat_map(|row| row.iter())
41 .map(|token| token.base.value.as_str())
42 .collect::<Vec<_>>()
43 .join(" ")
44}
45
46fn render_element(out: &mut String, element: &ContentElement) {
47 match element {
48 ContentElement::Heading(h) => {
49 let level = h.heading_level.unwrap_or(1).clamp(1, 6);
50 let text = html_escape(&h.base.base.value());
51 out.push_str(&format!("<h{level}>{text}</h{level}>\n"));
52 }
53 ContentElement::Paragraph(p) => {
54 let text = html_escape(&p.base.value());
55 let trimmed = text.trim();
56 if !trimmed.is_empty() {
57 out.push_str(&format!("<p>{trimmed}</p>\n"));
58 }
59 }
60 ContentElement::List(list) => {
61 out.push_str("<ul>\n");
62 for item in &list.list_items {
63 let label = token_rows_text(&item.label.content);
64 let body = token_rows_text(&item.body.content);
65 out.push_str(&format!(
66 "<li>{}{}</li>\n",
67 html_escape(label.trim()),
68 html_escape(body.trim())
69 ));
70 }
71 out.push_str("</ul>\n");
72 }
73 ContentElement::Image(_) => {
74 out.push_str("<img src=\"image\" alt=\"Image\">\n");
75 }
76 ContentElement::HeaderFooter(_) => {
77 }
79 ContentElement::TextBlock(tb) => {
80 let text = html_escape(&tb.value());
81 let trimmed = text.trim();
82 if !trimmed.is_empty() {
83 out.push_str(&format!("<p>{trimmed}</p>\n"));
84 }
85 }
86 ContentElement::TextLine(tl) => {
87 let text = html_escape(&tl.value());
88 let trimmed = text.trim();
89 if !trimmed.is_empty() {
90 out.push_str(&format!("<span>{trimmed}</span>\n"));
91 }
92 }
93 ContentElement::TextChunk(tc) => {
94 out.push_str(&html_escape(&tc.value));
95 }
96 _ => {}
97 }
98}
99
/// Escapes the characters that are significant in HTML text and in
/// double-quoted attribute values.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and
/// `&quot;`; every other character is copied through unchanged. The input is
/// scanned once, so an ampersand produced by an earlier replacement can never
/// be escaped a second time.
fn html_escape(s: &str) -> String {
    // The escaped form is at least as long as the input.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
107
#[cfg(test)]
mod tests {
    use super::*;

    /// Markup-significant characters must come out as entity references.
    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("<script>"), "&lt;script&gt;");
        assert_eq!(html_escape("a & b"), "a &amp; b");
        assert_eq!(html_escape("\"q\""), "&quot;q&quot;");
    }

    /// Even a freshly created document yields the full page skeleton.
    #[test]
    fn test_html_structure() {
        let doc = PdfDocument::new("test.pdf".to_string());
        let html = to_html(&doc).unwrap();
        assert!(html.contains("<!DOCTYPE html>"));
        assert!(html.contains("<html lang=\"en\">"));
        assert!(html.contains("</body>"));
        assert!(html.contains("</html>"));
    }

    /// The document title is escaped before it is placed inside `<title>`.
    #[test]
    fn test_html_title() {
        let mut doc = PdfDocument::new("test.pdf".to_string());
        doc.title = Some("Test <Title>".to_string());
        let html = to_html(&doc).unwrap();
        assert!(html.contains("<title>Test &lt;Title&gt;</title>"));
    }
}