buup/transformers/
markdown_to_html.rs

1use crate::{Transform, TransformError, TransformerCategory};
2
/// Transformer that renders Markdown text as HTML.
///
/// The type carries no state; all behaviour lives in its `Transform` implementation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct MarkdownToHtml;
6
7impl Transform for MarkdownToHtml {
8    fn name(&self) -> &'static str {
9        "Markdown to HTML"
10    }
11
12    fn id(&self) -> &'static str {
13        "markdowntohtml"
14    }
15
16    fn category(&self) -> TransformerCategory {
17        TransformerCategory::Formatter
18    }
19
20    fn description(&self) -> &'static str {
21        "Converts Markdown text to HTML format"
22    }
23
24    fn transform(&self, input: &str) -> Result<String, TransformError> {
25        let mut html = String::new();
26        let mut in_code_block = false;
27        let mut code_language = String::new();
28        let mut in_list = false;
29        let mut in_ordered_list = false;
30        let mut in_blockquote = false;
31        let lines = input.lines();
32
33        for line in lines {
34            // Handle code blocks
35            if line.trim().starts_with("```") {
36                if in_code_block {
37                    html.push_str("</code></pre>\n");
38                    in_code_block = false;
39                    code_language.clear();
40                } else {
41                    in_code_block = true;
42                    code_language.clear();
43                    // Extract language if specified
44                    let language_start = line.trim_start().chars().skip(3).collect::<String>();
45                    if !language_start.is_empty() {
46                        code_language = language_start.trim().to_string();
47                        if !code_language.is_empty() {
48                            html.push_str(&format!(
49                                "<pre><code class=\"language-{}\">",
50                                code_language
51                            ));
52                        } else {
53                            html.push_str("<pre><code>");
54                        }
55                    } else {
56                        html.push_str("<pre><code>");
57                    }
58                }
59                continue;
60            }
61
62            if in_code_block {
63                html.push_str(&line.replace('<', "&lt;").replace('>', "&gt;"));
64                html.push('\n');
65                continue;
66            }
67
68            // Handle horizontal rules
69            if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
70                html.push_str("<hr>\n");
71                continue;
72            }
73
74            // Handle blockquotes
75            if line.trim().starts_with('>') {
76                if !in_blockquote {
77                    html.push_str("<blockquote>\n");
78                    in_blockquote = true;
79                }
80                let content = line.trim()[1..].trim_start();
81                let processed_content = process_inline_markdown(content);
82                html.push_str(&format!("<p>{}</p>\n", processed_content));
83                continue;
84            } else if in_blockquote && line.trim().is_empty() {
85                html.push_str("</blockquote>\n");
86                in_blockquote = false;
87                continue;
88            }
89
90            // Handle headers
91            let level = line.chars().take_while(|&c| c == '#').count();
92            if level > 0 && level <= 6 && line.chars().nth(level) == Some(' ') {
93                let content = line[level..].trim();
94                let processed_content = process_inline_markdown(content);
95                html.push_str(&format!("<h{}>{}</h{}>\n", level, processed_content, level));
96                continue;
97            }
98
99            // Handle ordered lists
100            if let Some(content) = line.trim().strip_prefix("1. ") {
101                if !in_ordered_list {
102                    if in_list {
103                        html.push_str("</ul>\n");
104                        in_list = false;
105                    }
106                    html.push_str("<ol>\n");
107                    in_ordered_list = true;
108                }
109                let processed_content = process_inline_markdown(content);
110                html.push_str(&format!("<li>{}</li>\n", processed_content));
111                continue;
112            } else if in_ordered_list && line.trim().len() >= 3 {
113                // Check for any number followed by a dot and space (e.g., "2. ", "10. ")
114                let parts: Vec<&str> = line.trim().splitn(2, ". ").collect();
115                if parts.len() == 2 && parts[0].parse::<usize>().is_ok() {
116                    let processed_content = process_inline_markdown(parts[1]);
117                    html.push_str(&format!("<li>{}</li>\n", processed_content));
118                    continue;
119                } else if in_ordered_list {
120                    html.push_str("</ol>\n");
121                    in_ordered_list = false;
122                }
123            } else if in_ordered_list && line.trim().is_empty() {
124                html.push_str("</ol>\n");
125                in_ordered_list = false;
126            }
127
128            // Handle unordered lists
129            if line.trim().starts_with("- ") || line.trim().starts_with("* ") {
130                if !in_list {
131                    if in_ordered_list {
132                        html.push_str("</ol>\n");
133                        in_ordered_list = false;
134                    }
135                    html.push_str("<ul>\n");
136                    in_list = true;
137                }
138                let marker_len = 2; // Both "- " and "* " are 2 chars long
139                let content = line.trim()[marker_len..].trim();
140                let processed_content = process_inline_markdown(content);
141                html.push_str(&format!("<li>{}</li>\n", processed_content));
142                continue;
143            } else if in_list && line.trim().is_empty() {
144                html.push_str("</ul>\n");
145                in_list = false;
146                continue;
147            }
148
149            // Handle paragraphs
150            if !line.trim().is_empty() {
151                let processed_line = process_inline_markdown(line);
152
153                // Skip adding paragraph tags around certain elements that are already block-level
154                if !processed_line.starts_with("<h")
155                    && !processed_line.starts_with("<ul")
156                    && !processed_line.starts_with("<ol")
157                    && !processed_line.starts_with("<li")
158                    && !processed_line.starts_with("<blockquote")
159                {
160                    html.push_str("<p>");
161                    html.push_str(&processed_line);
162                    html.push_str("</p>\n");
163                } else {
164                    html.push_str(&processed_line);
165                    html.push('\n');
166                }
167            } else if !in_list && !in_ordered_list && !in_blockquote && !line.trim().is_empty() {
168                html.push('\n');
169            }
170        }
171
172        // Close any open tags
173        if in_list {
174            html.push_str("</ul>\n");
175        }
176        if in_ordered_list {
177            html.push_str("</ol>\n");
178        }
179        if in_blockquote {
180            html.push_str("</blockquote>\n");
181        }
182        if in_code_block {
183            html.push_str("</code></pre>\n");
184        }
185
186        Ok(html)
187    }
188
189    fn default_test_input(&self) -> &'static str {
190        "# Hello World\n\nThis is a **bold** and *italic* text with ~~strikethrough~~ and `inline code`.\n\n- List item 1\n- List item 2\n\n1. Ordered item 1\n2. Ordered item 2\n\n> This is a blockquote\n\n[Link text](https://example.com)\n\n---\n\n```rust\nfn main() {\n    println!(\"Hello, world!\");\n}\n```"
191    }
192}
193
/// Converts the inline Markdown of a single piece of text to HTML.
///
/// Handled, in this order: `` `code` ``, `**bold**`, `*italic*`, `~~strikethrough~~` and
/// `[text](url)` links. Afterwards every `<` that does not start one of the tags accepted
/// by `is_start_of_html_tag` becomes `&lt;`, and every `>` outside such a tag becomes
/// `&gt;` unless it directly follows a `/`.
///
/// A delimiter without a matching closing delimiter is left in the output unchanged.
fn process_inline_markdown(input: &str) -> String {
    let mut result = input.to_string();

    wrap_delimited(&mut result, "`", "code");
    wrap_delimited(&mut result, "**", "strong");
    wrap_delimited(&mut result, "*", "em");
    wrap_delimited(&mut result, "~~", "del");
    convert_links(&mut result);

    escape_stray_angle_brackets(&result)
}

/// Replaces every `<delimiter>text<delimiter>` pair in `text` with `<tag>text</tag>`.
fn wrap_delimited(text: &mut String, delimiter: &str, tag: &str) {
    while let Some(start) = text.find(delimiter) {
        let inner_start = start + delimiter.len();
        let inner_end = match text[inner_start..].find(delimiter) {
            Some(offset) => inner_start + offset,
            // No closing delimiter: leave the remainder untouched.
            None => break,
        };
        let replacement = format!("<{0}>{1}</{0}>", tag, &text[inner_start..inner_end]);
        text.replace_range(start..inner_end + delimiter.len(), &replacement);
    }
}

/// Replaces every `[label](url)` in `text` with `<a href="url">label</a>`.
fn convert_links(text: &mut String) {
    let mut search_from = 0;
    while let Some(offset) = text[search_from..].find('[') {
        let start = search_from + offset;
        let label_end = match text[start..].find(']') {
            Some(off) => start + off,
            // Without any `]` ahead no further link can exist.
            None => break,
        };
        if text.as_bytes().get(label_end + 1) != Some(&b'(') {
            // A plain bracket such as "[1]" is not a link; keep scanning after it
            // instead of giving up on the rest of the line.
            search_from = start + 1;
            continue;
        }
        let url_end = match text[label_end + 1..].find(')') {
            Some(off) => label_end + 1 + off,
            // Without any `)` ahead no further link can exist.
            None => break,
        };
        let link_html = format!(
            "<a href=\"{}\">{}</a>",
            &text[label_end + 2..url_end],
            &text[start + 1..label_end]
        );
        text.replace_range(start..=url_end, &link_html);
        search_from = start + link_html.len();
    }
}

/// Escapes `<` and `>` characters that are not part of an accepted inline tag.
///
/// The text is copied in whole string slices cut at ASCII `<` / `>` positions, so
/// multi-byte UTF-8 characters are preserved exactly.
fn escape_stray_angle_brackets(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    // Start of the run of bytes that has been scanned but not yet copied to `out`.
    let mut pending = 0;
    let mut i = 0;

    while i < bytes.len() {
        match bytes[i] {
            b'<' => {
                out.push_str(&text[pending..i]);
                if is_start_of_html_tag(&bytes[i + 1..]) {
                    // Copy the whole tag verbatim, up to and including its `>`
                    // (or to the end of the text when the tag is never closed).
                    let tag_end = bytes[i..]
                        .iter()
                        .position(|&b| b == b'>')
                        .map_or(bytes.len(), |off| i + off + 1);
                    out.push_str(&text[i..tag_end]);
                    i = tag_end;
                } else {
                    out.push_str("&lt;");
                    i += 1;
                }
                pending = i;
            }
            // A `>` directly after `/` is left alone; any other stray `>` is escaped.
            b'>' if i == 0 || bytes[i - 1] != b'/' => {
                out.push_str(&text[pending..i]);
                out.push_str("&gt;");
                i += 1;
                pending = i;
            }
            _ => i += 1,
        }
    }
    out.push_str(&text[pending..]);

    out
}

/// Returns `true` if `bytes` (the text right after a `<`) begins with one of the tag
/// prefixes that are passed through unescaped.
///
/// This is a plain prefix comparison, so longer names sharing a prefix also match
/// (for example `embed` matches the `em` entry).
fn is_start_of_html_tag(bytes: &[u8]) -> bool {
    const TAG_PREFIXES: [&[u8]; 14] = [
        b"a ",
        b"a>",
        b"a href",
        b"/a>",
        b"strong",
        b"/strong",
        b"em",
        b"/em",
        b"del",
        b"/del",
        b"code",
        b"/code",
        b"p>",
        b"/p>",
    ];

    TAG_PREFIXES.iter().any(|prefix| bytes.starts_with(prefix))
}
336
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the transformer on `markdown` and checks the exact HTML it produces.
    fn assert_html(markdown: &str, expected_html: &str) {
        let actual = MarkdownToHtml.transform(markdown).unwrap();
        assert_eq!(actual, expected_html);
    }

    // Header, paragraph with emphasis, unordered list and link in one document.
    #[test]
    fn test_markdown_to_html() {
        assert_html(
            "# Title\n\nThis is **bold** and *italic*.\n\n- Item 1\n- Item 2\n\n[Link](https://example.com)",
            "<h1>Title</h1>\n<p>This is <strong>bold</strong> and <em>italic</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>\n<p><a href=\"https://example.com\">Link</a></p>\n",
        );
    }

    // Fenced block without a language.
    #[test]
    fn test_code_block() {
        assert_html(
            "```\ncode here\n```",
            "<pre><code>code here\n</code></pre>\n",
        );
    }

    // Fenced block whose language becomes a `language-…` class.
    #[test]
    fn test_code_block_with_language() {
        assert_html(
            "```rust\nfn main() {\n    println!(\"Hello!\");\n}\n```",
            "<pre><code class=\"language-rust\">fn main() {\n    println!(\"Hello!\");\n}\n</code></pre>\n",
        );
    }

    // Numbered items, closed at end of input.
    #[test]
    fn test_ordered_list() {
        assert_html(
            "1. First item\n2. Second item",
            "<ol>\n<li>First item</li>\n<li>Second item</li>\n</ol>\n",
        );
    }

    // Single quoted line.
    #[test]
    fn test_blockquote() {
        assert_html(
            "> This is a quote",
            "<blockquote>\n<p>This is a quote</p>\n</blockquote>\n",
        );
    }

    // Rule between two paragraphs.
    #[test]
    fn test_horizontal_rule() {
        assert_html(
            "Before\n\n---\n\nAfter",
            "<p>Before</p>\n<hr>\n<p>After</p>\n",
        );
    }

    // `~~text~~` becomes `<del>`.
    #[test]
    fn test_strikethrough() {
        assert_html(
            "This is ~~strikethrough~~ text",
            "<p>This is <del>strikethrough</del> text</p>\n",
        );
    }

    // Backticks become `<code>`.
    #[test]
    fn test_inline_code() {
        assert_html(
            "This is `inline code` text",
            "<p>This is <code>inline code</code> text</p>\n",
        );
    }
}