Skip to main content

mark_html/
lib.rs

// Declares the sibling `test` module; it is compiled only for test builds.
// NOTE(review): presumably holds this crate's unit tests — its contents are not visible here.
#[cfg(test)]
pub mod test;
/// Lexical tokens produced by `lex` from raw Markdown text.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token {
    /// A run of `#` characters; the payload is how many there were.
    Heading(usize),
    /// An opening `**`.
    BoldStart,
    /// A closing `**`.
    BoldEnd,
    /// An opening `*`.
    ItalicStart,
    /// A closing `*`.
    ItalicEnd,
    /// A run of literal text.
    Text(String),
    /// A `\n` character.
    NewLine,
    /// A complete `[text](url)` link.
    Link { text: String, url: String },
    /// A `-` followed by a space.
    ListItemStart,
    /// The raw content found between a pair of triple-backtick fences.
    CodeBlock(String),
}
16
/// Tree built by `parse` and turned into HTML by `render`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Node {
    /// Root of the tree; holds the top-level block nodes in source order.
    Document(Vec<Node>),
    /// A heading: its level followed by its inline content.
    Heading(usize, Vec<Node>),
    /// A paragraph of inline content.
    Paragraph(Vec<Node>),
    /// Bold inline content.
    Bold(Vec<Node>),
    /// Italic inline content.
    Italic(Vec<Node>),
    /// Literal text.
    Text(String),
    /// A hyperlink with its visible text and target URL.
    Link { text: String, url: String },
    /// An unordered list; its children are `ListItem` nodes.
    UnorderedList(Vec<Node>),
    /// A single list entry and its inline content.
    ListItem(Vec<Node>),
    /// A fenced code block's raw content.
    CodeBlock(String),
}
30
31pub fn to_html(input: &str) -> String {
32    let tokens = lex(input);
33    let ast = parse(&tokens);
34    render(&ast)
35}
36
37fn lex(input: &str) -> Vec<Token> {
38    let mut tokens = Vec::new();
39    let mut chars = input.chars().peekable();
40    let mut bold_active = false;
41    let mut italic_active = false;
42    while let Some(c) = chars.next() {
43        match c {
44            '#' => {
45                let mut level = 1;
46                while let Some('#') = chars.peek() {
47                    chars.next();
48                    level += 1;
49                }
50                // Headings are typically followed by a space
51                if chars.peek() == Some(&' ') {
52                    chars.next();
53                }
54                tokens.push(Token::Heading(level));
55            }
56            '*' => {
57                if chars.peek() == Some(&'*') {
58                    chars.next(); // consume the second '*'
59                    if bold_active {
60                        tokens.push(Token::BoldEnd);
61                    } else {
62                        tokens.push(Token::BoldStart);
63                    }
64                    bold_active = !bold_active;
65                } else {
66                    if italic_active {
67                        tokens.push(Token::ItalicEnd);
68                    } else {
69                        tokens.push(Token::ItalicStart);
70                    }
71                    italic_active = !italic_active;
72                }
73            }
74            '\n' => {
75                tokens.push(Token::NewLine);
76            }
77            '[' => {
78                let mut text = String::new();
79                while let Some(&ch) = chars.peek() {
80                    if ch == ']' {
81                        break;
82                    }
83                    text.push(chars.next().unwrap());
84                }
85
86                // Check for the full link syntax: [text](url)
87                if chars.peek() == Some(&']') {
88                    chars.next(); // consume ']'
89                    if chars.peek() == Some(&'(') {
90                        chars.next(); // consume '('
91                        let mut url = String::new();
92                        while let Some(&ch) = chars.peek() {
93                            if ch == ')' {
94                                break;
95                            }
96                            url.push(chars.next().unwrap());
97                        }
98                        if chars.peek() == Some(&')') {
99                            chars.next(); // consume ')'
100                            tokens.push(Token::Link { text, url });
101                        } else {
102                            // This is a malformed link, like [text](url
103                            // Treat all parts as plain text.
104                            tokens.push(Token::Text("[".to_string()));
105                            tokens.push(Token::Text(text));
106                            tokens.push(Token::Text("]".to_string()));
107                            tokens.push(Token::Text("(".to_string()));
108                            tokens.push(Token::Text(url));
109                        }
110                    } else {
111                        // This is just text in brackets, like [text]
112                        tokens.push(Token::Text("[".to_string()));
113                        tokens.push(Token::Text(text));
114                        tokens.push(Token::Text("]".to_string()));
115                    }
116                } else {
117                    // No closing bracket found, like [text
118                    tokens.push(Token::Text("[".to_string()));
119                    tokens.push(Token::Text(text));
120                }
121            }
122            '-' => {
123                if chars.peek() == Some(&' ') {
124                    chars.next(); // consume the space
125                    tokens.push(Token::ListItemStart);
126                } else {
127                    tokens.push(Token::Text("-".to_string()));
128                }
129            }
130            '`' => {
131                let mut p = chars.clone();
132                if p.next() == Some('`') && p.next() == Some('`') {
133                    // Consume the ```
134                    chars.next();
135                    chars.next();
136
137                    // Consume optional language specifier, and the rest of the line.
138                    while let Some(c) = chars.peek() {
139                        if *c == '\n' {
140                            break;
141                        }
142                        chars.next();
143                    }
144                    if chars.peek() == Some(&'\n') {
145                        chars.next(); // Consume the newline
146                    }
147
148                    let mut code = String::new();
149                    'code_block: loop {
150                        if let Some('`') = chars.peek() {
151                            let mut p2 = chars.clone();
152                            p2.next(); // `
153                            if let Some('`') = p2.peek() {
154                                p2.next(); // `
155                                if let Some('`') = p2.peek() {
156                                    // Found end fence
157                                    chars.next();
158                                    chars.next();
159                                    chars.next();
160                                    break 'code_block;
161                                }
162                            }
163                        }
164
165                        if let Some(c) = chars.next() {
166                            code.push(c);
167                        } else {
168                            // Unterminated
169                            break 'code_block;
170                        }
171                    }
172                    tokens.push(Token::CodeBlock(code));
173                } else {
174                    tokens.push(Token::Text("`".to_string()));
175                }
176            }
177            _ => {
178                let mut buff = String::new();
179                buff.push(c);
180                while let Some(&next) = chars.peek() {
181                    if next == '#'
182                        || next == '*'
183                        || next == '\n'
184                        || next == '['
185                        || next == '-'
186                        || next == '`'
187                    {
188                        break;
189                    }
190                    buff.push(chars.next().unwrap());
191                }
192                tokens.push(Token::Text(buff));
193            }
194        }
195    }
196    tokens
197}
198
199fn parse(tokens: &[Token]) -> Node {
200    let mut nodes = Vec::new();
201    let mut i = 0;
202    while i < tokens.len() {
203        if let Some(Token::CodeBlock(content)) = tokens.get(i) {
204            nodes.push(Node::CodeBlock(content.clone()));
205            i += 1;
206            continue;
207        }
208
209        let end_of_line = tokens[i..]
210            .iter()
211            .position(|t| *t == Token::NewLine)
212            .map_or(tokens.len(), |p| i + p);
213        let line_tokens = &tokens[i..end_of_line];
214
215        if line_tokens.is_empty() {
216            i = end_of_line + 1;
217            continue;
218        }
219
220        match &line_tokens[0] {
221            Token::Heading(level) => {
222                let content = parse_inlines(&line_tokens[1..]);
223                nodes.push(Node::Heading(*level, content));
224            }
225            Token::ListItemStart => {
226                let mut list_items = Vec::new();
227
228                // First item
229                let item_content = parse_inlines(&line_tokens[1..]);
230                list_items.push(Node::ListItem(item_content));
231                i = end_of_line + 1;
232
233                // Process subsequent list items
234                while i < tokens.len() {
235                    let next_line_end = tokens[i..]
236                        .iter()
237                        .position(|t| *t == Token::NewLine)
238                        .map_or(tokens.len(), |p| i + p);
239                    let next_line_tokens = &tokens[i..next_line_end];
240
241                    if next_line_tokens.is_empty() {
242                        i = next_line_end + 1;
243                        break; // Blank line ends the list
244                    }
245
246                    if let Some(Token::ListItemStart) = next_line_tokens.first() {
247                        let item_content = parse_inlines(&next_line_tokens[1..]);
248                        list_items.push(Node::ListItem(item_content));
249                        i = next_line_end + 1;
250                    } else {
251                        break; // Not a list item, so the list ends
252                    }
253                }
254                nodes.push(Node::UnorderedList(list_items));
255                continue; // Continue the main loop
256            }
257            _ => {
258                let content = parse_inlines(line_tokens);
259                nodes.push(Node::Paragraph(content));
260            }
261        }
262        i = end_of_line + 1;
263    }
264    Node::Document(nodes)
265}
266
267// This is our powerful helper function to handle text styles.
268// It can even handle nesting, like **bold *and* italic**.
269fn parse_inlines(tokens: &[Token]) -> Vec<Node> {
270    let mut nodes = Vec::new();
271    let mut i = 0;
272    while i < tokens.len() {
273        match &tokens[i] {
274            Token::Text(text) => {
275                nodes.push(Node::Text(text.clone()));
276                i += 1;
277            }
278            Token::BoldStart => {
279                i += 1; // Consume BoldStart
280                // Find the matching BoldEnd
281                let end_pos = tokens[i..]
282                    .iter()
283                    .position(|t| matches!(t, Token::BoldEnd))
284                    .map_or(tokens.len(), |pos| i + pos);
285
286                // Recursively parse the content inside the bold tags
287                let inner_nodes = parse_inlines(&tokens[i..end_pos]);
288                nodes.push(Node::Bold(inner_nodes));
289
290                i = end_pos;
291                if i < tokens.len() {
292                    i += 1; // Consume BoldEnd
293                }
294            }
295            Token::ItalicStart => {
296                i += 1; // Consume ItalicStart
297                // Find the matching ItalicEnd
298                let end_pos = tokens[i..]
299                    .iter()
300                    .position(|t| matches!(t, Token::ItalicEnd))
301                    .map_or(tokens.len(), |pos| i + pos);
302
303                // Recursively parse the content inside the italic tags
304                let inner_nodes = parse_inlines(&tokens[i..end_pos]);
305                nodes.push(Node::Italic(inner_nodes));
306
307                i = end_pos;
308                if i < tokens.len() {
309                    i += 1; // Consume ItalicEnd
310                }
311            }
312            Token::Link { text, url } => {
313                nodes.push(Node::Link {
314                    text: text.clone(),
315                    url: url.clone(),
316                });
317                i += 1;
318            }
319            // We shouldn't encounter these here if our block parsing is correct, but we'll skip them.
320            Token::Heading(_)
321            | Token::NewLine
322            | Token::BoldEnd
323            | Token::ItalicEnd
324            | Token::ListItemStart
325            | Token::CodeBlock(_) => {
326                i += 1;
327            }
328        }
329    }
330    nodes
331}
332
333fn render(node: &Node) -> String {
334    match node {
335        Node::Document(children) => children
336            .iter()
337            .map(render)
338            .collect::<Vec<String>>()
339            .join("\n"),
340        Node::Heading(level, children) => {
341            format!("<h{}>{}</h{}>", level, render_all(children), level)
342        }
343        Node::Paragraph(children) => {
344            format!("<p>{}</p>", render_all(children))
345        }
346        Node::Bold(children) => {
347            format!("<strong>{}</strong>", render_all(children))
348        }
349        Node::Italic(children) => {
350            format!("<em>{}</em>", render_all(children))
351        }
352        Node::Text(text) => text
353            .replace("&", "&amp;")
354            .replace("<", "&lt;")
355            .replace(">", "&gt;"),
356        Node::Link { text, url } => {
357            let escaped_text = text
358                .replace("&", "&amp;")
359                .replace("<", "&lt;")
360                .replace(">", "&gt;");
361            format!("<a href=\"{}\">{}</a>", url, escaped_text)
362        }
363        Node::UnorderedList(children) => {
364            let items = children
365                .iter()
366                .map(render)
367                .collect::<Vec<String>>()
368                .join("\n");
369            format!("<ul>\n{}\n</ul>", items)
370        }
371        Node::ListItem(children) => {
372            format!("<li>{}</li>", render_all(children))
373        }
374        Node::CodeBlock(content) => {
375            let escaped_content = content;
376            format!("<pre><code>{}</code></pre>", escaped_content)
377        }
378    }
379}
380
381fn render_all(nodes: &[Node]) -> String {
382    nodes.iter().map(render).collect()
383}