Skip to main content

oxios_markdown/
html.rs

1//! Markdown → Telegram-supported HTML subset converter.
2//!
3//! Ported from files.md (`server/pkg/txt/md.go` lines 262–432, `str.go` lines 122–170)
4//! by Artem Zakirullin.
5//!
6//! Uses parser combinators (open/close/or/and/some) for inline markup.
7//! Supported tags: `*`/`_` → `<i>`, `**`/`__` → `<b>`,
8//! `` ` `` → `<code>`, ` ``` ` → `<pre>`, `#` → `<b>`.
9
10use regex::Regex;
11use std::collections::HashMap;
12use std::rc::Rc;
13
14// ---------------------------------------------------------------------------
15// Public API — utility functions
16// ---------------------------------------------------------------------------
17
/// Escape the three characters that are significant in HTML text.
///
/// `&` becomes `&amp;`, `<` becomes `&lt;` and `>` becomes `&gt;`; every
/// other character is copied through unchanged. Input that already contains
/// entities is escaped again (`&amp;` → `&amp;amp;`).
pub fn escape_html(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
24
/// Strip all HTML tags from a string.
///
/// A tag is everything from a `<` up to and including the next `>`; the
/// span may contain newlines and further `<` characters. A `<` that is never
/// followed by a `>` is not a tag and is kept, together with the rest of the
/// input.
///
/// The scan is done with plain string searches, so no regular expression has
/// to be compiled (and unwrapped) on every call.
pub fn strip_html_tags(s: &str) -> String {
    let mut stripped = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(open) = rest.find('<') {
        match rest[open..].find('>') {
            Some(close) => {
                // Keep the text before the tag, then resume after its `>`.
                stripped.push_str(&rest[..open]);
                rest = &rest[open + close + 1..];
            }
            // No closing `>` anywhere ahead: nothing left to strip.
            None => break,
        }
    }

    stripped.push_str(rest);
    stripped
}
30
31/// Replace regex matches with placeholders, returning the modified string
32/// and a map of placeholder → original.
33pub fn replace_with_placeholders(
34    s: &str,
35    pattern: &str,
36    placeholder: &str,
37) -> (String, HashMap<String, String>) {
38    let re = Regex::new(pattern).unwrap();
39    let mut placeholders = HashMap::new();
40    let mut counter: usize = 0;
41
42    let result = re
43        .replace_all(s, |caps: &regex::Captures<'_>| {
44            let full = caps.get(0).unwrap().as_str().to_string();
45            let ph = format!("#{placeholder}{counter}#");
46            counter += 1;
47            placeholders.insert(ph.clone(), full);
48            ph
49        })
50        .to_string();
51
52    (result, placeholders)
53}
54
/// Substitute every placeholder in `s` with the text recorded for it.
///
/// Each key of `placeholders` is replaced, wherever it occurs, by its mapped
/// value. Entries are applied one after another in the map's iteration
/// order.
pub fn restore_from_placeholders(s: &str, placeholders: &HashMap<String, String>) -> String {
    placeholders
        .iter()
        .fold(s.to_string(), |text, (token, original)| {
            text.replace(token.as_str(), original)
        })
}
63
64// ---------------------------------------------------------------------------
65// Parser-combinator infrastructure
66// ---------------------------------------------------------------------------
67
/// One outcome of running a parser on some input.
///
/// A parser may yield several of these for the same input, one per way it
/// managed to match.
#[derive(Debug, Clone)]
struct ParseResult {
    /// Output produced for the matched part of the input (already
    /// transformed, e.g. a markdown token turned into an HTML tag).
    consumed: String,
    /// The part of the input that was not consumed.
    left: String,
}
75
/// Markdown emphasis tokens paired with the HTML tag that opens the span.
static OPEN_TAGS: &[(&str, &str)] = &[("*", "<i>"), ("**", "<b>"), ("_", "<i>"), ("__", "<b>")];

/// Markdown emphasis tokens paired with the HTML tag that closes the span.
static CLOSE_TAGS: &[(&str, &str)] =
    &[("*", "</i>"), ("**", "</b>"), ("_", "</i>"), ("__", "</b>")];

/// Find `token` in a `(markdown token, html tag)` table.
///
/// Returns the tag of the first matching entry, or `""` when the token is
/// not listed.
fn lookup_tag(table: &'static [(&'static str, &'static str)], token: &str) -> &'static str {
    for &(markdown, html) in table.iter() {
        if markdown == token {
            return html;
        }
    }
    ""
}

/// HTML open tag for a markdown token, or `""` for an unknown token.
fn open_tag(token: &str) -> &'static str {
    lookup_tag(OPEN_TAGS, token)
}

/// HTML close tag for a markdown token, or `""` for an unknown token.
fn close_tag(token: &str) -> &'static str {
    lookup_tag(CLOSE_TAGS, token)
}
98
99/// Using `Rc<dyn Fn>` so that parsers can be cloned (needed for grammar reuse).
100type Parser = Rc<dyn Fn(&str) -> Vec<ParseResult>>;
101
102/// `open(tag)` — recognises the opening markdown token and, on success,
103/// produces the corresponding HTML open tag.
104fn parse_open(token: &'static str) -> Parser {
105    Rc::new(move |input: &str| {
106        if let Some(rest) = input.strip_prefix(token) {
107            vec![ParseResult {
108                consumed: open_tag(token).to_string(),
109                left: rest.to_string(),
110            }]
111        } else {
112            vec![]
113        }
114    })
115}
116
117/// `close(tag)` — recognises the closing markdown token and, on success,
118/// produces the corresponding HTML close tag.
119fn parse_close(token: &'static str) -> Parser {
120    Rc::new(move |input: &str| {
121        if let Some(rest) = input.strip_prefix(token) {
122            vec![ParseResult {
123                consumed: close_tag(token).to_string(),
124                left: rest.to_string(),
125            }]
126        } else {
127            vec![]
128        }
129    })
130}
131
132/// `not_markdown()` — consumes plain text up to the next `*` or `_` character.
133fn parse_not_markdown() -> Parser {
134    Rc::new(|input: &str| {
135        for (i, ch) in input.char_indices() {
136            if ch == '*' || ch == '_' {
137                return vec![ParseResult {
138                    consumed: input[..i].to_string(),
139                    left: input[i..].to_string(),
140                }];
141            }
142        }
143        if !input.is_empty() {
144            vec![ParseResult {
145                consumed: input.to_string(),
146                left: String::new(),
147            }]
148        } else {
149            vec![]
150        }
151    })
152}
153
154/// `or` — try multiple parsers; concatenate all successful results.
155fn parse_or(parsers: Vec<Parser>) -> Parser {
156    Rc::new(move |input: &str| {
157        let mut results = Vec::new();
158        for p in &parsers {
159            results.extend(p(input));
160        }
161        results
162    })
163}
164
165/// `and` — apply parsers in sequence; every parser must consume something.
166fn parse_and(parsers: Vec<Parser>) -> Parser {
167    Rc::new(move |input: &str| {
168        let mut results = vec![ParseResult {
169            consumed: String::new(),
170            left: input.to_string(),
171        }];
172
173        for p in &parsers {
174            let mut new_results = Vec::new();
175            for r in &results {
176                for parsed in p(&r.left) {
177                    if !parsed.consumed.is_empty() {
178                        new_results.push(ParseResult {
179                            consumed: format!("{}{}", r.consumed, parsed.consumed),
180                            left: parsed.left.clone(),
181                        });
182                    }
183                }
184            }
185            if new_results.is_empty() {
186                return vec![];
187            }
188            results = new_results;
189        }
190        results
191    })
192}
193
194/// `some` — apply a parser one or more times (recursive).
195fn parse_some(parser: Parser) -> Parser {
196    Rc::new(move |input: &str| recursive(input, &parser, 0))
197}
198
199fn recursive(input: &str, parser: &Parser, depth: usize) -> Vec<ParseResult> {
200    let mut results = Vec::new();
201    let mut empty = true;
202
203    for item in parser(input) {
204        if item.consumed.is_empty() {
205            continue;
206        }
207        empty = false;
208        for child in recursive(&item.left, parser, depth + 1) {
209            results.push(ParseResult {
210                consumed: format!("{}{}", item.consumed, child.consumed),
211                left: child.left,
212            });
213        }
214    }
215
216    if empty && depth != 0 {
217        results.push(ParseResult {
218            consumed: String::new(),
219            left: input.to_string(),
220        });
221    }
222
223    results
224}
225
226// ---------------------------------------------------------------------------
227// The markdown parser grammar
228// ---------------------------------------------------------------------------
229
230/// Build the top-level inline markdown parser.
231/// Supports one level of nesting for bold/italic.
232fn markdown_parser() -> Parser {
233    // text = notMarkdown
234    let text = parse_not_markdown();
235
236    // italicNoBold = or(
237    //     and(open("*"), text, close("*")),
238    //     and(open("_"), text, close("_")),
239    // )
240    let italic_no_bold = parse_or(vec![
241        parse_and(vec![
242            parse_open("*"),
243            parse_not_markdown(),
244            parse_close("*"),
245        ]),
246        parse_and(vec![
247            parse_open("_"),
248            parse_not_markdown(),
249            parse_close("_"),
250        ]),
251    ]);
252
253    // bold = or(
254    //     and(open("**"), some(or(text, italicNoBold)), close("**")),
255    //     and(open("__"), some(or(text, italicNoBold)), close("__")),
256    // )
257    let bold = parse_or(vec![
258        parse_and(vec![
259            parse_open("**"),
260            parse_some(parse_or(vec![parse_not_markdown(), italic_no_bold.clone()])),
261            parse_close("**"),
262        ]),
263        parse_and(vec![
264            parse_open("__"),
265            parse_some(parse_or(vec![parse_not_markdown(), italic_no_bold])),
266            parse_close("__"),
267        ]),
268    ]);
269
270    // italic = or(
271    //     and(open("*"), some(or(text, bold)), close("*")),
272    //     and(open("_"), some(or(text, bold)), close("_")),
273    // )
274    let italic = parse_or(vec![
275        parse_and(vec![
276            parse_open("*"),
277            parse_some(parse_or(vec![parse_not_markdown(), bold.clone()])),
278            parse_close("*"),
279        ]),
280        parse_and(vec![
281            parse_open("_"),
282            parse_some(parse_or(vec![parse_not_markdown(), bold.clone()])),
283            parse_close("_"),
284        ]),
285    ]);
286
287    // span = or(bold, italic, text)
288    // result = some(span)
289    parse_some(parse_or(vec![bold, italic, text]))
290}
291
292// ---------------------------------------------------------------------------
293// Public API — MarkdownToHTML
294// ---------------------------------------------------------------------------
295
296/// Convert markdown to Telegram-supported HTML subset.
297///
298/// Handles inline `*`/`_` → `<i>`, `**`/`__` → `<b>`, backtick code blocks,
299/// and `#` headers.
300pub fn markdown_to_html(md: &str) -> String {
301    let md_without_code = escape_html(md);
302
303    // Protect code blocks (```...```) and inline code (`...`)
304    let (md_without_code, code_placeholders) =
305        replace_with_placeholders(&md_without_code, r"(?s)```.*?```", "c0debl0ck");
306    let (md_without_code, inline_placeholders) =
307        replace_with_placeholders(&md_without_code, r"`[^`]+`", "inl1ne");
308
309    // Split by double-newline; each segment is parsed independently.
310    let re_newlines = Regex::new(r"\n{2,}").unwrap();
311    let segments = re_newlines.split(&md_without_code);
312    let processed: Vec<String> = segments
313        .map(|segment| {
314            let parser = markdown_parser();
315            let docs = parser(segment);
316            if !docs.is_empty() {
317                format!("{}{}", docs[0].consumed, docs[0].left)
318            } else {
319                segment.to_string()
320            }
321        })
322        .collect();
323    let md_without_code = processed.join("\n\n");
324
325    // Restore code blocks
326    let mut result = restore_from_placeholders(&md_without_code, &code_placeholders);
327    result = restore_from_placeholders(&result, &inline_placeholders);
328
329    // Convert ```...``` → <pre>...</pre>
330    let re_code_block = Regex::new(r"(?s)```(.+?)```").unwrap();
331    result = re_code_block
332        .replace_all(&result, |caps: &regex::Captures<'_>| {
333            let inner = caps.get(1).unwrap().as_str().trim();
334            format!("<pre>{inner}</pre>")
335        })
336        .to_string();
337
338    // Convert `...` → <code>...</code>
339    let re_inline_code = Regex::new(r"`([^`]+?)`").unwrap();
340    result = re_inline_code
341        .replace_all(&result, "<code>$1</code>")
342        .to_string();
343
344    // Convert #+ heading → <b>heading</b>
345    let re_header = Regex::new(r"(?m)^#+\s*(.+)").unwrap();
346    result = re_header.replace_all(&result, "<b>$1</b>").to_string();
347
348    result
349}
350
351// ---------------------------------------------------------------------------
352// Tests
353// ---------------------------------------------------------------------------
354
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `haystack` contains every one of `needles`.
    fn assert_contains_all(haystack: &str, needles: &[&str]) {
        for needle in needles {
            assert!(
                haystack.contains(needle),
                "expected {haystack:?} to contain {needle:?}"
            );
        }
    }

    /// Assert that a parser produced exactly one result and return it.
    fn only_result(results: Vec<ParseResult>) -> ParseResult {
        assert_eq!(results.len(), 1);
        results.into_iter().next().unwrap()
    }

    // --- utility functions -------------------------------------------------

    #[test]
    fn test_escape_html() {
        let cases = [
            ("a & b < c > d", "a &amp; b &lt; c &gt; d"),
            ("plain", "plain"),
        ];
        for (input, expected) in cases {
            assert_eq!(escape_html(input), expected);
        }
    }

    #[test]
    fn test_strip_html_tags() {
        let cases = [
            ("<b>hello</b>", "hello"),
            ("no tags", "no tags"),
            ("<b>bold</b> and <i>italic</i>", "bold and italic"),
        ];
        for (input, expected) in cases {
            assert_eq!(strip_html_tags(input), expected);
        }
    }

    #[test]
    fn test_replace_and_restore_placeholders() {
        let original = "some ```code``` here";
        let (with_placeholders, mapping) =
            replace_with_placeholders(original, r"(?s)```.*?```", "c0de");
        assert!(with_placeholders.contains("c0de"));
        assert_eq!(
            restore_from_placeholders(&with_placeholders, &mapping),
            original
        );
    }

    // --- markdown_to_html --------------------------------------------------

    #[test]
    fn test_markdown_to_html_italic() {
        let html = markdown_to_html("hello *world*");
        assert_contains_all(&html, &["<i>world</i>", "hello"]);
    }

    #[test]
    fn test_markdown_to_html_bold() {
        let html = markdown_to_html("hello **world**");
        assert_contains_all(&html, &["<b>world</b>"]);
    }

    #[test]
    fn test_markdown_to_html_bold_underscore() {
        let html = markdown_to_html("hello __world__");
        assert_contains_all(&html, &["<b>world</b>"]);
    }

    #[test]
    fn test_markdown_to_html_italic_underscore() {
        let html = markdown_to_html("hello _world_");
        assert_contains_all(&html, &["<i>world</i>"]);
    }

    #[test]
    fn test_markdown_to_html_nested_bold_italic() {
        let html = markdown_to_html("**bold *italic* bold**");
        assert_contains_all(&html, &["<b>", "<i>italic</i>", "</b>"]);
    }

    #[test]
    fn test_markdown_to_html_code_block() {
        let html = markdown_to_html("```\ncode\n```");
        assert_contains_all(&html, &["<pre>code</pre>"]);
    }

    #[test]
    fn test_markdown_to_html_inline_code() {
        let html = markdown_to_html("use `foo` here");
        assert_contains_all(&html, &["<code>foo</code>"]);
    }

    #[test]
    fn test_markdown_to_html_header() {
        let html = markdown_to_html("# Title");
        assert_contains_all(&html, &["<b>Title</b>"]);
    }

    #[test]
    fn test_markdown_to_html_header_h3() {
        let html = markdown_to_html("### Subtitle");
        assert_contains_all(&html, &["<b>Subtitle</b>"]);
    }

    #[test]
    fn test_markdown_to_html_plain_text_unchanged() {
        assert_eq!(markdown_to_html("just plain text"), "just plain text");
    }

    #[test]
    fn test_markdown_to_html_html_chars_escaped() {
        let html = markdown_to_html("a < b & c > d");
        assert_contains_all(&html, &["&lt;", "&gt;", "&amp;"]);
    }

    #[test]
    fn test_markdown_to_html_mixed() {
        let html = markdown_to_html("**bold** and *italic* and `code`");
        assert_contains_all(
            &html,
            &["<b>bold</b>", "<i>italic</i>", "<code>code</code>"],
        );
    }

    // --- parser combinators ------------------------------------------------

    #[test]
    fn test_parser_not_markdown() {
        let parsed = only_result(parse_not_markdown()("hello*world"));
        assert_eq!(parsed.consumed, "hello");
        assert_eq!(parsed.left, "*world");
    }

    #[test]
    fn test_parser_not_markdown_no_special() {
        let parsed = only_result(parse_not_markdown()("hello world"));
        assert_eq!(parsed.consumed, "hello world");
        assert_eq!(parsed.left, "");
    }

    #[test]
    fn test_parser_open_close() {
        let opened = only_result(parse_open("**")("**bold**"));
        assert_eq!(opened.consumed, "<b>");
        assert_eq!(opened.left, "bold**");

        let closed = only_result(parse_close("**")("**rest"));
        assert_eq!(closed.consumed, "</b>");
        assert_eq!(closed.left, "rest");
    }

    #[test]
    fn test_parser_and() {
        let italic = parse_and(vec![
            parse_open("*"),
            parse_not_markdown(),
            parse_close("*"),
        ]);
        let results = italic("*hello*");
        let first = results.first().expect("sequence should match `*hello*`");
        assert_eq!(first.consumed, "<i>hello</i>");
    }
}