telegram_escape/
lib.rs

1use std::{borrow::Cow, sync::LazyLock};
2
3use pulldown_cmark::{Event, Options as DeOptions, Parser, Tag, TagEnd};
4use pulldown_cmark_to_cmark::Options as SerOptions;
5use regex::Regex;
6
/// Expands to a `LazyLock` whose initializer compiles the given regex literal.
///
/// The pattern is compiled on first access; an invalid pattern panics at that point.
macro_rules! regex {
    ($pattern:literal $(,)?) => {
        ::std::sync::LazyLock::new(|| ::regex::Regex::new($pattern).unwrap())
    };
}
12
13static TG_MD_ESCAPE_REGEX: LazyLock<Regex> = regex!(r"[_*\[\]()~`>#+\-=|{}\.!\\]");
14static TG_MD_CODE_ESCAPE_REGEX: LazyLock<Regex> = regex!(r"[`\\]");
15static TG_MD_SERIALIZE_OPTIONS: LazyLock<SerOptions> = LazyLock::new(|| SerOptions {
16    code_block_token_count: 3,
17    ..Default::default()
18});
19// _*[]()~`>#+-=|{}.!\
20
21/// Escapes given text, abiding Telegram flavoured Markdown
22/// [rules](https://core.telegram.org/bots/api#formatting-options).
23pub fn tg_escape(text: &str) -> String {
24    let mut options = DeOptions::empty();
25    options.insert(DeOptions::ENABLE_STRIKETHROUGH);
26
27    let mut inside_code = false;
28
29    let parser = Parser::new_ext(text, options).map(|event| {
30        match &event {
31            Event::Start(Tag::CodeBlock(_)) => {
32                inside_code = true;
33
34                event
35            }
36            Event::End(TagEnd::CodeBlock) => {
37                inside_code = false;
38
39                event
40            }
41            Event::Text(text) | Event::Code(text) => {
42                if text.len() == 1 {
43                    // pulldown-cmark-to-cmark escapes single characters properly on it's own, aside
44                    return event;
45                }
46
47                let re = if inside_code || matches!(&event, Event::Code(_)) {
48                    &TG_MD_CODE_ESCAPE_REGEX
49                } else {
50                    &TG_MD_ESCAPE_REGEX
51                };
52
53                // manual COW implementation...
54                let replaced = re.replace_all(text, r"\$0");
55
56                match replaced {
57                    Cow::Borrowed(_) => event,
58                    Cow::Owned(text) => match event {
59                        Event::Text(_) => Event::Text(text.into()),
60                        Event::Code(_) => Event::Code(text.into()),
61                        _ => unreachable!(),
62                    },
63                }
64            }
65            _ => event,
66        }
67    });
68
69    let mut res = String::with_capacity(text.len());
70
71    pulldown_cmark_to_cmark::cmark_with_options(parser, &mut res, TG_MD_SERIALIZE_OPTIONS.clone())
72        .expect("writing to string failed!");
73
74    res
75}
76
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Asserts that escaping `raw` produces exactly `want`.
    #[track_caller]
    fn check(raw: &str, want: &str) {
        assert_eq!(tg_escape(raw), want);
    }

    #[test]
    fn test_md_escape() {
        check(
            "Soon you'll get a stats for today, and the overall status can be viewed by the /get_stat command :)",
            r#"Soon you'll get a stats for today, and the overall status can be viewed by the /get\_stat command :\)"#,
        );
    }

    #[test]
    fn test_escape_outside_code_all_specials() {
        // Outside code every MarkdownV2 special character is expected to be escaped
        // (brackets and parentheses are left out to avoid link syntax).
        check(
            r#"a_*~`>#+-=|{}.!\x"#,
            r"a\_\*\~\`\\>\#\+\-\=\|\{\}\.\!\\x",
        );
    }

    #[test]
    fn test_inline_code_escapes_only_backtick_and_backslash() {
        // Within an inline code span only the backtick and the backslash are escaped.
        check(
            r#"Before `a_*~>#+-=|{}.!\` after"#,
            r#"Before `a_*~>#+-=|{}.!\\` after"#,
        );
    }

    #[test]
    fn test_code_block_escapes_only_backtick_and_backslash() {
        // Within a code block only the backtick and the backslash are escaped.
        check(
            r#"```
a_*[]()~`>#+-=|{}.!\
```"#,
            r#"
```
a_*[]()~\`>#+-=|{}.!\\
```"#,
        );
    }

    #[test]
    fn test_mixed_multiple_inline_code_segments() {
        check(
            r#"pre_* `codeA_*` mid_* `codeB_\` post_*"#,
            r#"pre\_\* `codeA_*` mid\_\* `codeB_\\` post\_\*"#,
        );
    }

    #[test]
    fn test_emphasis_around_text_with_inline_code() {
        check(
            r#"*start* `inside_*` end_*"#,
            r#"*start* `inside_*` end\_\*"#,
        );
    }

    #[test]
    #[ignore = "this test is failing"]
    fn test_escaped_characters() {
        check(
            r"Escaped characters: \\ \* \_ \[ \] \( \) \~",
            r"Escaped characters: \\\\ \\\* \\\_ \\\[ \\\] \\\( \\\) \\\~",
        );
    }

    #[test]
    #[ignore = "this test is failing"]
    fn test_math_expressions() {
        check(
            r"Mathematical expressions: 2 + 2 = 4, x > y, a <= b",
            r"Mathematical expressions: 2 \+ 2 \= 4, x \> y, a \<\= b",
        );
    }
}