telegram_escape/
lib.rs

/// Characters that Telegram MarkdownV2 reserves for markup; each one needs a
/// preceding `\` when it appears in ordinary text.
///
/// Source of truth: <https://core.telegram.org/bots/api#markdownv2-style>
const TG_SPECIAL_CHARS: &[char] = &[
    // inline formatting, links and code
    '_', '*', '[', ']', '(', ')', '~', '`',
    // remaining reserved punctuation
    '>', '#', '+', '-', '=', '|', '{', '}', '.', '!',
    // the escape character itself
    '\\',
];

/// ASCII-indexed membership table for [`TG_SPECIAL_CHARS`], evaluated at compile time.
///
/// Entry `n` is `true` exactly when the character with code point `n` is listed
/// in [`TG_SPECIAL_CHARS`]. A non-ASCII entry in that list would index out of
/// bounds and fail constant evaluation.
const TG_SPECIAL: [bool; 128] = {
    let mut flags = [false; 128];
    // `for` loops are not available in const context, hence the manual countdown.
    let mut remaining = TG_SPECIAL_CHARS.len();
    while remaining > 0 {
        remaining -= 1;
        flags[TG_SPECIAL_CHARS[remaining] as usize] = true;
    }
    flags
};

/// Returns `true` if `c` is a Telegram MarkdownV2 special character.
///
/// Code points outside the table (everything non-ASCII) are never special.
fn is_tg_special(c: char) -> bool {
    TG_SPECIAL.get(c as usize).copied().unwrap_or(false)
}
24
/// Append `c` to `out` using the code-context escaping rules: only `` ` `` and
/// `\` receive a leading `\`; every other character is copied unchanged.
fn push_code_escaped(out: &mut String, c: char) {
    if matches!(c, '`' | '\\') {
        out.push('\\');
    }
    out.push(c);
}
32
33// ---------------------------------------------------------------------------
34// Finding helpers (work with slices, return relative offsets)
35// ---------------------------------------------------------------------------
36
/// Locate the end of a fenced code block.
///
/// `after_opening` is the text immediately following the opening `` ``` ``.
/// The opening line must be terminated by a newline; the closing fence is the
/// first `` ``` `` that starts a line and is followed by either a newline or the
/// end of the text.
///
/// Returns the number of bytes of `after_opening` that belong to the block,
/// closing `` ``` `` included, or `None` when no such fence exists.
fn find_code_block_end(after_opening: &str) -> Option<usize> {
    const LINE_FENCE: &str = "\n```";

    // Fences are only searched for from the end of the opening line onwards.
    let first_newline = after_opening.find('\n')?;

    after_opening[first_newline..]
        .match_indices(LINE_FENCE)
        .map(|(offset, fence)| first_newline + offset + fence.len())
        .find(|&end| {
            let tail = &after_opening[end..];
            tail.is_empty() || tail.starts_with('\n')
        })
}
52
53/// Find the position of a closing delimiter in `content`.
54/// Returns the byte offset relative to `content`, or `None`.
55///
56/// Skips over:
57/// - already-escaped characters (`\X`)
58/// - inline code spans (`` `...` ``)
59/// - code blocks (`` ```...``` ``)
60fn find_closing(content: &str, delim: &str) -> Option<usize> {
61    let mut i = 0;
62
63    while i < content.len() {
64        let ch = content[i..].chars().next().unwrap();
65
66        // Skip already-escaped characters
67        if ch == '\\'
68            && let Some(next_ch) = content.get(i + 1..).and_then(|s| s.chars().next())
69            && is_tg_special(next_ch)
70        {
71            i += 1 + next_ch.len_utf8();
72            continue;
73        }
74
75        // Skip code blocks
76        if content[i..].starts_with("```")
77            && let Some(end) = find_code_block_end(&content[i + 3..])
78        {
79            i += 3 + end;
80            continue;
81        }
82
83        // Skip inline code
84        if ch == '`'
85            && let Some(pos) = content[i + 1..].find('`')
86        {
87            i += pos + 2; // past both backticks
88            continue;
89        }
90
91        // Check for closing delimiter
92        if content[i..].starts_with(delim) {
93            return Some(i);
94        }
95
96        i += ch.len_utf8();
97    }
98
99    None
100}
101
102// ---------------------------------------------------------------------------
103// Inline formatting delimiter table
104// ---------------------------------------------------------------------------
105
/// Edge-case guard for a formatting delimiter.
#[derive(Clone, Copy, PartialEq, Eq)]
enum DelimiterGuard {
    /// No special handling.
    None,
    /// Reject if the opening is immediately followed by an extra copy of the
    /// delimiter's first character.  Prevents `__` from greedily matching `___`
    /// (underline eating into italic).
    RejectTripled,
    /// Reject a closing delimiter that would put two copies of the delimiter
    /// character side by side in the output: either the close is directly
    /// followed by the same character, or the span is empty so the close
    /// touches the opening.  Prevents single `_` (italic) from producing or
    /// matching a `__`, which Telegram reads as underline.
    RejectDoubledClose,
}

/// One inline formatting delimiter together with its edge-case guard.
struct InlineDelimiter {
    /// Literal marker that both opens and closes the span (e.g. `"__"`).
    delim: &'static str,
    /// Extra acceptance rule applied on top of plain delimiter matching.
    guard: DelimiterGuard,
}

impl InlineDelimiter {
    /// Returns `true` if the opening context rejects this match.
    ///
    /// `after_open` is the text immediately following the opening delimiter.
    fn open_rejected(&self, after_open: &str) -> bool {
        match self.guard {
            DelimiterGuard::RejectTripled => after_open.starts_with(&self.delim[..1]),
            DelimiterGuard::None | DelimiterGuard::RejectDoubledClose => false,
        }
    }

    /// Returns `true` if the closing position should be rejected.
    ///
    /// `after_open` is the text following the opening delimiter and `close_pos`
    /// is the byte offset of the candidate closing delimiter within it.
    fn close_rejected(&self, after_open: &str, close_pos: usize) -> bool {
        match self.guard {
            DelimiterGuard::RejectDoubledClose => {
                let marker = self.delim.as_bytes()[0];
                let after_close = close_pos + self.delim.len();

                // An empty span (`__`) would be emitted as two adjacent
                // markers, i.e. an underline opener that is never closed.
                let touches_opening = close_pos == 0;
                // `_text__...`: this `_` is the first half of a `__`.
                let followed_by_marker = after_open.as_bytes().get(after_close) == Some(&marker);

                // The byte *before* the close is deliberately not inspected:
                // the candidate is the first unescaped marker, so a marker
                // right before it can only be an escaped `\_`, which is
                // literal text and does not turn the close into `__`.
                touches_opening || followed_by_marker
            }
            DelimiterGuard::None | DelimiterGuard::RejectTripled => false,
        }
    }
}
148
149/// Inline formatting delimiters, checked **in order**.
150///
151/// Multi-character delimiters must precede their single-character subsets
152/// (e.g. `||` before `|`, `__` before `_`).
153const INLINE_DELIMITERS: &[InlineDelimiter] = &[
154    InlineDelimiter {
155        delim: "||",
156        guard: DelimiterGuard::None,
157    }, // spoiler
158    InlineDelimiter {
159        delim: "__",
160        guard: DelimiterGuard::RejectTripled,
161    }, // underline
162    InlineDelimiter {
163        delim: "*",
164        guard: DelimiterGuard::None,
165    }, // bold
166    InlineDelimiter {
167        delim: "_",
168        guard: DelimiterGuard::RejectDoubledClose,
169    }, // italic
170    InlineDelimiter {
171        delim: "~",
172        guard: DelimiterGuard::None,
173    }, // strikethrough
174];
175
176// ---------------------------------------------------------------------------
177// Fragment: parsed piece of the input
178// ---------------------------------------------------------------------------
179
/// One piece of the input as recognised by the fragment parsers.
///
/// String payloads borrow from the input text.
enum Fragment<'a> {
    /// A character that was already preceded by `\` in the input (e.g. `\*`);
    /// holds the character after the backslash.
    Escaped(char),
    /// Text found between `` ``` `` fences, fences excluded.
    CodeBlock(&'a str),
    /// Text found between single `` ` `` markers, markers excluded.
    InlineCode(&'a str),
    /// Inline link: the bracketed text and the parenthesised URL.
    Link { text: &'a str, url: &'a str },
    /// Span wrapped in a formatting delimiter (e.g. `*bold*`); `content`
    /// excludes the delimiters.
    Formatted { delim: &'static str, content: &'a str },
    /// Any other single character; escaped on output if it is special.
    Plain(char),
}
198
199impl Fragment<'_> {
200    fn render(&self, out: &mut String) {
201        match self {
202            Self::Escaped(c) => {
203                out.push('\\');
204                out.push(*c);
205            }
206            Self::CodeBlock(content) => {
207                out.push_str("```");
208                for c in content.chars() {
209                    push_code_escaped(out, c);
210                }
211                out.push_str("```");
212            }
213            Self::InlineCode(content) => {
214                out.push('`');
215                for c in content.chars() {
216                    push_code_escaped(out, c);
217                }
218                out.push('`');
219            }
220            Self::Link { text, url } => {
221                out.push('[');
222                out.push_str(&tg_escape(text));
223                out.push_str("](");
224                out.push_str(url);
225                out.push(')');
226            }
227            Self::Formatted { delim, content } => {
228                out.push_str(delim);
229                out.push_str(&tg_escape(content));
230                out.push_str(delim);
231            }
232            Self::Plain(c) => {
233                if is_tg_special(*c) {
234                    out.push('\\');
235                }
236                out.push(*c);
237            }
238        }
239    }
240}
241
242// ---------------------------------------------------------------------------
243// Fragment parsers — each returns `Some` and advances `input` on success,
244// or returns `None` leaving `input` unchanged.
245// ---------------------------------------------------------------------------
246
247fn try_escaped_char<'a>(input: &mut &'a str) -> Option<Fragment<'a>> {
248    let rest = *input;
249    let mut chars = rest.chars();
250    if chars.next()? != '\\' {
251        return None;
252    }
253    let next = chars.next().filter(|c| is_tg_special(*c))?;
254    *input = &rest[1 + next.len_utf8()..];
255    Some(Fragment::Escaped(next))
256}
257
258fn try_code_block<'a>(input: &mut &'a str) -> Option<Fragment<'a>> {
259    let rest = *input;
260    let after_opening = rest.strip_prefix("```")?;
261    let end = find_code_block_end(after_opening)?;
262    let content = &after_opening[..end - 3]; // everything before closing ```
263    *input = &after_opening[end..];
264    Some(Fragment::CodeBlock(content))
265}
266
267fn try_inline_code<'a>(input: &mut &'a str) -> Option<Fragment<'a>> {
268    let rest = *input;
269    let after_backtick = rest.strip_prefix('`')?;
270    let close = after_backtick.find('`')?;
271    let content = &after_backtick[..close];
272    *input = &after_backtick[close + 1..];
273    Some(Fragment::InlineCode(content))
274}
275
276fn try_link<'a>(input: &mut &'a str) -> Option<Fragment<'a>> {
277    let rest = *input;
278    let after_bracket = rest.strip_prefix('[')?;
279
280    let bracket_close = find_closing(after_bracket, "]")?;
281    let after_text = after_bracket[bracket_close + 1..].strip_prefix('(')?;
282    let paren_close = after_text.find(')')?;
283
284    let text = &after_bracket[..bracket_close];
285    let url = &after_text[..paren_close];
286    *input = &after_text[paren_close + 1..];
287    Some(Fragment::Link { text, url })
288}
289
290fn try_formatting<'a>(input: &mut &'a str) -> Option<Fragment<'a>> {
291    let rest = *input;
292
293    for d in INLINE_DELIMITERS {
294        if !rest.starts_with(d.delim) {
295            continue;
296        }
297
298        let len = d.delim.len();
299        let after_open = &rest[len..];
300
301        if d.open_rejected(after_open) {
302            continue;
303        }
304
305        let Some(close) = find_closing(after_open, d.delim) else {
306            continue;
307        };
308
309        if d.close_rejected(after_open, close) {
310            continue;
311        }
312
313        let content = &after_open[..close];
314        *input = &after_open[close + len..];
315        return Some(Fragment::Formatted {
316            delim: d.delim,
317            content,
318        });
319    }
320
321    None
322}
323
324/// Parse the next fragment from `input`, advancing past it.
325fn next_fragment<'a>(input: &mut &'a str) -> Fragment<'a> {
326    if let Some(f) = try_escaped_char(input) {
327        return f;
328    }
329    if let Some(f) = try_code_block(input) {
330        return f;
331    }
332    if let Some(f) = try_inline_code(input) {
333        return f;
334    }
335    if let Some(f) = try_link(input) {
336        return f;
337    }
338    if let Some(f) = try_formatting(input) {
339        return f;
340    }
341
342    let ch = input.chars().next().unwrap();
343    *input = &input[ch.len_utf8()..];
344    Fragment::Plain(ch)
345}
346
347/// Escapes given text, abiding Telegram flavoured Markdown
348/// [rules](https://core.telegram.org/bots/api#formatting-options).
349///
350/// Preserves Telegram MarkdownV2 formatting constructs:
351/// - `*bold*`, `_italic_`, `__underline__`, `~strikethrough~`, `||spoiler||`
352/// - `` `inline code` ``, ` ```code block``` `
353/// - `[link text](url)`
354/// - Already-escaped characters like `\*` are passed through
355///
356/// All other special characters in regular text are escaped with `\`.
357/// Inside code spans/blocks, only `` ` `` and `\` are escaped.
358pub fn tg_escape(text: &str) -> String {
359    let mut out = String::with_capacity(text.len());
360    let mut input = text;
361
362    while !input.is_empty() {
363        next_fragment(&mut input).render(&mut out);
364    }
365
366    out
367}
368
// Python bindings built with PyO3; compiled only when the `python` feature is
// enabled.
#[cfg(feature = "python")]
mod python {
    use pyo3::prelude::*;

    /// Escape text for Telegram's MarkdownV2 formatting.
    ///
    /// Applies context-aware escaping:
    /// - In regular text: escapes ``_*[]()~`>#+-=|{}.!\\`` characters
    /// - In code blocks and inline code: only escapes `` ` `` and ``\\`` characters
    /// - Preserves formatting: ``*bold*``, ``_italic_``, ``~strike~``, etc.
    #[pyfunction]
    fn tg_escape(text: &str) -> String {
        // Thin wrapper: all work is done by the crate-level `tg_escape`.
        super::tg_escape(text)
    }

    // Module initialiser: registers `tg_escape` as the module's only function.
    // The `#[pyo3(name = "_core")]` attribute names the Python module `_core`
    // (presumably re-exported by a Python package wrapper — not visible here).
    #[pymodule]
    #[pyo3(name = "_core")]
    fn telegram_escape_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
        m.add_function(wrap_pyfunction!(tg_escape, m)?)?;
        Ok(())
    }
}
391
// Unit tests for `tg_escape` and for the ordering of `INLINE_DELIMITERS`.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // --- Escaping of plain text and code ---

    #[test]
    fn test_md_escape() {
        // A lone `_` inside a word and an unmatched `)` are both escaped.
        assert_eq!(
            tg_escape(
                "Soon you'll get a stats for today, and the overall status can be viewed by the /get_stat command :)"
            ),
            r#"Soon you'll get a stats for today, and the overall status can be viewed by the /get\_stat command :\)"#
        )
    }

    #[test]
    fn test_escape_outside_code_all_specials() {
        // None of the special characters here has a matching partner, so no
        // formatting is recognised and every one of them is escaped.
        let input = r#"a_*~`>#+-=|{}.!\x"#;
        let expected = r"a\_\*\~\`\>\#\+\-\=\|\{\}\.\!\\x";

        assert_eq!(tg_escape(input), expected);
    }

    #[test]
    fn test_inline_code_escapes_only_backtick_and_backslash() {
        // Inside inline code only ` and \ are escaped; here the trailing `\`
        // of the span becomes `\\`.
        let input = r#"Before `a_*~>#+-=|{}.!\` after"#;
        let expected = r#"Before `a_*~>#+-=|{}.!\\` after"#;

        assert_eq!(tg_escape(input), expected);
    }

    #[test]
    fn test_code_block_escapes_only_backtick_and_backslash() {
        // Inside code blocks only ` and \ are escaped.
        let input = "```\na_*[]()~`>#+-=|{}.!\\\n```";
        let expected = "```\na_*[]()~\\`>#+-=|{}.!\\\\\n```";

        assert_eq!(tg_escape(input), expected);
    }

    #[test]
    fn test_mixed_multiple_inline_code_segments() {
        // Delimiter matching skips over code spans, so two spans are found:
        // an italic one from `pre_` to `mid_` (its inner `*` is unmatched and
        // escaped) and a bold one from `mid_*` to `post_*` (its inner `_` is
        // unmatched and escaped). Code span contents only get `\` -> `\\`.
        let input = r#"pre_* `codeA_*` mid_* `codeB_\` post_*"#;
        let expected = r#"pre_\* `codeA_*` mid_* `codeB_\\` post\_*"#;

        assert_eq!(tg_escape(input), expected);
    }

    #[test]
    fn test_emphasis_around_text_with_inline_code() {
        // The `_*` inside the code span cannot close anything, so the trailing
        // `_` and `*` stay unmatched and are escaped.
        let input = r#"*start* `inside_*` end_*"#;
        let expected = r#"*start* `inside_*` end\_\*"#;

        assert_eq!(tg_escape(input), expected);
    }

    #[test]
    fn test_escaped_characters() {
        // Already-escaped characters are preserved verbatim
        let input = r"Escaped characters: \\ \* \_ \[ \] \( \) \~";
        let expected = r"Escaped characters: \\ \* \_ \[ \] \( \) \~";

        assert_eq!(tg_escape(input), expected);
    }

    #[test]
    fn test_math_expressions() {
        // '<' is not a Telegram MarkdownV2 reserved character, so it is not escaped.
        let input = r"Mathematical expressions: 2 + 2 = 4, x > y, a <= b";
        let expected = r"Mathematical expressions: 2 \+ 2 \= 4, x \> y, a <\= b";

        assert_eq!(tg_escape(input), expected);
    }

    // --- Formatting preservation ---

    #[test]
    fn test_bold_preserved() {
        assert_eq!(tg_escape("*bold*"), "*bold*");
    }

    #[test]
    fn test_italic_preserved() {
        assert_eq!(tg_escape("_italic_"), "_italic_");
    }

    #[test]
    fn test_underline_preserved() {
        assert_eq!(tg_escape("__underline__"), "__underline__");
    }

    #[test]
    fn test_strikethrough_preserved() {
        assert_eq!(tg_escape("~strikethrough~"), "~strikethrough~");
    }

    #[test]
    fn test_spoiler_preserved() {
        assert_eq!(tg_escape("||spoiler||"), "||spoiler||");
    }

    #[test]
    fn test_link_preserved() {
        // The URL is copied through unchanged, including its `.` characters.
        assert_eq!(
            tg_escape("[Click here](https://example.com)"),
            "[Click here](https://example.com)"
        );
    }

    #[test]
    fn test_link_text_escaped() {
        // Link text goes through the normal escaping rules; the URL does not.
        assert_eq!(
            tg_escape("[click + go](https://example.com)"),
            r"[click \+ go](https://example.com)"
        );
    }

    #[test]
    fn test_nested_formatting() {
        assert_eq!(tg_escape("*bold _italic_ bold*"), "*bold _italic_ bold*");
    }

    #[test]
    fn test_bold_with_special_chars() {
        assert_eq!(tg_escape("hello *world*!"), r"hello *world*\!");
    }

    #[test]
    fn test_mixed_formatting_and_plain() {
        assert_eq!(
            tg_escape("hello *world* and _stuff_!"),
            r"hello *world* and _stuff_\!"
        );
    }

    #[test]
    fn test_code_block_with_language() {
        // The language tag and the code contain no ` or \, so nothing changes.
        let input = "```rust\nfn main() {}\n```";
        let expected = "```rust\nfn main() {}\n```";
        assert_eq!(tg_escape(input), expected);
    }

    // --- Edge cases ---

    #[test]
    fn test_empty_string() {
        assert_eq!(tg_escape(""), "");
    }

    #[test]
    fn test_no_special_chars() {
        assert_eq!(tg_escape("hello world"), "hello world");
    }

    #[test]
    fn test_unmatched_bold() {
        assert_eq!(tg_escape("price is 5*3"), r"price is 5\*3");
    }

    #[test]
    fn test_unmatched_italic() {
        assert_eq!(tg_escape("file_name"), r"file\_name");
    }

    #[test]
    fn test_unmatched_backtick() {
        assert_eq!(tg_escape("it's a `test"), r"it's a \`test");
    }

    #[test]
    fn test_adjacent_formatting() {
        assert_eq!(tg_escape("*bold*_italic_"), "*bold*_italic_");
    }

    #[test]
    fn test_formatting_with_special_inside() {
        // Specials inside a formatted span are escaped; the delimiters are kept.
        assert_eq!(tg_escape("*2+2=4*"), r"*2\+2\=4*");
    }

    #[test]
    fn test_multiple_newlines() {
        assert_eq!(tg_escape("a\n\nb"), "a\n\nb");
    }

    #[test]
    fn test_non_special_chars_pass_through() {
        // < @ / : ; are NOT Telegram MarkdownV2 special chars
        assert_eq!(tg_escape("a < b @ c / d : e ; f"), "a < b @ c / d : e ; f");
    }

    #[test]
    fn test_code_block_with_backticks_inside() {
        // Single backticks inside a code block are content and get escaped.
        let input = "```\nsome `code` here\n```";
        let expected = "```\nsome \\`code\\` here\n```";
        assert_eq!(tg_escape(input), expected);
    }

    #[test]
    fn test_link_with_formatted_text() {
        assert_eq!(
            tg_escape("[*bold link*](https://example.com)"),
            "[*bold link*](https://example.com)"
        );
    }

    #[test]
    fn test_unmatched_bracket_not_link() {
        assert_eq!(tg_escape("[not a link"), r"\[not a link");
    }

    #[test]
    fn test_bracket_without_paren() {
        // `]` must be followed directly by `(` for the text to count as a link.
        assert_eq!(tg_escape("[text] no url"), r"\[text\] no url");
    }

    #[test]
    fn test_spoiler_with_special_inside() {
        assert_eq!(tg_escape("||secret!||"), r"||secret\!||");
    }

    #[test]
    fn test_underline_vs_italic() {
        // A `__` pair is underline, not doubled italic.
        assert_eq!(tg_escape("__underline__"), "__underline__");
        // A single `_` pair is italic.
        assert_eq!(tg_escape("_italic_"), "_italic_");
    }

    #[test]
    fn test_escaped_delimiter_not_matched() {
        // \* should not be treated as a bold delimiter
        assert_eq!(tg_escape(r"\*not bold\*"), r"\*not bold\*");
    }

    #[test]
    fn test_backslash_before_non_special() {
        // \ before a non-special char: the \ itself is special and gets escaped
        assert_eq!(tg_escape(r"\n"), r"\\n");
    }

    #[test]
    fn test_consecutive_specials() {
        assert_eq!(tg_escape("()[]{}"), r"\(\)\[\]\{\}");
    }

    #[test]
    fn test_cyrillic_text() {
        // Multi-byte characters pass through and do not disturb delimiter offsets.
        assert_eq!(tg_escape("НОВЫЙ"), "НОВЫЙ");
        assert_eq!(tg_escape("Привет мир"), "Привет мир");
        assert_eq!(tg_escape("Привет *мир*!"), r"Привет *мир*\!");
    }

    #[test]
    fn test_multibyte_in_code() {
        assert_eq!(tg_escape("`код`"), "`код`");
        assert_eq!(tg_escape("```\nкод\n```"), "```\nкод\n```");
    }

    #[test]
    fn test_delimiter_ordering_invariant() {
        // A shorter delimiter must not precede a longer one it is a prefix of,
        // otherwise the shorter one would greedily consume the longer one's opening.
        // Checks every ordered pair (a before b) in the table.
        for (i, a) in INLINE_DELIMITERS.iter().enumerate() {
            for b in &INLINE_DELIMITERS[i + 1..] {
                assert!(
                    !b.delim.starts_with(a.delim),
                    "'{0}' is a prefix of '{1}' but comes before it — \
                     multi-char delimiters must precede their subsets",
                    a.delim,
                    b.delim,
                );
            }
        }
    }
}
telegram_escape/lib.rs

telegram_escape/
lib.rs