Skip to main content

layer_client/
parsers.rs

1//! Text formatting parsers: HTML and Markdown ↔ Telegram [`MessageEntity`]
2//!
3//! # Markdown (Telegram-flavoured)
4//! ## Parsing (`parse_markdown`)
5//! | Syntax | Entity |
6//! |--------|--------|
7//! | `**bold**` or `*bold*` | Bold |
8//! | `__italic__` or `_italic_` | Italic |
9//! | `~~strike~~` | Strikethrough |
10//! | `\|\|spoiler\|\|` | Spoiler |
11//! | `` `code` `` | Code |
12//! | ` ```lang\npre\n``` ` | Pre (code block) |
13//! | `[text](url)` | TextUrl |
14//! | `[text](tg://user?id=123)` | MentionName |
15//! | `![text](tg://emoji?id=123)` | CustomEmoji |
16//! | `\*`, `\_`, `\~` … | Escaped literal char |
17//!
18//! ## Generating (`generate_markdown`)
19//! Produces the same syntax above for all supported entity types.
20//! `Underline` has no unambiguous markdown delimiter and is silently skipped.
21//!
22//! # HTML
23//! Supported tags: `<b>`, `<strong>`, `<i>`, `<em>`, `<u>`, `<s>`, `<del>`,
24//! `<strike>`, `<code>`, `<pre>`, `<tg-spoiler>`, `<a href="url">`,
25//! `<tg-emoji emoji-id="id">text</tg-emoji>`; `<br>` is converted to a newline.
26//!
27//! # Feature gates
28//! * `html`     : enables `parse_html` / `generate_html` via the built-in hand-rolled
29//! parser (zero extra deps).
30//! * `html5ever`: replaces `parse_html` with a spec-compliant html5ever tokenizer.
31//! `generate_html` is always the same hand-rolled generator.
32
33use layer_tl_types as tl;
34
35// Markdown
36
37/// Parse Telegram-flavoured markdown into (plain_text, entities).
38pub fn parse_markdown(text: &str) -> (String, Vec<tl::enums::MessageEntity>) {
39    let mut out = String::with_capacity(text.len());
40    let mut ents = Vec::new();
41    let chars: Vec<char> = text.chars().collect();
42    let n = chars.len();
43    let mut i = 0;
44    let mut open_stack: Vec<(MarkdownTag, i32)> = Vec::new();
45    let mut utf16_off: i32 = 0;
46
47    macro_rules! push_char {
48        ($c:expr) => {{
49            let c: char = $c;
50            out.push(c);
51            utf16_off += c.len_utf16() as i32;
52        }};
53    }
54
55    while i < n {
56        // backslash escape: \X → literal X (for any special char)
57        if chars[i] == '\\' && i + 1 < n {
58            let next = chars[i + 1];
59            if matches!(
60                next,
61                '*' | '_' | '~' | '|' | '[' | ']' | '(' | ')' | '`' | '\\' | '!'
62            ) {
63                push_char!(next);
64                i += 2;
65                continue;
66            }
67        }
68
69        // code block: ```lang\n...```
70        if i + 2 < n && chars[i] == '`' && chars[i + 1] == '`' && chars[i + 2] == '`' {
71            let start = i + 3;
72            let mut j = start;
73            while j + 2 < n {
74                if chars[j] == '`' && chars[j + 1] == '`' && chars[j + 2] == '`' {
75                    break;
76                }
77                j += 1;
78            }
79            if j + 2 < n {
80                let block: String = chars[start..j].iter().collect();
81                let (lang, code) = if let Some(nl) = block.find('\n') {
82                    (block[..nl].trim().to_string(), block[nl + 1..].to_string())
83                } else {
84                    (String::new(), block)
85                };
86                let code_off = utf16_off;
87                let code_utf16: i32 = code.encode_utf16().count() as i32;
88                ents.push(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre {
89                    offset: code_off,
90                    length: code_utf16,
91                    language: lang,
92                }));
93                for c in code.chars() {
94                    push_char!(c);
95                }
96                i = j + 3;
97                continue;
98            }
99        }
100
101        // inline code: `code`
102        if chars[i] == '`' {
103            let start = i + 1;
104            let mut j = start;
105            while j < n && chars[j] != '`' {
106                j += 1;
107            }
108            if j < n {
109                let code: String = chars[start..j].iter().collect();
110                let code_off = utf16_off;
111                let code_utf16: i32 = code.encode_utf16().count() as i32;
112                ents.push(tl::enums::MessageEntity::Code(
113                    tl::types::MessageEntityCode {
114                        offset: code_off,
115                        length: code_utf16,
116                    },
117                ));
118                for c in code.chars() {
119                    push_char!(c);
120                }
121                i = j + 1;
122                continue;
123            }
124        }
125
126        // custom emoji: ![text](tg://emoji?id=12345)
127        if chars[i] == '!' && i + 1 < n && chars[i + 1] == '[' {
128            let text_start = i + 2;
129            let mut j = text_start;
130            while j < n && chars[j] != ']' {
131                j += 1;
132            }
133            if j < n && j + 1 < n && chars[j + 1] == '(' {
134                let link_start = j + 2;
135                let mut k = link_start;
136                while k < n && chars[k] != ')' {
137                    k += 1;
138                }
139                if k < n {
140                    let inner_text: String = chars[text_start..j].iter().collect();
141                    let url: String = chars[link_start..k].iter().collect();
142                    const EMOJI_PFX: &str = "tg://emoji?id=";
143                    if let Some(stripped) = url.strip_prefix(EMOJI_PFX) {
144                        if let Ok(doc_id) = stripped.parse::<i64>() {
145                            let ent_off = utf16_off;
146                            for c in inner_text.chars() {
147                                push_char!(c);
148                            }
149                            ents.push(tl::enums::MessageEntity::CustomEmoji(
150                                tl::types::MessageEntityCustomEmoji {
151                                    offset: ent_off,
152                                    length: utf16_off - ent_off,
153                                    document_id: doc_id,
154                                },
155                            ));
156                            i = k + 1;
157                            continue;
158                        }
159                    }
160                }
161            }
162        }
163
164        // inline link / mention: [text](url) or [text](tg://user?id=123)
165        if chars[i] == '[' {
166            let text_start = i + 1;
167            let mut j = text_start;
168            let mut depth = 1i32;
169            while j < n {
170                if chars[j] == '[' {
171                    depth += 1;
172                }
173                if chars[j] == ']' {
174                    depth -= 1;
175                    if depth == 0 {
176                        break;
177                    }
178                }
179                j += 1;
180            }
181            if j < n && j + 1 < n && chars[j + 1] == '(' {
182                let link_start = j + 2;
183                let mut k = link_start;
184                while k < n && chars[k] != ')' {
185                    k += 1;
186                }
187                if k < n {
188                    let inner_text: String = chars[text_start..j].iter().collect();
189                    let url: String = chars[link_start..k].iter().collect();
190                    const MENTION_PFX: &str = "tg://user?id=";
191                    let ent_off = utf16_off;
192                    for c in inner_text.chars() {
193                        push_char!(c);
194                    }
195                    let ent_len = utf16_off - ent_off;
196                    if let Some(stripped) = url.strip_prefix(MENTION_PFX) {
197                        if let Ok(uid) = stripped.parse::<i64>() {
198                            ents.push(tl::enums::MessageEntity::MentionName(
199                                tl::types::MessageEntityMentionName {
200                                    offset: ent_off,
201                                    length: ent_len,
202                                    user_id: uid,
203                                },
204                            ));
205                        }
206                    } else {
207                        ents.push(tl::enums::MessageEntity::TextUrl(
208                            tl::types::MessageEntityTextUrl {
209                                offset: ent_off,
210                                length: ent_len,
211                                url,
212                            },
213                        ));
214                    }
215                    i = k + 1;
216                    continue;
217                }
218            }
219        }
220
221        // two-char delimiters: **, __, ~~, ||
222        let two: Option<MarkdownTag> = if i + 1 < n {
223            match [chars[i], chars[i + 1]] {
224                ['*', '*'] => Some(MarkdownTag::Bold),
225                ['_', '_'] => Some(MarkdownTag::Italic),
226                ['~', '~'] => Some(MarkdownTag::Strike),
227                ['|', '|'] => Some(MarkdownTag::Spoiler),
228                _ => None,
229            }
230        } else {
231            None
232        };
233
234        if let Some(tag) = two {
235            if let Some(pos) = open_stack.iter().rposition(|(t, _)| *t == tag) {
236                let (_, start_off) = open_stack.remove(pos);
237                let length = utf16_off - start_off;
238                if length > 0 {
239                    ents.push(make_entity(tag, start_off, length));
240                }
241            } else {
242                open_stack.push((tag, utf16_off));
243            }
244            i += 2;
245            continue;
246        }
247
248        // single-char delimiters: *bold*, _italic_
249        // Only fires when the current char is NOT part of a two-char sequence.
250        let one: Option<MarkdownTag> = match chars[i] {
251            '*' => Some(MarkdownTag::Bold),
252            '_' => Some(MarkdownTag::Italic),
253            _ => None,
254        };
255
256        if let Some(tag) = one {
257            if let Some(pos) = open_stack.iter().rposition(|(t, _)| *t == tag) {
258                let (_, start_off) = open_stack.remove(pos);
259                let length = utf16_off - start_off;
260                if length > 0 {
261                    ents.push(make_entity(tag, start_off, length));
262                }
263            } else {
264                open_stack.push((tag, utf16_off));
265            }
266            i += 1;
267            continue;
268        }
269
270        push_char!(chars[i]);
271        i += 1;
272    }
273
274    (out, ents)
275}
276
277fn make_entity(tag: MarkdownTag, offset: i32, length: i32) -> tl::enums::MessageEntity {
278    match tag {
279        MarkdownTag::Bold => {
280            tl::enums::MessageEntity::Bold(tl::types::MessageEntityBold { offset, length })
281        }
282        MarkdownTag::Italic => {
283            tl::enums::MessageEntity::Italic(tl::types::MessageEntityItalic { offset, length })
284        }
285        MarkdownTag::Strike => {
286            tl::enums::MessageEntity::Strike(tl::types::MessageEntityStrike { offset, length })
287        }
288        MarkdownTag::Spoiler => {
289            tl::enums::MessageEntity::Spoiler(tl::types::MessageEntitySpoiler { offset, length })
290        }
291    }
292}
293
/// Inline markdown styles that are written as a paired delimiter.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum MarkdownTag {
    /// `**text**` or `*text*`
    Bold,
    /// `__text__` or `_text_`
    Italic,
    /// `~~text~~`
    Strike,
    /// `||text||`
    Spoiler,
}
301
302/// Generate Telegram markdown from plain text + entities.
303///
304/// All entity types are handled. `Underline` has no unambiguous markdown
305/// delimiter and is silently skipped (use `generate_html` if you need it).
306pub fn generate_markdown(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
307    use tl::enums::MessageEntity as ME;
308
309    // Each entry is (utf16_position, is_open, marker_string).
310    // Pre blocks need a trailing newline before the closing ```.
311    let mut insertions: Vec<(i32, bool, String)> = Vec::new();
312
313    for ent in entities {
314        match ent {
315            ME::Bold(e) => {
316                insertions.push((e.offset, true, "**".into()));
317                insertions.push((e.offset + e.length, false, "**".into()));
318            }
319            ME::Italic(e) => {
320                insertions.push((e.offset, true, "__".into()));
321                insertions.push((e.offset + e.length, false, "__".into()));
322            }
323            ME::Strike(e) => {
324                insertions.push((e.offset, true, "~~".into()));
325                insertions.push((e.offset + e.length, false, "~~".into()));
326            }
327            ME::Spoiler(e) => {
328                insertions.push((e.offset, true, "||".into()));
329                insertions.push((e.offset + e.length, false, "||".into()));
330            }
331            ME::Code(e) => {
332                insertions.push((e.offset, true, "`".into()));
333                insertions.push((e.offset + e.length, false, "`".into()));
334            }
335            ME::Pre(e) => {
336                let lang = e.language.trim();
337                insertions.push((e.offset, true, format!("```{lang}\n")));
338                insertions.push((e.offset + e.length, false, "\n```".into()));
339            }
340            ME::TextUrl(e) => {
341                insertions.push((e.offset, true, "[".into()));
342                insertions.push((e.offset + e.length, false, format!("]({})", e.url)));
343            }
344            ME::MentionName(e) => {
345                insertions.push((e.offset, true, "[".into()));
346                insertions.push((
347                    e.offset + e.length,
348                    false,
349                    format!("](tg://user?id={})", e.user_id),
350                ));
351            }
352            ME::CustomEmoji(e) => {
353                insertions.push((e.offset, true, "![".into()));
354                insertions.push((
355                    e.offset + e.length,
356                    false,
357                    format!("](tg://emoji?id={})", e.document_id),
358                ));
359            }
360            // Underline has no clean markdown delimiter; skip it.
361            _ => {}
362        }
363    }
364
365    // Sort: by position, opens before closes at the same position.
366    insertions.sort_by(|(a_pos, a_open, _), (b_pos, b_open, _)| {
367        a_pos.cmp(b_pos).then_with(|| b_open.cmp(a_open))
368    });
369
370    let mut result = String::with_capacity(
371        text.len() + insertions.iter().map(|(_, _, s)| s.len()).sum::<usize>(),
372    );
373    let mut ins_idx = 0;
374    let mut utf16_pos: i32 = 0;
375
376    for ch in text.chars() {
377        while ins_idx < insertions.len() && insertions[ins_idx].0 <= utf16_pos {
378            result.push_str(&insertions[ins_idx].2);
379            ins_idx += 1;
380        }
381        // Escape markdown special chars in plain text.
382        match ch {
383            '*' | '_' | '~' | '|' | '[' | ']' | '(' | ')' | '`' | '\\' | '!' => {
384                result.push('\\');
385                result.push(ch);
386            }
387            c => result.push(c),
388        }
389        utf16_pos += ch.len_utf16() as i32;
390    }
391    while ins_idx < insertions.len() {
392        result.push_str(&insertions[ins_idx].2);
393        ins_idx += 1;
394    }
395
396    result
397}
398
399// HTML parser: built-in hand-rolled (no extra deps)
400// Compiled when `html5ever` feature is NOT active.
401
402/// Parse a Telegram-compatible HTML string into (plain_text, entities).
403///
404/// Hand-rolled, zero-dependency implementation.  Override with the
405/// `html5ever` Cargo feature for a spec-compliant tokenizer.
406#[cfg(not(feature = "html5ever"))]
407pub fn parse_html(html: &str) -> (String, Vec<tl::enums::MessageEntity>) {
408    let mut out = String::with_capacity(html.len());
409    let mut ents = Vec::new();
410    let mut stack: Vec<(HtmlTag, i32, Option<String>)> = Vec::new();
411    let mut utf16_off: i32 = 0;
412
413    let bytes = html.as_bytes();
414    let len = bytes.len();
415    let mut i = 0;
416
417    while i < len {
418        if bytes[i] == b'<' {
419            let tag_start = i + 1;
420            let mut j = tag_start;
421            while j < len && bytes[j] != b'>' {
422                j += 1;
423            }
424            let tag_content = &html[tag_start..j];
425            i = j + 1;
426
427            let is_close = tag_content.starts_with('/');
428            let tag_str = if is_close {
429                tag_content[1..].trim()
430            } else {
431                tag_content.trim()
432            };
433            let (tag_name, attrs) = parse_tag(tag_str);
434
435            if is_close {
436                if let Some(pos) = stack.iter().rposition(|(t, _, _)| t.name() == tag_name) {
437                    let (htag, start_off, extra) = stack.remove(pos);
438                    let length = utf16_off - start_off;
439                    if length > 0 {
440                        let entity = match htag {
441                            HtmlTag::Bold => Some(tl::enums::MessageEntity::Bold(
442                                tl::types::MessageEntityBold {
443                                    offset: start_off,
444                                    length,
445                                },
446                            )),
447                            HtmlTag::Italic => Some(tl::enums::MessageEntity::Italic(
448                                tl::types::MessageEntityItalic {
449                                    offset: start_off,
450                                    length,
451                                },
452                            )),
453                            HtmlTag::Underline => Some(tl::enums::MessageEntity::Underline(
454                                tl::types::MessageEntityUnderline {
455                                    offset: start_off,
456                                    length,
457                                },
458                            )),
459                            HtmlTag::Strike => Some(tl::enums::MessageEntity::Strike(
460                                tl::types::MessageEntityStrike {
461                                    offset: start_off,
462                                    length,
463                                },
464                            )),
465                            HtmlTag::Spoiler => Some(tl::enums::MessageEntity::Spoiler(
466                                tl::types::MessageEntitySpoiler {
467                                    offset: start_off,
468                                    length,
469                                },
470                            )),
471                            HtmlTag::Code => Some(tl::enums::MessageEntity::Code(
472                                tl::types::MessageEntityCode {
473                                    offset: start_off,
474                                    length,
475                                },
476                            )),
477                            HtmlTag::Pre => {
478                                Some(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre {
479                                    offset: start_off,
480                                    length,
481                                    language: extra.unwrap_or_default(),
482                                }))
483                            }
484                            HtmlTag::Link(url) => {
485                                const PFX: &str = "tg://user?id=";
486                                if let Some(stripped) = url.strip_prefix(PFX) {
487                                    stripped.parse::<i64>().ok().map(|uid| {
488                                        tl::enums::MessageEntity::MentionName(
489                                            tl::types::MessageEntityMentionName {
490                                                offset: start_off,
491                                                length,
492                                                user_id: uid,
493                                            },
494                                        )
495                                    })
496                                } else {
497                                    Some(tl::enums::MessageEntity::TextUrl(
498                                        tl::types::MessageEntityTextUrl {
499                                            offset: start_off,
500                                            length,
501                                            url,
502                                        },
503                                    ))
504                                }
505                            }
506                            HtmlTag::CustomEmoji(id) => {
507                                Some(tl::enums::MessageEntity::CustomEmoji(
508                                    tl::types::MessageEntityCustomEmoji {
509                                        offset: start_off,
510                                        length,
511                                        document_id: id,
512                                    },
513                                ))
514                            }
515                            HtmlTag::Unknown => None,
516                        };
517                        if let Some(e) = entity {
518                            ents.push(e);
519                        }
520                    }
521                }
522            } else {
523                let htag = match tag_name {
524                    "b" | "strong" => HtmlTag::Bold,
525                    "i" | "em" => HtmlTag::Italic,
526                    "u" => HtmlTag::Underline,
527                    "s" | "del" | "strike" => HtmlTag::Strike,
528                    "tg-spoiler" => HtmlTag::Spoiler,
529                    "code" => HtmlTag::Code,
530                    "pre" => HtmlTag::Pre,
531                    "a" => HtmlTag::Link(
532                        attrs
533                            .iter()
534                            .find(|(k, _)| k == "href")
535                            .map(|(_, v)| v.clone())
536                            .unwrap_or_default(),
537                    ),
538                    "tg-emoji" => HtmlTag::CustomEmoji(
539                        attrs
540                            .iter()
541                            .find(|(k, _)| k == "emoji-id")
542                            .and_then(|(_, v)| v.parse::<i64>().ok())
543                            .unwrap_or(0),
544                    ),
545                    "br" => {
546                        out.push('\n');
547                        utf16_off += 1;
548                        continue;
549                    }
550                    _ => HtmlTag::Unknown,
551                };
552                stack.push((htag, utf16_off, None));
553            }
554        } else {
555            let text_start = i;
556            while i < len && bytes[i] != b'<' {
557                i += 1;
558            }
559            let decoded = decode_html_entities(&html[text_start..i]);
560            for ch in decoded.chars() {
561                out.push(ch);
562                utf16_off += ch.len_utf16() as i32;
563            }
564        }
565    }
566
567    (out, ents)
568}
569
/// Decode HTML character references in a text run.
///
/// Supported: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;` and
/// numeric references (`&#39;`, `&#x27;`). Anything else, including a bare
/// `&`, is copied through unchanged.
///
/// The input is decoded in a single pass, so already-decoded output is never
/// decoded again: `&amp;lt;` becomes the literal text `&lt;`, not `<`.
#[cfg(not(feature = "html5ever"))]
fn decode_html_entities(s: &str) -> String {
    /// Longest reference we try to recognise, in bytes, `&` and `;` included.
    const MAX_REF_LEN: usize = 12;

    /// Decode the reference at the start of `s` (which begins with `&`).
    /// Returns the decoded char and the number of bytes consumed.
    fn decode_one(s: &str) -> Option<(char, usize)> {
        // `;` is ASCII, so a byte search yields a valid char boundary.
        let semi = s
            .as_bytes()
            .iter()
            .take(MAX_REF_LEN)
            .position(|&b| b == b';')?;
        let name = &s[1..semi];
        let ch = match name {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" => '"',
            "apos" => '\'',
            "nbsp" => '\u{00A0}',
            _ => {
                let num = name.strip_prefix('#')?;
                let (digits, radix) = match num.strip_prefix(|c| c == 'x' || c == 'X') {
                    Some(hex) => (hex, 16),
                    None => (num, 10),
                };
                // Reject signs and empty input that `from_str_radix` would
                // otherwise accept or report as an error.
                if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
                    return None;
                }
                let code = u32::from_str_radix(digits, radix).ok()?;
                char::from_u32(code).filter(|&c| c != '\0')?
            }
        };
        Some((ch, semi + 1))
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        rest = &rest[amp..];
        match decode_one(rest) {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &rest[consumed..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally.
                out.push('&');
                rest = &rest[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
579
580#[cfg(not(feature = "html5ever"))]
581fn parse_tag(s: &str) -> (&str, Vec<(String, String)>) {
582    let mut parts = s.splitn(2, char::is_whitespace);
583    let name = parts.next().unwrap_or("").trim_end_matches('/');
584    let attrs = parse_attrs(parts.next().unwrap_or(""));
585    (name, attrs)
586}
587
/// Parse an HTML attribute list such as `href="x" class='y' id=z` into
/// `(name, value)` pairs, in source order.
///
/// * Values may be double-quoted, single-quoted or unquoted (an unquoted
///   value ends at the next whitespace).
/// * A quoted value that is never closed extends to the end of the input.
/// * Only `name=value` attributes are reported. Bare attributes without a
///   value are ignored and do not leak into the name of the attribute that
///   follows them.
/// * Values are returned as written; character references are not decoded.
#[cfg(not(feature = "html5ever"))]
fn parse_attrs(s: &str) -> Vec<(String, String)> {
    let mut result = Vec::new();
    let mut rem = s.trim();

    while let Some(eq) = rem.find('=') {
        // The name is the last token before `=`; anything earlier is a bare
        // (value-less) attribute.
        let key = rem[..eq].split_whitespace().last().unwrap_or("");
        let after = rem[eq + 1..].trim_start();

        let (val, rest) = match after.chars().next() {
            Some(quote @ ('"' | '\'')) => {
                // Both quote characters are one byte long.
                let body = &after[1..];
                match body.find(quote) {
                    Some(end) => (&body[..end], &body[end + 1..]),
                    // Unterminated quote: take everything that is left.
                    None => (body, ""),
                }
            }
            _ => {
                let end = after.find(char::is_whitespace).unwrap_or(after.len());
                (&after[..end], &after[end..])
            }
        };

        result.push((key.to_string(), val.to_string()));
        rem = rest.trim_start();
    }

    result
}
614
/// An element that the hand-rolled HTML parser has seen an opening tag for.
#[cfg(not(feature = "html5ever"))]
#[allow(dead_code)]
#[derive(Clone, Debug)]
enum HtmlTag {
    /// `<b>` / `<strong>`
    Bold,
    /// `<i>` / `<em>`
    Italic,
    /// `<u>`
    Underline,
    /// `<s>` / `<del>` / `<strike>`
    Strike,
    /// `<tg-spoiler>`
    Spoiler,
    /// `<code>`
    Code,
    /// `<pre>`
    Pre,
    /// `<a href="…">`, carrying the `href` value.
    Link(String),
    /// `<tg-emoji emoji-id="…">`, carrying the parsed id.
    CustomEmoji(i64),
    /// Any tag the parser does not recognise.
    Unknown,
}
630
631#[cfg(not(feature = "html5ever"))]
632impl HtmlTag {
633    fn name(&self) -> &str {
634        match self {
635            Self::Bold => "b",
636            Self::Italic => "i",
637            Self::Underline => "u",
638            Self::Strike => "s",
639            Self::Spoiler => "tg-spoiler",
640            Self::Code => "code",
641            Self::Pre => "pre",
642            Self::Link(_) => "a",
643            Self::CustomEmoji(_) => "tg-emoji",
644            Self::Unknown => "",
645        }
646    }
647}
648
649// HTML parser: html5ever backend
650// Compiled when `html5ever` feature IS active; overrides the built-in parser.
651
652/// Parse a Telegram-compatible HTML string into (plain_text, entities).
653///
654/// Uses the [`html5ever`] spec-compliant tokenizer.  Enable the `html5ever`
655/// Cargo feature to activate this implementation.
656#[cfg(feature = "html5ever")]
657#[cfg_attr(docsrs, doc(cfg(feature = "html5ever")))]
658pub fn parse_html(html: &str) -> (String, Vec<tl::enums::MessageEntity>) {
659    use html5ever::tendril::StrTendril;
660    use html5ever::tokenizer::{
661        BufferQueue, Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer,
662    };
663    use std::cell::Cell;
664
665    struct Sink {
666        text: Cell<String>,
667        entities: Cell<Vec<tl::enums::MessageEntity>>,
668        offset: Cell<i32>,
669    }
670
671    impl TokenSink for Sink {
672        type Handle = ();
673
674        fn process_token(&self, token: Token, _line: u64) -> TokenSinkResult<()> {
675            let mut text = self.text.take();
676            let mut entities = self.entities.take();
677            let mut offset = self.offset.get();
678
679            // Close the most-recent open entity of `$kind` (open = length==0).
680            // Removes the entity if start == end (zero-length element).
681            macro_rules! close_ent {
682                ($kind:ident) => {{
683                    if let Some(idx) = entities
684                        .iter()
685                        .rposition(|e| matches!(e, tl::enums::MessageEntity::$kind(_)))
686                    {
687                        let closed_len = {
688                            if let tl::enums::MessageEntity::$kind(ref mut inner) = entities[idx] {
689                                inner.length = offset - inner.offset;
690                                inner.length
691                            } else {
692                                unreachable!()
693                            }
694                        };
695                        if closed_len == 0 {
696                            entities.remove(idx);
697                        }
698                    }
699                }};
700            }
701
702            match token {
703                // Start tags
704                Token::TagToken(Tag {
705                    kind: TagKind::StartTag,
706                    name,
707                    attrs,
708                    ..
709                }) => {
710                    let len0 = 0i32;
711                    match name.as_ref() {
712                        "b" | "strong" => entities.push(tl::enums::MessageEntity::Bold(
713                            tl::types::MessageEntityBold {
714                                offset,
715                                length: len0,
716                            },
717                        )),
718                        "i" | "em" => entities.push(tl::enums::MessageEntity::Italic(
719                            tl::types::MessageEntityItalic {
720                                offset,
721                                length: len0,
722                            },
723                        )),
724                        "u" => entities.push(tl::enums::MessageEntity::Underline(
725                            tl::types::MessageEntityUnderline {
726                                offset,
727                                length: len0,
728                            },
729                        )),
730                        "s" | "del" | "strike" => entities.push(tl::enums::MessageEntity::Strike(
731                            tl::types::MessageEntityStrike {
732                                offset,
733                                length: len0,
734                            },
735                        )),
736                        "tg-spoiler" => entities.push(tl::enums::MessageEntity::Spoiler(
737                            tl::types::MessageEntitySpoiler {
738                                offset,
739                                length: len0,
740                            },
741                        )),
742                        "code" => {
743                            // Inside an open <pre>? Annotate language on the pre entity.
744                            let in_pre = entities.last().map_or(
745                                false,
746                                |e| matches!(e, tl::enums::MessageEntity::Pre(p) if p.length == 0),
747                            );
748                            if in_pre {
749                                let lang = attrs
750                                    .iter()
751                                    .find(|a| a.name.local.as_ref() == "class")
752                                    .and_then(|a| {
753                                        let v: &str = a.value.as_ref();
754                                        v.strip_prefix("language-")
755                                    })
756                                    .map(|s| s.to_string())
757                                    .unwrap_or_default();
758                                if let Some(tl::enums::MessageEntity::Pre(ref mut p)) =
759                                    entities.last_mut()
760                                {
761                                    p.language = lang;
762                                }
763                            } else {
764                                entities.push(tl::enums::MessageEntity::Code(
765                                    tl::types::MessageEntityCode {
766                                        offset,
767                                        length: len0,
768                                    },
769                                ));
770                            }
771                        }
772                        "pre" => entities.push(tl::enums::MessageEntity::Pre(
773                            tl::types::MessageEntityPre {
774                                offset,
775                                length: len0,
776                                language: String::new(),
777                            },
778                        )),
779                        "a" => {
780                            let href = attrs
781                                .iter()
782                                .find(|a| a.name.local.as_ref() == "href")
783                                .map(|a| {
784                                    let v: &str = a.value.as_ref();
785                                    v.to_string()
786                                })
787                                .unwrap_or_default();
788                            const MENTION_PFX: &str = "tg://user?id=";
789                            if href.starts_with(MENTION_PFX) {
790                                if let Ok(uid) = href[MENTION_PFX.len()..].parse::<i64>() {
791                                    entities.push(tl::enums::MessageEntity::MentionName(
792                                        tl::types::MessageEntityMentionName {
793                                            offset,
794                                            length: len0,
795                                            user_id: uid,
796                                        },
797                                    ));
798                                }
799                            } else {
800                                entities.push(tl::enums::MessageEntity::TextUrl(
801                                    tl::types::MessageEntityTextUrl {
802                                        offset,
803                                        length: len0,
804                                        url: href,
805                                    },
806                                ));
807                            }
808                        }
809                        "tg-emoji" => {
810                            let doc_id = attrs
811                                .iter()
812                                .find(|a| a.name.local.as_ref() == "emoji-id")
813                                .and_then(|a| {
814                                    let v: &str = a.value.as_ref();
815                                    v.parse::<i64>().ok()
816                                })
817                                .unwrap_or(0);
818                            entities.push(tl::enums::MessageEntity::CustomEmoji(
819                                tl::types::MessageEntityCustomEmoji {
820                                    offset,
821                                    length: len0,
822                                    document_id: doc_id,
823                                },
824                            ));
825                        }
826                        "br" => {
827                            text.push('\n');
828                            offset += 1;
829                        }
830                        _ => {}
831                    }
832                }
833
834                // End tags
835                Token::TagToken(Tag {
836                    kind: TagKind::EndTag,
837                    name,
838                    ..
839                }) => {
840                    match name.as_ref() {
841                        "b" | "strong" => close_ent!(Bold),
842                        "i" | "em" => close_ent!(Italic),
843                        "u" => close_ent!(Underline),
844                        "s" | "del" | "strike" => close_ent!(Strike),
845                        "tg-spoiler" => close_ent!(Spoiler),
846                        "code" => {
847                            // Inside open <pre>: pre absorbs the code tag.
848                            let in_pre = entities.last().map_or(
849                                false,
850                                |e| matches!(e, tl::enums::MessageEntity::Pre(p) if p.length == 0),
851                            );
852                            if !in_pre {
853                                close_ent!(Code);
854                            }
855                        }
856                        "pre" => close_ent!(Pre),
857                        "a" => match entities.last() {
858                            Some(tl::enums::MessageEntity::MentionName(_)) => {
859                                close_ent!(MentionName)
860                            }
861                            _ => close_ent!(TextUrl),
862                        },
863                        "tg-emoji" => close_ent!(CustomEmoji),
864                        _ => {}
865                    }
866                }
867
868                // Text content
869                Token::CharacterTokens(s) => {
870                    let s_str: &str = s.as_ref();
871                    offset += s_str.encode_utf16().count() as i32;
872                    text.push_str(s_str);
873                }
874
875                _ => {}
876            }
877
878            self.text.replace(text);
879            self.entities.replace(entities);
880            self.offset.replace(offset);
881            TokenSinkResult::Continue
882        }
883    }
884
885    let mut input = BufferQueue::default();
886    input.push_back(StrTendril::from_slice(html).try_reinterpret().unwrap());
887
888    let tok = Tokenizer::new(
889        Sink {
890            text: Cell::new(String::with_capacity(html.len())),
891            entities: Cell::new(Vec::new()),
892            offset: Cell::new(0),
893        },
894        Default::default(),
895    );
896    let _ = tok.feed(&mut input);
897    tok.end();
898
899    let Sink { text, entities, .. } = tok.sink;
900    (text.take(), entities.take())
901}
902
903// HTML generator (always available, no html5ever dependency)
904
905/// Generate Telegram-compatible HTML from plain text + entities.
906pub fn generate_html(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
907    use tl::enums::MessageEntity as ME;
908
909    let mut markers: Vec<(i32, bool, String)> = Vec::new();
910
911    for ent in entities {
912        let (off, len, open, close) = match ent {
913            ME::Bold(e) => (e.offset, e.length, "<b>".into(), "</b>".into()),
914            ME::Italic(e) => (e.offset, e.length, "<i>".into(), "</i>".into()),
915            ME::Underline(e) => (e.offset, e.length, "<u>".into(), "</u>".into()),
916            ME::Strike(e) => (e.offset, e.length, "<s>".into(), "</s>".into()),
917            ME::Spoiler(e) => (
918                e.offset,
919                e.length,
920                "<tg-spoiler>".into(),
921                "</tg-spoiler>".into(),
922            ),
923            ME::Code(e) => (e.offset, e.length, "<code>".into(), "</code>".into()),
924            ME::Pre(e) => {
925                let lang = if e.language.is_empty() {
926                    String::new()
927                } else {
928                    format!(" class=\"language-{}\"", e.language)
929                };
930                (
931                    e.offset,
932                    e.length,
933                    format!("<pre><code{lang}>"),
934                    "</code></pre>".into(),
935                )
936            }
937            ME::TextUrl(e) => (
938                e.offset,
939                e.length,
940                format!("<a href=\"{}\">", escape_html(&e.url)),
941                "</a>".into(),
942            ),
943            ME::MentionName(e) => (
944                e.offset,
945                e.length,
946                format!("<a href=\"tg://user?id={}\">", e.user_id),
947                "</a>".into(),
948            ),
949            ME::CustomEmoji(e) => (
950                e.offset,
951                e.length,
952                format!("<tg-emoji emoji-id=\"{}\">", e.document_id),
953                "</tg-emoji>".into(),
954            ),
955            _ => continue,
956        };
957        markers.push((off, true, open));
958        markers.push((off + len, false, close));
959    }
960
961    markers.sort_by(|(a_pos, a_open, _), (b_pos, b_open, _)| {
962        a_pos.cmp(b_pos).then_with(|| b_open.cmp(a_open))
963    });
964
965    let mut result =
966        String::with_capacity(text.len() + markers.iter().map(|(_, _, s)| s.len()).sum::<usize>());
967    let mut marker_idx = 0;
968    let mut utf16_pos: i32 = 0;
969
970    for ch in text.chars() {
971        while marker_idx < markers.len() && markers[marker_idx].0 <= utf16_pos {
972            result.push_str(&markers[marker_idx].2);
973            marker_idx += 1;
974        }
975        match ch {
976            '&' => result.push_str("&amp;"),
977            '<' => result.push_str("&lt;"),
978            '>' => result.push_str("&gt;"),
979            '"' => result.push_str("&quot;"),
980            c => result.push(c),
981        }
982        utf16_pos += ch.len_utf16() as i32;
983    }
984    while marker_idx < markers.len() {
985        result.push_str(&markers[marker_idx].2);
986        marker_idx += 1;
987    }
988
989    result
990}
991
/// Escape `&`, `<`, `>` and `"` so that `s` can be embedded in HTML text or
/// inside a double-quoted attribute value. All other characters are copied
/// through unchanged.
fn escape_html(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
998
999// Tests
1000
#[cfg(test)]
mod tests {
    //! Unit tests for the markdown and HTML parsers / generators.

    use super::*;
    use tl::enums::MessageEntity as ME;

    /// A single `TextUrl` entity starting at offset 0, wrapped in a vector.
    fn text_url_at_start(length: i32, url: &str) -> Vec<ME> {
        vec![ME::TextUrl(tl::types::MessageEntityTextUrl {
            offset: 0,
            length,
            url: url.into(),
        })]
    }

    // Markdown parsing

    #[test]
    fn markdown_bold() {
        let (plain, entities) = parse_markdown("Hello **world**!");
        assert_eq!(plain, "Hello world!");
        assert_eq!(entities.len(), 1);
        match &entities[0] {
            ME::Bold(bold) => {
                assert_eq!(bold.offset, 6);
                assert_eq!(bold.length, 5);
            }
            _ => panic!("expected bold"),
        }
    }

    #[test]
    fn markdown_bold_single_asterisk() {
        let (plain, entities) = parse_markdown("*bold*");
        assert_eq!(plain, "bold");
        assert!(matches!(entities[0], ME::Bold(_)));
    }

    #[test]
    fn markdown_italic_double_underscore() {
        let (plain, entities) = parse_markdown("__italic__");
        assert_eq!(plain, "italic");
        assert!(matches!(entities[0], ME::Italic(_)));
    }

    #[test]
    fn markdown_italic_single_underscore() {
        let (plain, entities) = parse_markdown("_italic_");
        assert_eq!(plain, "italic");
        assert!(matches!(entities[0], ME::Italic(_)));
    }

    #[test]
    fn markdown_inline_code() {
        let (plain, entities) = parse_markdown("Use `foo()` to do it");
        assert_eq!(plain, "Use foo() to do it");
        assert!(matches!(entities[0], ME::Code(_)));
    }

    #[test]
    fn markdown_code_block_with_lang() {
        let (plain, entities) = parse_markdown("```rust\nfn main() {}\n```");
        assert_eq!(plain, "fn main() {}");
        match &entities[0] {
            ME::Pre(pre) => {
                assert_eq!(pre.language, "rust");
                assert_eq!(pre.offset, 0);
            }
            _ => panic!("expected pre"),
        }
    }

    #[test]
    fn markdown_code_block_no_lang() {
        let (plain, entities) = parse_markdown("```\nhello\n```");
        assert_eq!(plain, "hello");
        match &entities[0] {
            ME::Pre(pre) => assert_eq!(pre.language, ""),
            _ => panic!("expected pre"),
        }
    }

    #[test]
    fn markdown_strike() {
        let (plain, entities) = parse_markdown("~~strike~~");
        assert_eq!(plain, "strike");
        assert!(matches!(entities[0], ME::Strike(_)));
    }

    #[test]
    fn markdown_spoiler() {
        let (plain, entities) = parse_markdown("||spoiler||");
        assert_eq!(plain, "spoiler");
        assert!(matches!(entities[0], ME::Spoiler(_)));
    }

    #[test]
    fn markdown_text_url() {
        let (plain, entities) = parse_markdown("[click](https://example.com)");
        assert_eq!(plain, "click");
        match &entities[0] {
            ME::TextUrl(link) => assert_eq!(link.url, "https://example.com"),
            _ => panic!("expected text url"),
        }
    }

    #[test]
    fn markdown_mention() {
        let (plain, entities) = parse_markdown("[User](tg://user?id=42)");
        assert_eq!(plain, "User");
        match &entities[0] {
            ME::MentionName(mention) => assert_eq!(mention.user_id, 42),
            _ => panic!("expected mention name"),
        }
    }

    #[test]
    fn markdown_custom_emoji() {
        let (plain, entities) = parse_markdown("![👍](tg://emoji?id=5368324170671202286)");
        assert_eq!(plain, "👍");
        match &entities[0] {
            ME::CustomEmoji(emoji) => assert_eq!(emoji.document_id, 5368324170671202286),
            _ => panic!("expected custom emoji"),
        }
    }

    #[test]
    fn markdown_backslash_escape() {
        let (plain, entities) = parse_markdown(r"\*not bold\*");
        assert_eq!(plain, "*not bold*");
        assert!(entities.is_empty());
    }

    #[test]
    fn markdown_nested() {
        let (plain, entities) = parse_markdown("**bold __italic__ end**");
        assert_eq!(plain, "bold italic end");
        assert_eq!(entities.len(), 2);

        let has_bold = entities.iter().any(|e| matches!(e, ME::Bold(_)));
        assert!(has_bold);

        let has_italic = entities.iter().any(|e| matches!(e, ME::Italic(_)));
        assert!(has_italic);
    }

    // Markdown generation

    #[test]
    fn generate_markdown_pre() {
        let pre = tl::types::MessageEntityPre {
            offset: 0,
            length: 12,
            language: "rust".into(),
        };
        let entities = vec![ME::Pre(pre)];
        let rendered = generate_markdown("fn main() {}", &entities);
        assert_eq!(rendered, "```rust\nfn main() {}\n```");
    }

    #[test]
    fn generate_markdown_text_url() {
        let entities = text_url_at_start(5, "https://example.com");
        let rendered = generate_markdown("click", &entities);
        assert_eq!(rendered, "[click](https://example.com)");
    }

    #[test]
    fn generate_markdown_mention() {
        let mention = tl::types::MessageEntityMentionName {
            offset: 0,
            length: 4,
            user_id: 99,
        };
        let entities = vec![ME::MentionName(mention)];
        let rendered = generate_markdown("User", &entities);
        assert_eq!(rendered, "[User](tg://user?id=99)");
    }

    #[test]
    fn generate_markdown_custom_emoji() {
        // The thumbs-up emoji occupies two UTF-16 code units, hence length 2.
        let emoji = tl::types::MessageEntityCustomEmoji {
            offset: 0,
            length: 2,
            document_id: 123456,
        };
        let entities = vec![ME::CustomEmoji(emoji)];
        let rendered = generate_markdown("👍", &entities);
        assert_eq!(rendered, "![👍](tg://emoji?id=123456)");
    }

    #[test]
    fn generate_markdown_escapes_special_chars() {
        // No entities at all: only the escaping of the literal `*` is exercised.
        let no_entities: Vec<ME> = Vec::new();
        let rendered = generate_markdown("1 * 2 = 2", &no_entities);
        assert_eq!(rendered, r"1 \* 2 = 2");
    }

    #[test]
    fn markdown_roundtrip_url() {
        let original = "click";
        let entities = text_url_at_start(5, "https://example.com");

        let rendered = generate_markdown(original, &entities);
        let (reparsed, reparsed_entities) = parse_markdown(&rendered);

        assert_eq!(reparsed, original);
        match &reparsed_entities[0] {
            ME::TextUrl(link) => assert_eq!(link.url, "https://example.com"),
            _ => panic!("roundtrip url failed"),
        }
    }

    // HTML parsing and generation

    #[test]
    fn html_bold_italic() {
        let (plain, entities) = parse_html("<b>bold</b> and <i>italic</i>");
        assert_eq!(plain, "bold and italic");
        assert_eq!(entities.len(), 2);
    }

    #[test]
    fn html_link() {
        let (plain, entities) = parse_html("<a href=\"https://example.com\">click</a>");
        assert_eq!(plain, "click");
        match &entities[0] {
            ME::TextUrl(link) => assert_eq!(link.url, "https://example.com"),
            _ => panic!("expected text url"),
        }
    }

    // HTML entity decoding is a hand-rolled-only feature; html5ever handles it natively.
    #[cfg(not(feature = "html5ever"))]
    #[test]
    fn html_entities_decoded() {
        let (plain, _) = parse_html("A &amp; B &lt;3&gt;");
        assert_eq!(plain, "A & B <3>");
    }

    #[test]
    fn generate_html_roundtrip() {
        let original = "Hello world";
        let bold = tl::types::MessageEntityBold {
            offset: 0,
            length: 5,
        };
        let entities = vec![ME::Bold(bold)];

        let rendered = generate_html(original, &entities);
        assert_eq!(rendered, "<b>Hello</b> world");

        let (reparsed, reparsed_entities) = parse_html(&rendered);
        assert_eq!(reparsed, original);
        assert_eq!(reparsed_entities.len(), 1);
    }
}