Skip to main content

layer_client/
parsers.rs

1// Copyright (c) Ankit Chaubey <ankitchaubey.dev@gmail.com>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4// NOTE:
5// The "Layer" project is no longer maintained or supported.
6// Its original purpose for personal SDK/APK experimentation and learning
7// has been fulfilled.
8//
9// Please use Ferogram instead:
10// https://github.com/ankit-chaubey/ferogram
11// Ferogram will receive future updates and development, although progress
12// may be slower.
13//
14// Ferogram is an async Telegram MTProto client library written in Rust.
15// Its implementation follows the behaviour of the official Telegram clients,
16// particularly Telegram Desktop and TDLib, and aims to provide a clean and
17// modern async interface for building Telegram clients and tools.
18
19//! Text formatting parsers: HTML and Markdown ↔ Telegram [`MessageEntity`]
20//!
21//! # Markdown (Telegram-flavoured)
22//! ## Parsing (`parse_markdown`)
23//! | Syntax | Entity |
24//! |--------|--------|
25//! | `**bold**` or `*bold*` | Bold |
26//! | `__italic__` or `_italic_` | Italic |
27//! | `~~strike~~` | Strikethrough |
28//! | `\|\|spoiler\|\|` | Spoiler |
29//! | `` `code` `` | Code |
30//! | ` ```lang\npre\n``` ` | Pre (code block) |
31//! | `[text](url)` | TextUrl |
32//! | `[text](tg://user?id=123)` | MentionName |
33//! | `![text](tg://emoji?id=123)` | CustomEmoji |
34//! | `\*`, `\_`, `\~` … | Escaped literal char |
35//!
36//! ## Generating (`generate_markdown`)
37//! Produces the same syntax above for all supported entity types.
38//! `Underline` has no unambiguous markdown delimiter and is silently skipped.
39//!
40//! # HTML
//! Supported tags: `<b>`, `<strong>`, `<i>`, `<em>`, `<u>`, `<s>`, `<del>`,
//! `<strike>`, `<code>`, `<pre>`, `<tg-spoiler>`, `<a href="url">`,
//! `<tg-emoji emoji-id="id">text</tg-emoji>`.
//! The built-in parser additionally turns `<br>` into a newline.
44//!
45//! # Feature gates
46//! * `html`     : enables `parse_html` / `generate_html` via the built-in hand-rolled
47//!   parser (zero extra deps).
48//! * `html5ever`: replaces `parse_html` with a spec-compliant html5ever tokenizer.
49//!   `generate_html` is always the same hand-rolled generator.
50
51use layer_tl_types as tl;
52
53// Markdown
54
55/// Parse Telegram-flavoured markdown into (plain_text, entities).
56pub fn parse_markdown(text: &str) -> (String, Vec<tl::enums::MessageEntity>) {
57    let mut out = String::with_capacity(text.len());
58    let mut ents = Vec::new();
59    let chars: Vec<char> = text.chars().collect();
60    let n = chars.len();
61    let mut i = 0;
62    let mut open_stack: Vec<(MarkdownTag, i32)> = Vec::new();
63    let mut utf16_off: i32 = 0;
64
65    macro_rules! push_char {
66        ($c:expr) => {{
67            let c: char = $c;
68            out.push(c);
69            utf16_off += c.len_utf16() as i32;
70        }};
71    }
72
73    while i < n {
74        // backslash escape: \X → literal X (for any special char)
75        if chars[i] == '\\' && i + 1 < n {
76            let next = chars[i + 1];
77            if matches!(
78                next,
79                '*' | '_' | '~' | '|' | '[' | ']' | '(' | ')' | '`' | '\\' | '!'
80            ) {
81                push_char!(next);
82                i += 2;
83                continue;
84            }
85        }
86
87        // code block: ```lang\n...```
88        if i + 2 < n && chars[i] == '`' && chars[i + 1] == '`' && chars[i + 2] == '`' {
89            let start = i + 3;
90            let mut j = start;
91            while j + 2 < n {
92                if chars[j] == '`' && chars[j + 1] == '`' && chars[j + 2] == '`' {
93                    break;
94                }
95                j += 1;
96            }
97            if j + 2 < n {
98                let block: String = chars[start..j].iter().collect();
99                let (lang, code) = if let Some(nl) = block.find('\n') {
100                    (block[..nl].trim().to_string(), block[nl + 1..].to_string())
101                } else {
102                    (String::new(), block)
103                };
104                let code_off = utf16_off;
105                let code_utf16: i32 = code.encode_utf16().count() as i32;
106                ents.push(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre {
107                    offset: code_off,
108                    length: code_utf16,
109                    language: lang,
110                }));
111                for c in code.chars() {
112                    push_char!(c);
113                }
114                i = j + 3;
115                continue;
116            }
117        }
118
119        // inline code: `code`
120        if chars[i] == '`' {
121            let start = i + 1;
122            let mut j = start;
123            while j < n && chars[j] != '`' {
124                j += 1;
125            }
126            if j < n {
127                let code: String = chars[start..j].iter().collect();
128                let code_off = utf16_off;
129                let code_utf16: i32 = code.encode_utf16().count() as i32;
130                ents.push(tl::enums::MessageEntity::Code(
131                    tl::types::MessageEntityCode {
132                        offset: code_off,
133                        length: code_utf16,
134                    },
135                ));
136                for c in code.chars() {
137                    push_char!(c);
138                }
139                i = j + 1;
140                continue;
141            }
142        }
143
144        // custom emoji: ![text](tg://emoji?id=12345)
145        if chars[i] == '!' && i + 1 < n && chars[i + 1] == '[' {
146            let text_start = i + 2;
147            let mut j = text_start;
148            while j < n && chars[j] != ']' {
149                j += 1;
150            }
151            if j < n && j + 1 < n && chars[j + 1] == '(' {
152                let link_start = j + 2;
153                let mut k = link_start;
154                while k < n && chars[k] != ')' {
155                    k += 1;
156                }
157                if k < n {
158                    let inner_text: String = chars[text_start..j].iter().collect();
159                    let url: String = chars[link_start..k].iter().collect();
160                    const EMOJI_PFX: &str = "tg://emoji?id=";
161                    if let Some(stripped) = url.strip_prefix(EMOJI_PFX)
162                        && let Ok(doc_id) = stripped.parse::<i64>()
163                    {
164                        let ent_off = utf16_off;
165                        for c in inner_text.chars() {
166                            push_char!(c);
167                        }
168                        ents.push(tl::enums::MessageEntity::CustomEmoji(
169                            tl::types::MessageEntityCustomEmoji {
170                                offset: ent_off,
171                                length: utf16_off - ent_off,
172                                document_id: doc_id,
173                            },
174                        ));
175                        i = k + 1;
176                        continue;
177                    }
178                }
179            }
180        }
181
182        // inline link / mention: [text](url) or [text](tg://user?id=123)
183        if chars[i] == '[' {
184            let text_start = i + 1;
185            let mut j = text_start;
186            let mut depth = 1i32;
187            while j < n {
188                if chars[j] == '[' {
189                    depth += 1;
190                }
191                if chars[j] == ']' {
192                    depth -= 1;
193                    if depth == 0 {
194                        break;
195                    }
196                }
197                j += 1;
198            }
199            if j < n && j + 1 < n && chars[j + 1] == '(' {
200                let link_start = j + 2;
201                let mut k = link_start;
202                while k < n && chars[k] != ')' {
203                    k += 1;
204                }
205                if k < n {
206                    let inner_text: String = chars[text_start..j].iter().collect();
207                    let url: String = chars[link_start..k].iter().collect();
208                    const MENTION_PFX: &str = "tg://user?id=";
209                    let ent_off = utf16_off;
210                    for c in inner_text.chars() {
211                        push_char!(c);
212                    }
213                    let ent_len = utf16_off - ent_off;
214                    if let Some(stripped) = url.strip_prefix(MENTION_PFX) {
215                        if let Ok(uid) = stripped.parse::<i64>() {
216                            ents.push(tl::enums::MessageEntity::MentionName(
217                                tl::types::MessageEntityMentionName {
218                                    offset: ent_off,
219                                    length: ent_len,
220                                    user_id: uid,
221                                },
222                            ));
223                        }
224                    } else {
225                        ents.push(tl::enums::MessageEntity::TextUrl(
226                            tl::types::MessageEntityTextUrl {
227                                offset: ent_off,
228                                length: ent_len,
229                                url,
230                            },
231                        ));
232                    }
233                    i = k + 1;
234                    continue;
235                }
236            }
237        }
238
239        // two-char delimiters: **, __, ~~, ||
240        let two: Option<MarkdownTag> = if i + 1 < n {
241            match [chars[i], chars[i + 1]] {
242                ['*', '*'] => Some(MarkdownTag::Bold),
243                ['_', '_'] => Some(MarkdownTag::Italic),
244                ['~', '~'] => Some(MarkdownTag::Strike),
245                ['|', '|'] => Some(MarkdownTag::Spoiler),
246                _ => None,
247            }
248        } else {
249            None
250        };
251
252        if let Some(tag) = two {
253            if let Some(pos) = open_stack.iter().rposition(|(t, _)| *t == tag) {
254                let (_, start_off) = open_stack.remove(pos);
255                let length = utf16_off - start_off;
256                if length > 0 {
257                    ents.push(make_entity(tag, start_off, length));
258                }
259            } else {
260                open_stack.push((tag, utf16_off));
261            }
262            i += 2;
263            continue;
264        }
265
266        // single-char delimiters: *bold*, _italic_
267        // Only fires when the current char is NOT part of a two-char sequence.
268        let one: Option<MarkdownTag> = match chars[i] {
269            '*' => Some(MarkdownTag::Bold),
270            '_' => Some(MarkdownTag::Italic),
271            _ => None,
272        };
273
274        if let Some(tag) = one {
275            if let Some(pos) = open_stack.iter().rposition(|(t, _)| *t == tag) {
276                let (_, start_off) = open_stack.remove(pos);
277                let length = utf16_off - start_off;
278                if length > 0 {
279                    ents.push(make_entity(tag, start_off, length));
280                }
281            } else {
282                open_stack.push((tag, utf16_off));
283            }
284            i += 1;
285            continue;
286        }
287
288        push_char!(chars[i]);
289        i += 1;
290    }
291
292    (out, ents)
293}
294
295fn make_entity(tag: MarkdownTag, offset: i32, length: i32) -> tl::enums::MessageEntity {
296    match tag {
297        MarkdownTag::Bold => {
298            tl::enums::MessageEntity::Bold(tl::types::MessageEntityBold { offset, length })
299        }
300        MarkdownTag::Italic => {
301            tl::enums::MessageEntity::Italic(tl::types::MessageEntityItalic { offset, length })
302        }
303        MarkdownTag::Strike => {
304            tl::enums::MessageEntity::Strike(tl::types::MessageEntityStrike { offset, length })
305        }
306        MarkdownTag::Spoiler => {
307            tl::enums::MessageEntity::Spoiler(tl::types::MessageEntitySpoiler { offset, length })
308        }
309    }
310}
311
/// Markdown styles that are written as a pair of identical delimiters and
/// therefore tracked on the parser's open-span stack.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum MarkdownTag {
    /// `**text**` or `*text*`.
    Bold,
    /// `__text__` or `_text_`.
    Italic,
    /// `~~text~~`.
    Strike,
    /// `||text||`.
    Spoiler,
}
319
320/// Generate Telegram markdown from plain text + entities.
321///
322/// All entity types are handled. `Underline` has no unambiguous markdown
323/// delimiter and is silently skipped (use `generate_html` if you need it).
324pub fn generate_markdown(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
325    use tl::enums::MessageEntity as ME;
326
327    // Each entry is (utf16_position, is_open, marker_string).
328    // Pre blocks need a trailing newline before the closing ```.
329    let mut insertions: Vec<(i32, bool, String)> = Vec::new();
330
331    for ent in entities {
332        match ent {
333            ME::Bold(e) => {
334                insertions.push((e.offset, true, "**".into()));
335                insertions.push((e.offset + e.length, false, "**".into()));
336            }
337            ME::Italic(e) => {
338                insertions.push((e.offset, true, "__".into()));
339                insertions.push((e.offset + e.length, false, "__".into()));
340            }
341            ME::Strike(e) => {
342                insertions.push((e.offset, true, "~~".into()));
343                insertions.push((e.offset + e.length, false, "~~".into()));
344            }
345            ME::Spoiler(e) => {
346                insertions.push((e.offset, true, "||".into()));
347                insertions.push((e.offset + e.length, false, "||".into()));
348            }
349            ME::Code(e) => {
350                insertions.push((e.offset, true, "`".into()));
351                insertions.push((e.offset + e.length, false, "`".into()));
352            }
353            ME::Pre(e) => {
354                let lang = e.language.trim();
355                insertions.push((e.offset, true, format!("```{lang}\n")));
356                insertions.push((e.offset + e.length, false, "\n```".into()));
357            }
358            ME::TextUrl(e) => {
359                insertions.push((e.offset, true, "[".into()));
360                insertions.push((e.offset + e.length, false, format!("]({})", e.url)));
361            }
362            ME::MentionName(e) => {
363                insertions.push((e.offset, true, "[".into()));
364                insertions.push((
365                    e.offset + e.length,
366                    false,
367                    format!("](tg://user?id={})", e.user_id),
368                ));
369            }
370            ME::CustomEmoji(e) => {
371                insertions.push((e.offset, true, "![".into()));
372                insertions.push((
373                    e.offset + e.length,
374                    false,
375                    format!("](tg://emoji?id={})", e.document_id),
376                ));
377            }
378            // Underline has no clean markdown delimiter; skip it.
379            _ => {}
380        }
381    }
382
383    // Sort: by position, opens before closes at the same position.
384    insertions.sort_by(|(a_pos, a_open, _), (b_pos, b_open, _)| {
385        a_pos.cmp(b_pos).then_with(|| b_open.cmp(a_open))
386    });
387
388    let mut result = String::with_capacity(
389        text.len() + insertions.iter().map(|(_, _, s)| s.len()).sum::<usize>(),
390    );
391    let mut ins_idx = 0;
392    let mut utf16_pos: i32 = 0;
393
394    for ch in text.chars() {
395        while ins_idx < insertions.len() && insertions[ins_idx].0 <= utf16_pos {
396            result.push_str(&insertions[ins_idx].2);
397            ins_idx += 1;
398        }
399        // Escape markdown special chars in plain text.
400        match ch {
401            '*' | '_' | '~' | '|' | '[' | ']' | '(' | ')' | '`' | '\\' | '!' => {
402                result.push('\\');
403                result.push(ch);
404            }
405            c => result.push(c),
406        }
407        utf16_pos += ch.len_utf16() as i32;
408    }
409    while ins_idx < insertions.len() {
410        result.push_str(&insertions[ins_idx].2);
411        ins_idx += 1;
412    }
413
414    result
415}
416
417// HTML parser: built-in hand-rolled (no extra deps)
418// Compiled when `html5ever` feature is NOT active.
419
420/// Parse a Telegram-compatible HTML string into (plain_text, entities).
421///
422/// Hand-rolled, zero-dependency implementation.  Override with the
423/// `html5ever` Cargo feature for a spec-compliant tokenizer.
424#[cfg(not(feature = "html5ever"))]
425pub fn parse_html(html: &str) -> (String, Vec<tl::enums::MessageEntity>) {
426    let mut out = String::with_capacity(html.len());
427    let mut ents = Vec::new();
428    let mut stack: Vec<(HtmlTag, i32, Option<String>)> = Vec::new();
429    let mut utf16_off: i32 = 0;
430
431    let bytes = html.as_bytes();
432    let len = bytes.len();
433    let mut i = 0;
434
435    while i < len {
436        if bytes[i] == b'<' {
437            let tag_start = i + 1;
438            let mut j = tag_start;
439            while j < len && bytes[j] != b'>' {
440                j += 1;
441            }
442            let tag_content = &html[tag_start..j];
443            i = j + 1;
444
445            let is_close = tag_content.starts_with('/');
446            let tag_str = if is_close {
447                tag_content[1..].trim()
448            } else {
449                tag_content.trim()
450            };
451            let (tag_name, attrs) = parse_tag(tag_str);
452
453            if is_close {
454                if let Some(pos) = stack.iter().rposition(|(t, _, _)| t.name() == tag_name) {
455                    let (htag, start_off, extra) = stack.remove(pos);
456                    let length = utf16_off - start_off;
457                    if length > 0 {
458                        let entity = match htag {
459                            HtmlTag::Bold => Some(tl::enums::MessageEntity::Bold(
460                                tl::types::MessageEntityBold {
461                                    offset: start_off,
462                                    length,
463                                },
464                            )),
465                            HtmlTag::Italic => Some(tl::enums::MessageEntity::Italic(
466                                tl::types::MessageEntityItalic {
467                                    offset: start_off,
468                                    length,
469                                },
470                            )),
471                            HtmlTag::Underline => Some(tl::enums::MessageEntity::Underline(
472                                tl::types::MessageEntityUnderline {
473                                    offset: start_off,
474                                    length,
475                                },
476                            )),
477                            HtmlTag::Strike => Some(tl::enums::MessageEntity::Strike(
478                                tl::types::MessageEntityStrike {
479                                    offset: start_off,
480                                    length,
481                                },
482                            )),
483                            HtmlTag::Spoiler => Some(tl::enums::MessageEntity::Spoiler(
484                                tl::types::MessageEntitySpoiler {
485                                    offset: start_off,
486                                    length,
487                                },
488                            )),
489                            HtmlTag::Code => Some(tl::enums::MessageEntity::Code(
490                                tl::types::MessageEntityCode {
491                                    offset: start_off,
492                                    length,
493                                },
494                            )),
495                            HtmlTag::Pre => {
496                                Some(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre {
497                                    offset: start_off,
498                                    length,
499                                    language: extra.unwrap_or_default(),
500                                }))
501                            }
502                            HtmlTag::Link(url) => {
503                                const PFX: &str = "tg://user?id=";
504                                if let Some(stripped) = url.strip_prefix(PFX) {
505                                    stripped.parse::<i64>().ok().map(|uid| {
506                                        tl::enums::MessageEntity::MentionName(
507                                            tl::types::MessageEntityMentionName {
508                                                offset: start_off,
509                                                length,
510                                                user_id: uid,
511                                            },
512                                        )
513                                    })
514                                } else {
515                                    Some(tl::enums::MessageEntity::TextUrl(
516                                        tl::types::MessageEntityTextUrl {
517                                            offset: start_off,
518                                            length,
519                                            url,
520                                        },
521                                    ))
522                                }
523                            }
524                            HtmlTag::CustomEmoji(id) => {
525                                Some(tl::enums::MessageEntity::CustomEmoji(
526                                    tl::types::MessageEntityCustomEmoji {
527                                        offset: start_off,
528                                        length,
529                                        document_id: id,
530                                    },
531                                ))
532                            }
533                            HtmlTag::Unknown => None,
534                        };
535                        if let Some(e) = entity {
536                            ents.push(e);
537                        }
538                    }
539                }
540            } else {
541                let htag = match tag_name {
542                    "b" | "strong" => HtmlTag::Bold,
543                    "i" | "em" => HtmlTag::Italic,
544                    "u" => HtmlTag::Underline,
545                    "s" | "del" | "strike" => HtmlTag::Strike,
546                    "tg-spoiler" => HtmlTag::Spoiler,
547                    "code" => HtmlTag::Code,
548                    "pre" => HtmlTag::Pre,
549                    "a" => HtmlTag::Link(
550                        attrs
551                            .iter()
552                            .find(|(k, _)| k == "href")
553                            .map(|(_, v)| v.clone())
554                            .unwrap_or_default(),
555                    ),
556                    "tg-emoji" => HtmlTag::CustomEmoji(
557                        attrs
558                            .iter()
559                            .find(|(k, _)| k == "emoji-id")
560                            .and_then(|(_, v)| v.parse::<i64>().ok())
561                            .unwrap_or(0),
562                    ),
563                    "br" => {
564                        out.push('\n');
565                        utf16_off += 1;
566                        continue;
567                    }
568                    _ => HtmlTag::Unknown,
569                };
570                stack.push((htag, utf16_off, None));
571            }
572        } else {
573            let text_start = i;
574            while i < len && bytes[i] != b'<' {
575                i += 1;
576            }
577            let decoded = decode_html_entities(&html[text_start..i]);
578            for ch in decoded.chars() {
579                out.push(ch);
580                utf16_off += ch.len_utf16() as i32;
581            }
582        }
583    }
584
585    (out, ents)
586}
587
/// Decode HTML character references in a text run.
///
/// Recognises `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&nbsp;` and numeric
/// references in decimal (`&#39;`) or hexadecimal (`&#x27;`) form. Anything
/// else, including a bare `&`, is copied through unchanged.
///
/// The input is scanned once from left to right, so decoded output is never
/// decoded a second time: `&amp;lt;` yields the text `&lt;`, not `<`.
#[cfg(not(feature = "html5ever"))]
fn decode_html_entities(s: &str) -> String {
    const NAMED: [(&str, char); 5] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&nbsp;", '\u{00A0}'),
    ];

    /// Decode a numeric reference at the start of `rest`; returns the char
    /// and how many bytes of `rest` it occupied.
    fn numeric_reference(rest: &str) -> Option<(char, usize)> {
        let body = rest.strip_prefix("&#")?;
        let end = body.find(';')?;
        let digits = &body[..end];
        let (radix, digits) = match digits
            .strip_prefix('x')
            .or_else(|| digits.strip_prefix('X'))
        {
            Some(hex) => (16, hex),
            None => (10, digits),
        };
        // Reject signs, spaces and empty bodies that the integer parser
        // would otherwise tolerate or that are simply not a reference.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        let code = u32::from_str_radix(digits, radix).ok()?;
        let ch = char::from_u32(code).filter(|c| *c != '\0')?;
        Some((ch, "&#".len() + end + 1))
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        rest = &rest[amp..];

        let named = NAMED
            .iter()
            .find(|(name, _)| rest.starts_with(*name))
            .map(|&(name, ch)| (ch, name.len()));

        match named.or_else(|| numeric_reference(rest)) {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &rest[consumed..];
            }
            None => {
                // Not a reference we know: keep the ampersand as text.
                out.push('&');
                rest = &rest[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
597
598#[cfg(not(feature = "html5ever"))]
599fn parse_tag(s: &str) -> (&str, Vec<(String, String)>) {
600    let mut parts = s.splitn(2, char::is_whitespace);
601    let name = parts.next().unwrap_or("").trim_end_matches('/');
602    let attrs = parse_attrs(parts.next().unwrap_or(""));
603    (name, attrs)
604}
605
/// Parse the attribute part of a tag into `(name, value)` pairs.
///
/// Values may be double-quoted, single-quoted or bare (ending at the next
/// whitespace). Parsing stops at the first stretch that contains no `=`.
///
/// * A quoted value with no closing quote extends to the end of the input
///   instead of panicking or losing its last character.
/// * The name is the last whitespace-separated word before `=`, so a
///   valueless attribute in front (`disabled href=x`) does not become part of
///   the following name.
#[cfg(not(feature = "html5ever"))]
fn parse_attrs(s: &str) -> Vec<(String, String)> {
    let mut result = Vec::new();
    let mut rem = s.trim();
    while let Some(eq) = rem.find('=') {
        let key = rem[..eq]
            .split_whitespace()
            .next_back()
            .unwrap_or("")
            .to_string();
        rem = rem[eq + 1..].trim_start();

        let (val, rest) = match rem.chars().next() {
            Some(quote @ ('"' | '\'')) => {
                // Both quote characters are one byte, so index 1 is a boundary.
                let body = &rem[1..];
                match body.find(quote) {
                    Some(end) => (&body[..end], &body[end + 1..]),
                    // Unterminated quote: take whatever is left.
                    None => (body, ""),
                }
            }
            _ => {
                let end = rem.find(char::is_whitespace).unwrap_or(rem.len());
                (&rem[..end], &rem[end..])
            }
        };
        result.push((key, val.to_string()));
        rem = rest.trim_start();
    }
    result
}
632
/// Kind of an HTML element currently open in the built-in parser, together
/// with any attribute payload captured from its start tag.
// NOTE(review): the reason for `allow(dead_code)` is not visible here;
// presumably not every variant or payload is read in all configurations.
#[cfg(not(feature = "html5ever"))]
#[allow(dead_code)]
#[derive(Debug, Clone)]
enum HtmlTag {
    /// `<b>` / `<strong>`.
    Bold,
    /// `<i>` / `<em>`.
    Italic,
    /// `<u>`.
    Underline,
    /// `<s>` / `<del>` / `<strike>`.
    Strike,
    /// `<tg-spoiler>`.
    Spoiler,
    /// `<code>`.
    Code,
    /// `<pre>`.
    Pre,
    /// `<a>`, carrying the value of its `href` attribute.
    Link(String),
    /// `<tg-emoji>`, carrying the value of its `emoji-id` attribute.
    CustomEmoji(i64),
    /// Any tag the parser does not recognise.
    Unknown,
}

#[cfg(not(feature = "html5ever"))]
impl HtmlTag {
    /// Canonical tag name for this kind; the empty string for `Unknown`.
    fn name(&self) -> &str {
        match self {
            HtmlTag::Unknown => "",
            HtmlTag::Link(_) => "a",
            HtmlTag::CustomEmoji(_) => "tg-emoji",
            HtmlTag::Spoiler => "tg-spoiler",
            HtmlTag::Pre => "pre",
            HtmlTag::Code => "code",
            HtmlTag::Strike => "s",
            HtmlTag::Underline => "u",
            HtmlTag::Italic => "i",
            HtmlTag::Bold => "b",
        }
    }
}
666
667// HTML parser: html5ever backend
668// Compiled when `html5ever` feature IS active; overrides the built-in parser.
669
670/// Parse a Telegram-compatible HTML string into (plain_text, entities).
671///
672/// Uses the [`html5ever`] spec-compliant tokenizer.  Enable the `html5ever`
673/// Cargo feature to activate this implementation.
674#[cfg(feature = "html5ever")]
675#[cfg_attr(docsrs, doc(cfg(feature = "html5ever")))]
676pub fn parse_html(html: &str) -> (String, Vec<tl::enums::MessageEntity>) {
677    use html5ever::tendril::StrTendril;
678    use html5ever::tokenizer::{
679        BufferQueue, Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer,
680    };
681    use std::cell::Cell;
682
683    struct Sink {
684        text: Cell<String>,
685        entities: Cell<Vec<tl::enums::MessageEntity>>,
686        offset: Cell<i32>,
687    }
688
689    impl TokenSink for Sink {
690        type Handle = ();
691
692        fn process_token(&self, token: Token, _line: u64) -> TokenSinkResult<()> {
693            let mut text = self.text.take();
694            let mut entities = self.entities.take();
695            let mut offset = self.offset.get();
696
697            // Close the most-recent open entity of `$kind` (open = length==0).
698            // Removes the entity if start == end (zero-length element).
699            macro_rules! close_ent {
700                ($kind:ident) => {{
701                    if let Some(idx) = entities
702                        .iter()
703                        .rposition(|e| matches!(e, tl::enums::MessageEntity::$kind(_)))
704                    {
705                        let closed_len = {
706                            if let tl::enums::MessageEntity::$kind(ref mut inner) = entities[idx] {
707                                inner.length = offset - inner.offset;
708                                inner.length
709                            } else {
710                                unreachable!()
711                            }
712                        };
713                        if closed_len == 0 {
714                            entities.remove(idx);
715                        }
716                    }
717                }};
718            }
719
720            match token {
721                // Start tags
722                Token::TagToken(Tag {
723                    kind: TagKind::StartTag,
724                    name,
725                    attrs,
726                    ..
727                }) => {
728                    let len0 = 0i32;
729                    match name.as_ref() {
730                        "b" | "strong" => entities.push(tl::enums::MessageEntity::Bold(
731                            tl::types::MessageEntityBold {
732                                offset,
733                                length: len0,
734                            },
735                        )),
736                        "i" | "em" => entities.push(tl::enums::MessageEntity::Italic(
737                            tl::types::MessageEntityItalic {
738                                offset,
739                                length: len0,
740                            },
741                        )),
742                        "u" => entities.push(tl::enums::MessageEntity::Underline(
743                            tl::types::MessageEntityUnderline {
744                                offset,
745                                length: len0,
746                            },
747                        )),
748                        "s" | "del" | "strike" => entities.push(tl::enums::MessageEntity::Strike(
749                            tl::types::MessageEntityStrike {
750                                offset,
751                                length: len0,
752                            },
753                        )),
754                        "tg-spoiler" => entities.push(tl::enums::MessageEntity::Spoiler(
755                            tl::types::MessageEntitySpoiler {
756                                offset,
757                                length: len0,
758                            },
759                        )),
760                        "code" => {
761                            // Inside an open <pre>? Annotate language on the pre entity.
762                            let in_pre = entities.last().map_or(
763                                false,
764                                |e| matches!(e, tl::enums::MessageEntity::Pre(p) if p.length == 0),
765                            );
766                            if in_pre {
767                                let lang = attrs
768                                    .iter()
769                                    .find(|a| a.name.local.as_ref() == "class")
770                                    .and_then(|a| {
771                                        let v: &str = a.value.as_ref();
772                                        v.strip_prefix("language-")
773                                    })
774                                    .map(|s| s.to_string())
775                                    .unwrap_or_default();
776                                if let Some(tl::enums::MessageEntity::Pre(ref mut p)) =
777                                    entities.last_mut()
778                                {
779                                    p.language = lang;
780                                }
781                            } else {
782                                entities.push(tl::enums::MessageEntity::Code(
783                                    tl::types::MessageEntityCode {
784                                        offset,
785                                        length: len0,
786                                    },
787                                ));
788                            }
789                        }
790                        "pre" => entities.push(tl::enums::MessageEntity::Pre(
791                            tl::types::MessageEntityPre {
792                                offset,
793                                length: len0,
794                                language: String::new(),
795                            },
796                        )),
797                        "a" => {
798                            let href = attrs
799                                .iter()
800                                .find(|a| a.name.local.as_ref() == "href")
801                                .map(|a| {
802                                    let v: &str = a.value.as_ref();
803                                    v.to_string()
804                                })
805                                .unwrap_or_default();
806                            const MENTION_PFX: &str = "tg://user?id=";
807                            if href.starts_with(MENTION_PFX) {
808                                if let Ok(uid) = href[MENTION_PFX.len()..].parse::<i64>() {
809                                    entities.push(tl::enums::MessageEntity::MentionName(
810                                        tl::types::MessageEntityMentionName {
811                                            offset,
812                                            length: len0,
813                                            user_id: uid,
814                                        },
815                                    ));
816                                }
817                            } else {
818                                entities.push(tl::enums::MessageEntity::TextUrl(
819                                    tl::types::MessageEntityTextUrl {
820                                        offset,
821                                        length: len0,
822                                        url: href,
823                                    },
824                                ));
825                            }
826                        }
827                        "tg-emoji" => {
828                            let doc_id = attrs
829                                .iter()
830                                .find(|a| a.name.local.as_ref() == "emoji-id")
831                                .and_then(|a| {
832                                    let v: &str = a.value.as_ref();
833                                    v.parse::<i64>().ok()
834                                })
835                                .unwrap_or(0);
836                            entities.push(tl::enums::MessageEntity::CustomEmoji(
837                                tl::types::MessageEntityCustomEmoji {
838                                    offset,
839                                    length: len0,
840                                    document_id: doc_id,
841                                },
842                            ));
843                        }
844                        "br" => {
845                            text.push('\n');
846                            offset += 1;
847                        }
848                        _ => {}
849                    }
850                }
851
852                // End tags
853                Token::TagToken(Tag {
854                    kind: TagKind::EndTag,
855                    name,
856                    ..
857                }) => {
858                    match name.as_ref() {
859                        "b" | "strong" => close_ent!(Bold),
860                        "i" | "em" => close_ent!(Italic),
861                        "u" => close_ent!(Underline),
862                        "s" | "del" | "strike" => close_ent!(Strike),
863                        "tg-spoiler" => close_ent!(Spoiler),
864                        "code" => {
865                            // Inside open <pre>: pre absorbs the code tag.
866                            let in_pre = entities.last().map_or(
867                                false,
868                                |e| matches!(e, tl::enums::MessageEntity::Pre(p) if p.length == 0),
869                            );
870                            if !in_pre {
871                                close_ent!(Code);
872                            }
873                        }
874                        "pre" => close_ent!(Pre),
875                        "a" => match entities.last() {
876                            Some(tl::enums::MessageEntity::MentionName(_)) => {
877                                close_ent!(MentionName)
878                            }
879                            _ => close_ent!(TextUrl),
880                        },
881                        "tg-emoji" => close_ent!(CustomEmoji),
882                        _ => {}
883                    }
884                }
885
886                // Text content
887                Token::CharacterTokens(s) => {
888                    let s_str: &str = s.as_ref();
889                    offset += s_str.encode_utf16().count() as i32;
890                    text.push_str(s_str);
891                }
892
893                _ => {}
894            }
895
896            self.text.replace(text);
897            self.entities.replace(entities);
898            self.offset.replace(offset);
899            TokenSinkResult::Continue
900        }
901    }
902
903    let mut input = BufferQueue::default();
904    input.push_back(StrTendril::from_slice(html).try_reinterpret().unwrap());
905
906    let tok = Tokenizer::new(
907        Sink {
908            text: Cell::new(String::with_capacity(html.len())),
909            entities: Cell::new(Vec::new()),
910            offset: Cell::new(0),
911        },
912        Default::default(),
913    );
914    let _ = tok.feed(&mut input);
915    tok.end();
916
917    let Sink { text, entities, .. } = tok.sink;
918    (text.take(), entities.take())
919}
920
921// HTML generator (always available, no html5ever dependency)
922
923/// Generate Telegram-compatible HTML from plain text + entities.
924pub fn generate_html(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
925    use tl::enums::MessageEntity as ME;
926
927    let mut markers: Vec<(i32, bool, String)> = Vec::new();
928
929    for ent in entities {
930        let (off, len, open, close) = match ent {
931            ME::Bold(e) => (e.offset, e.length, "<b>".into(), "</b>".into()),
932            ME::Italic(e) => (e.offset, e.length, "<i>".into(), "</i>".into()),
933            ME::Underline(e) => (e.offset, e.length, "<u>".into(), "</u>".into()),
934            ME::Strike(e) => (e.offset, e.length, "<s>".into(), "</s>".into()),
935            ME::Spoiler(e) => (
936                e.offset,
937                e.length,
938                "<tg-spoiler>".into(),
939                "</tg-spoiler>".into(),
940            ),
941            ME::Code(e) => (e.offset, e.length, "<code>".into(), "</code>".into()),
942            ME::Pre(e) => {
943                let lang = if e.language.is_empty() {
944                    String::new()
945                } else {
946                    format!(" class=\"language-{}\"", e.language)
947                };
948                (
949                    e.offset,
950                    e.length,
951                    format!("<pre><code{lang}>"),
952                    "</code></pre>".into(),
953                )
954            }
955            ME::TextUrl(e) => (
956                e.offset,
957                e.length,
958                format!("<a href=\"{}\">", escape_html(&e.url)),
959                "</a>".into(),
960            ),
961            ME::MentionName(e) => (
962                e.offset,
963                e.length,
964                format!("<a href=\"tg://user?id={}\">", e.user_id),
965                "</a>".into(),
966            ),
967            ME::CustomEmoji(e) => (
968                e.offset,
969                e.length,
970                format!("<tg-emoji emoji-id=\"{}\">", e.document_id),
971                "</tg-emoji>".into(),
972            ),
973            _ => continue,
974        };
975        markers.push((off, true, open));
976        markers.push((off + len, false, close));
977    }
978
979    markers.sort_by(|(a_pos, a_open, _), (b_pos, b_open, _)| {
980        a_pos.cmp(b_pos).then_with(|| b_open.cmp(a_open))
981    });
982
983    let mut result =
984        String::with_capacity(text.len() + markers.iter().map(|(_, _, s)| s.len()).sum::<usize>());
985    let mut marker_idx = 0;
986    let mut utf16_pos: i32 = 0;
987
988    for ch in text.chars() {
989        while marker_idx < markers.len() && markers[marker_idx].0 <= utf16_pos {
990            result.push_str(&markers[marker_idx].2);
991            marker_idx += 1;
992        }
993        match ch {
994            '&' => result.push_str("&amp;"),
995            '<' => result.push_str("&lt;"),
996            '>' => result.push_str("&gt;"),
997            '"' => result.push_str("&quot;"),
998            c => result.push(c),
999        }
1000        utf16_pos += ch.len_utf16() as i32;
1001    }
1002    while marker_idx < markers.len() {
1003        result.push_str(&markers[marker_idx].2);
1004        marker_idx += 1;
1005    }
1006
1007    result
1008}
1009
/// Escape `&`, `<`, `>` and `"` so that `s` can be embedded in HTML text or
/// in a double-quoted attribute value.
fn escape_html(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1016
1017// Tests
1018
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the entity enum that every test inspects.
    type Entity = tl::enums::MessageEntity;

    // Markdown parsing

    #[test]
    fn markdown_bold() {
        let (plain, found) = parse_markdown("Hello **world**!");
        assert_eq!(plain, "Hello world!");
        assert_eq!(found.len(), 1);
        match &found[0] {
            Entity::Bold(bold) => {
                assert_eq!(bold.offset, 6);
                assert_eq!(bold.length, 5);
            }
            _ => panic!("expected bold"),
        }
    }

    #[test]
    fn markdown_bold_single_asterisk() {
        let (plain, found) = parse_markdown("*bold*");
        assert_eq!(plain, "bold");
        assert!(matches!(found[0], Entity::Bold(_)));
    }

    #[test]
    fn markdown_italic_double_underscore() {
        let (plain, found) = parse_markdown("__italic__");
        assert_eq!(plain, "italic");
        assert!(matches!(found[0], Entity::Italic(_)));
    }

    #[test]
    fn markdown_italic_single_underscore() {
        let (plain, found) = parse_markdown("_italic_");
        assert_eq!(plain, "italic");
        assert!(matches!(found[0], Entity::Italic(_)));
    }

    #[test]
    fn markdown_inline_code() {
        let (plain, found) = parse_markdown("Use `foo()` to do it");
        assert_eq!(plain, "Use foo() to do it");
        assert!(matches!(found[0], Entity::Code(_)));
    }

    #[test]
    fn markdown_code_block_with_lang() {
        let (plain, found) = parse_markdown("```rust\nfn main() {}\n```");
        assert_eq!(plain, "fn main() {}");
        match &found[0] {
            Entity::Pre(pre) => {
                assert_eq!(pre.language, "rust");
                assert_eq!(pre.offset, 0);
            }
            _ => panic!("expected pre"),
        }
    }

    #[test]
    fn markdown_code_block_no_lang() {
        let (plain, found) = parse_markdown("```\nhello\n```");
        assert_eq!(plain, "hello");
        match &found[0] {
            Entity::Pre(pre) => assert_eq!(pre.language, ""),
            _ => panic!("expected pre"),
        }
    }

    #[test]
    fn markdown_strike() {
        let (plain, found) = parse_markdown("~~strike~~");
        assert_eq!(plain, "strike");
        assert!(matches!(found[0], Entity::Strike(_)));
    }

    #[test]
    fn markdown_spoiler() {
        let (plain, found) = parse_markdown("||spoiler||");
        assert_eq!(plain, "spoiler");
        assert!(matches!(found[0], Entity::Spoiler(_)));
    }

    #[test]
    fn markdown_text_url() {
        let (plain, found) = parse_markdown("[click](https://example.com)");
        assert_eq!(plain, "click");
        match &found[0] {
            Entity::TextUrl(link) => assert_eq!(link.url, "https://example.com"),
            _ => panic!("expected text url"),
        }
    }

    #[test]
    fn markdown_mention() {
        let (plain, found) = parse_markdown("[User](tg://user?id=42)");
        assert_eq!(plain, "User");
        match &found[0] {
            Entity::MentionName(mention) => assert_eq!(mention.user_id, 42),
            _ => panic!("expected mention name"),
        }
    }

    #[test]
    fn markdown_custom_emoji() {
        let (plain, found) = parse_markdown("![👍](tg://emoji?id=5368324170671202286)");
        assert_eq!(plain, "👍");
        match &found[0] {
            Entity::CustomEmoji(emoji) => assert_eq!(emoji.document_id, 5368324170671202286),
            _ => panic!("expected custom emoji"),
        }
    }

    #[test]
    fn markdown_backslash_escape() {
        let (plain, found) = parse_markdown(r"\*not bold\*");
        assert_eq!(plain, "*not bold*");
        assert!(found.is_empty());
    }

    #[test]
    fn markdown_nested() {
        let (plain, found) = parse_markdown("**bold __italic__ end**");
        assert_eq!(plain, "bold italic end");
        assert_eq!(found.len(), 2);
        let has_bold = found.iter().any(|e| matches!(e, Entity::Bold(_)));
        assert!(has_bold);
        let has_italic = found.iter().any(|e| matches!(e, Entity::Italic(_)));
        assert!(has_italic);
    }

    // Markdown generation

    #[test]
    fn generate_markdown_pre() {
        let block = tl::types::MessageEntityPre {
            offset: 0,
            length: 12,
            language: "rust".into(),
        };
        let entities = vec![Entity::Pre(block)];
        let md = generate_markdown("fn main() {}", &entities);
        assert_eq!(md, "```rust\nfn main() {}\n```");
    }

    #[test]
    fn generate_markdown_text_url() {
        let link = tl::types::MessageEntityTextUrl {
            offset: 0,
            length: 5,
            url: "https://example.com".into(),
        };
        let entities = vec![Entity::TextUrl(link)];
        let md = generate_markdown("click", &entities);
        assert_eq!(md, "[click](https://example.com)");
    }

    #[test]
    fn generate_markdown_mention() {
        let mention = tl::types::MessageEntityMentionName {
            offset: 0,
            length: 4,
            user_id: 99,
        };
        let entities = vec![Entity::MentionName(mention)];
        let md = generate_markdown("User", &entities);
        assert_eq!(md, "[User](tg://user?id=99)");
    }

    #[test]
    fn generate_markdown_custom_emoji() {
        // The emoji is a single code point but two UTF-16 code units.
        let emoji = tl::types::MessageEntityCustomEmoji {
            offset: 0,
            length: 2,
            document_id: 123456,
        };
        let entities = vec![Entity::CustomEmoji(emoji)];
        let md = generate_markdown("👍", &entities);
        assert_eq!(md, "![👍](tg://emoji?id=123456)");
    }

    #[test]
    fn generate_markdown_escapes_special_chars() {
        let no_entities: Vec<Entity> = Vec::new();
        let md = generate_markdown("1 * 2 = 2", &no_entities);
        assert_eq!(md, r"1 \* 2 = 2");
    }

    #[test]
    fn markdown_roundtrip_url() {
        let original = "click";
        let link = tl::types::MessageEntityTextUrl {
            offset: 0,
            length: 5,
            url: "https://example.com".into(),
        };
        let entities = vec![Entity::TextUrl(link)];
        let md = generate_markdown(original, &entities);
        let (back, reparsed) = parse_markdown(&md);
        assert_eq!(back, original);
        match &reparsed[0] {
            Entity::TextUrl(link) => assert_eq!(link.url, "https://example.com"),
            _ => panic!("roundtrip url failed"),
        }
    }

    // HTML parsing and generation

    #[test]
    fn html_bold_italic() {
        let (plain, found) = parse_html("<b>bold</b> and <i>italic</i>");
        assert_eq!(plain, "bold and italic");
        assert_eq!(found.len(), 2);
    }

    #[test]
    fn html_link() {
        let (plain, found) = parse_html("<a href=\"https://example.com\">click</a>");
        assert_eq!(plain, "click");
        match &found[0] {
            Entity::TextUrl(link) => assert_eq!(link.url, "https://example.com"),
            _ => panic!("expected text url"),
        }
    }

    // HTML entity decoding is a hand-rolled-only feature; html5ever handles it natively.
    #[cfg(not(feature = "html5ever"))]
    #[test]
    fn html_entities_decoded() {
        let (plain, _) = parse_html("A &amp; B &lt;3&gt;");
        assert_eq!(plain, "A & B <3>");
    }

    #[test]
    fn generate_html_roundtrip() {
        let original = "Hello world";
        let span = tl::types::MessageEntityBold {
            offset: 0,
            length: 5,
        };
        let entities = vec![Entity::Bold(span)];
        let html = generate_html(original, &entities);
        assert_eq!(html, "<b>Hello</b> world");
        let (back, reparsed) = parse_html(&html);
        assert_eq!(back, original);
        assert_eq!(reparsed.len(), 1);
    }
}