Skip to main content

layer_client/
parsers.rs

1//! Text formatting parsers: HTML and Markdown ↔ Telegram [`MessageEntity`]
2//!
3//! # Markdown (Telegram-flavoured)
4//! Supported: `**bold**`, `__italic__`, `~~strike~~`, `||spoiler||`, `` `code` ``,
5//! ` ```lang\npre``` `, `[text](url)`, `[text](tg://user?id=123)`
6//!
7//! # HTML
8//! Supported tags: `<b>`, `<strong>`, `<i>`, `<em>`, `<u>`, `<s>`, `<del>`,
9//! `<code>`, `<pre>`, `<tg-spoiler>`, `<a href="url">`,
10//! `<tg-emoji emoji-id="id">text</tg-emoji>`
11//!
12//! # Feature gates
13//! * `html`     : enables `parse_html` / `generate_html` via the built-in hand-rolled
14//! parser (zero extra deps).
15//! * `html5ever`: replaces `parse_html` with a spec-compliant html5ever tokenizer.
16//! `generate_html` is always the same hand-rolled generator.
17
18use layer_tl_types as tl;
19
20// Markdown
21
22/// Parse Telegram-flavoured markdown into (plain_text, entities).
23pub fn parse_markdown(text: &str) -> (String, Vec<tl::enums::MessageEntity>) {
24    let mut out = String::with_capacity(text.len());
25    let mut ents = Vec::new();
26    let chars: Vec<char> = text.chars().collect();
27    let n = chars.len();
28    let mut i = 0;
29    let mut open_stack: Vec<(MarkdownTag, i32)> = Vec::new();
30    let mut utf16_off: i32 = 0;
31
32    macro_rules! push_char {
33        ($c:expr) => {{
34            let c: char = $c;
35            out.push(c);
36            utf16_off += c.len_utf16() as i32;
37        }};
38    }
39
40    while i < n {
41        // code block ```lang\n...```
42        if i + 2 < n && chars[i] == '`' && chars[i + 1] == '`' && chars[i + 2] == '`' {
43            let start = i + 3;
44            let mut j = start;
45            while j + 2 < n {
46                if chars[j] == '`' && chars[j + 1] == '`' && chars[j + 2] == '`' {
47                    break;
48                }
49                j += 1;
50            }
51            if j + 2 < n {
52                let block: String = chars[start..j].iter().collect();
53                let (lang, code) = if let Some(nl) = block.find('\n') {
54                    (block[..nl].trim().to_string(), block[nl + 1..].to_string())
55                } else {
56                    (String::new(), block)
57                };
58                let code_off = utf16_off;
59                let code_utf16: i32 = code.encode_utf16().count() as i32;
60                ents.push(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre {
61                    offset: code_off,
62                    length: code_utf16,
63                    language: lang,
64                }));
65                for c in code.chars() {
66                    push_char!(c);
67                }
68                i = j + 3;
69                continue;
70            }
71        }
72
73        // inline code
74        if chars[i] == '`' {
75            let start = i + 1;
76            let mut j = start;
77            while j < n && chars[j] != '`' {
78                j += 1;
79            }
80            if j < n {
81                let code: String = chars[start..j].iter().collect();
82                let code_off = utf16_off;
83                let code_utf16: i32 = code.encode_utf16().count() as i32;
84                ents.push(tl::enums::MessageEntity::Code(
85                    tl::types::MessageEntityCode {
86                        offset: code_off,
87                        length: code_utf16,
88                    },
89                ));
90                for c in code.chars() {
91                    push_char!(c);
92                }
93                i = j + 1;
94                continue;
95            }
96        }
97
98        // [text](url)
99        if chars[i] == '[' {
100            let text_start = i + 1;
101            let mut j = text_start;
102            let mut depth = 1i32;
103            while j < n {
104                if chars[j] == '[' {
105                    depth += 1;
106                }
107                if chars[j] == ']' {
108                    depth -= 1;
109                    if depth == 0 {
110                        break;
111                    }
112                }
113                j += 1;
114            }
115            if j < n && j + 1 < n && chars[j + 1] == '(' {
116                let link_start = j + 2;
117                let mut k = link_start;
118                while k < n && chars[k] != ')' {
119                    k += 1;
120                }
121                if k < n {
122                    let inner_text: String = chars[text_start..j].iter().collect();
123                    let url: String = chars[link_start..k].iter().collect();
124                    const MENTION_PFX: &str = "tg://user?id=";
125                    let ent_off = utf16_off;
126                    for c in inner_text.chars() {
127                        push_char!(c);
128                    }
129                    let ent_len = utf16_off - ent_off;
130                    if let Some(stripped) = url.strip_prefix(MENTION_PFX) {
131                        if let Ok(uid) = stripped.parse::<i64>() {
132                            ents.push(tl::enums::MessageEntity::MentionName(
133                                tl::types::MessageEntityMentionName {
134                                    offset: ent_off,
135                                    length: ent_len,
136                                    user_id: uid,
137                                },
138                            ));
139                        }
140                    } else {
141                        ents.push(tl::enums::MessageEntity::TextUrl(
142                            tl::types::MessageEntityTextUrl {
143                                offset: ent_off,
144                                length: ent_len,
145                                url,
146                            },
147                        ));
148                    }
149                    i = k + 1;
150                    continue;
151                }
152            }
153        }
154
155        // two-char delimiters
156        let two: Option<(&str, MarkdownTag)> = if i + 1 < n {
157            match [chars[i], chars[i + 1]] {
158                ['*', '*'] => Some(("**", MarkdownTag::Bold)),
159                ['_', '_'] => Some(("__", MarkdownTag::Italic)),
160                ['~', '~'] => Some(("~~", MarkdownTag::Strike)),
161                ['|', '|'] => Some(("||", MarkdownTag::Spoiler)),
162                _ => None,
163            }
164        } else {
165            None
166        };
167
168        if let Some((_delim, tag)) = two {
169            if let Some(pos) = open_stack.iter().rposition(|(t, _)| *t == tag) {
170                let (_, start_off) = open_stack.remove(pos);
171                let length = utf16_off - start_off;
172                let entity = match tag {
173                    MarkdownTag::Bold => {
174                        tl::enums::MessageEntity::Bold(tl::types::MessageEntityBold {
175                            offset: start_off,
176                            length,
177                        })
178                    }
179                    MarkdownTag::Italic => {
180                        tl::enums::MessageEntity::Italic(tl::types::MessageEntityItalic {
181                            offset: start_off,
182                            length,
183                        })
184                    }
185                    MarkdownTag::Strike => {
186                        tl::enums::MessageEntity::Strike(tl::types::MessageEntityStrike {
187                            offset: start_off,
188                            length,
189                        })
190                    }
191                    MarkdownTag::Spoiler => {
192                        tl::enums::MessageEntity::Spoiler(tl::types::MessageEntitySpoiler {
193                            offset: start_off,
194                            length,
195                        })
196                    }
197                };
198                if length > 0 {
199                    ents.push(entity);
200                }
201            } else {
202                open_stack.push((tag, utf16_off));
203            }
204            i += 2;
205            continue;
206        }
207
208        push_char!(chars[i]);
209        i += 1;
210    }
211
212    (out, ents)
213}
214
/// Inline styles that are written with a paired two-character delimiter.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum MarkdownTag {
    /// `**text**`
    Bold,
    /// `__text__`
    Italic,
    /// `~~text~~`
    Strike,
    /// `||text||`
    Spoiler,
}
222
223/// Generate Telegram markdown from plain text + entities.
224pub fn generate_markdown(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
225    use tl::enums::MessageEntity as ME;
226    let mut insertions: Vec<(i32, &'static str)> = Vec::new();
227    for ent in entities {
228        match ent {
229            ME::Bold(e) => {
230                insertions.push((e.offset, "**"));
231                insertions.push((e.offset + e.length, "**"));
232            }
233            ME::Italic(e) => {
234                insertions.push((e.offset, "__"));
235                insertions.push((e.offset + e.length, "__"));
236            }
237            ME::Strike(e) => {
238                insertions.push((e.offset, "~~"));
239                insertions.push((e.offset + e.length, "~~"));
240            }
241            ME::Spoiler(e) => {
242                insertions.push((e.offset, "||"));
243                insertions.push((e.offset + e.length, "||"));
244            }
245            ME::Code(e) => {
246                insertions.push((e.offset, "`"));
247                insertions.push((e.offset + e.length, "`"));
248            }
249            _ => {}
250        }
251    }
252    insertions.sort_by_key(|&(pos, _)| pos);
253
254    let mut result = String::with_capacity(text.len() + insertions.len() * 4);
255    let mut ins_idx = 0;
256    let mut utf16_pos: i32 = 0;
257    for ch in text.chars() {
258        while ins_idx < insertions.len() && insertions[ins_idx].0 <= utf16_pos {
259            result.push_str(insertions[ins_idx].1);
260            ins_idx += 1;
261        }
262        result.push(ch);
263        utf16_pos += ch.len_utf16() as i32;
264    }
265    while ins_idx < insertions.len() {
266        result.push_str(insertions[ins_idx].1);
267        ins_idx += 1;
268    }
269    result
270}
271
272// HTML parser: built-in hand-rolled (no extra deps)
273// Compiled when `html5ever` feature is NOT active.
274
275/// Parse a Telegram-compatible HTML string into (plain_text, entities).
276///
277/// Hand-rolled, zero-dependency implementation.  Override with the
278/// `html5ever` Cargo feature for a spec-compliant tokenizer.
279#[cfg(not(feature = "html5ever"))]
280pub fn parse_html(html: &str) -> (String, Vec<tl::enums::MessageEntity>) {
281    let mut out = String::with_capacity(html.len());
282    let mut ents = Vec::new();
283    let mut stack: Vec<(HtmlTag, i32, Option<String>)> = Vec::new();
284    let mut utf16_off: i32 = 0;
285
286    let bytes = html.as_bytes();
287    let len = bytes.len();
288    let mut i = 0;
289
290    while i < len {
291        if bytes[i] == b'<' {
292            let tag_start = i + 1;
293            let mut j = tag_start;
294            while j < len && bytes[j] != b'>' {
295                j += 1;
296            }
297            let tag_content = &html[tag_start..j];
298            i = j + 1;
299
300            let is_close = tag_content.starts_with('/');
301            let tag_str = if is_close {
302                tag_content[1..].trim()
303            } else {
304                tag_content.trim()
305            };
306            let (tag_name, attrs) = parse_tag(tag_str);
307
308            if is_close {
309                if let Some(pos) = stack.iter().rposition(|(t, _, _)| t.name() == tag_name) {
310                    let (htag, start_off, extra) = stack.remove(pos);
311                    let length = utf16_off - start_off;
312                    if length > 0 {
313                        let entity = match htag {
314                            HtmlTag::Bold => Some(tl::enums::MessageEntity::Bold(
315                                tl::types::MessageEntityBold {
316                                    offset: start_off,
317                                    length,
318                                },
319                            )),
320                            HtmlTag::Italic => Some(tl::enums::MessageEntity::Italic(
321                                tl::types::MessageEntityItalic {
322                                    offset: start_off,
323                                    length,
324                                },
325                            )),
326                            HtmlTag::Underline => Some(tl::enums::MessageEntity::Underline(
327                                tl::types::MessageEntityUnderline {
328                                    offset: start_off,
329                                    length,
330                                },
331                            )),
332                            HtmlTag::Strike => Some(tl::enums::MessageEntity::Strike(
333                                tl::types::MessageEntityStrike {
334                                    offset: start_off,
335                                    length,
336                                },
337                            )),
338                            HtmlTag::Spoiler => Some(tl::enums::MessageEntity::Spoiler(
339                                tl::types::MessageEntitySpoiler {
340                                    offset: start_off,
341                                    length,
342                                },
343                            )),
344                            HtmlTag::Code => Some(tl::enums::MessageEntity::Code(
345                                tl::types::MessageEntityCode {
346                                    offset: start_off,
347                                    length,
348                                },
349                            )),
350                            HtmlTag::Pre => {
351                                Some(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre {
352                                    offset: start_off,
353                                    length,
354                                    language: extra.unwrap_or_default(),
355                                }))
356                            }
357                            HtmlTag::Link(url) => {
358                                const PFX: &str = "tg://user?id=";
359                                if let Some(stripped) = url.strip_prefix(PFX) {
360                                    stripped.parse::<i64>().ok().map(|uid| {
361                                        tl::enums::MessageEntity::MentionName(
362                                            tl::types::MessageEntityMentionName {
363                                                offset: start_off,
364                                                length,
365                                                user_id: uid,
366                                            },
367                                        )
368                                    })
369                                } else {
370                                    Some(tl::enums::MessageEntity::TextUrl(
371                                        tl::types::MessageEntityTextUrl {
372                                            offset: start_off,
373                                            length,
374                                            url,
375                                        },
376                                    ))
377                                }
378                            }
379                            HtmlTag::CustomEmoji(id) => {
380                                Some(tl::enums::MessageEntity::CustomEmoji(
381                                    tl::types::MessageEntityCustomEmoji {
382                                        offset: start_off,
383                                        length,
384                                        document_id: id,
385                                    },
386                                ))
387                            }
388                            HtmlTag::Unknown => None,
389                        };
390                        if let Some(e) = entity {
391                            ents.push(e);
392                        }
393                    }
394                }
395            } else {
396                let htag = match tag_name {
397                    "b" | "strong" => HtmlTag::Bold,
398                    "i" | "em" => HtmlTag::Italic,
399                    "u" => HtmlTag::Underline,
400                    "s" | "del" | "strike" => HtmlTag::Strike,
401                    "tg-spoiler" => HtmlTag::Spoiler,
402                    "code" => HtmlTag::Code,
403                    "pre" => HtmlTag::Pre,
404                    "a" => HtmlTag::Link(
405                        attrs
406                            .iter()
407                            .find(|(k, _)| k == "href")
408                            .map(|(_, v)| v.clone())
409                            .unwrap_or_default(),
410                    ),
411                    "tg-emoji" => HtmlTag::CustomEmoji(
412                        attrs
413                            .iter()
414                            .find(|(k, _)| k == "emoji-id")
415                            .and_then(|(_, v)| v.parse::<i64>().ok())
416                            .unwrap_or(0),
417                    ),
418                    "br" => {
419                        out.push('\n');
420                        utf16_off += 1;
421                        continue;
422                    }
423                    _ => HtmlTag::Unknown,
424                };
425                stack.push((htag, utf16_off, None));
426            }
427        } else {
428            let text_start = i;
429            while i < len && bytes[i] != b'<' {
430                i += 1;
431            }
432            let decoded = decode_html_entities(&html[text_start..i]);
433            for ch in decoded.chars() {
434                out.push(ch);
435                utf16_off += ch.len_utf16() as i32;
436            }
437        }
438    }
439
440    (out, ents)
441}
442
/// Decode HTML character references in `s` in a single left-to-right pass.
///
/// Supported: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;` and numeric
/// references in decimal (`&#39;`) or hexadecimal (`&#x27;`) form.
/// Anything else starting with `&` is copied through unchanged.
///
/// Decoded output is never re-scanned, so `&amp;lt;` yields the literal text
/// `&lt;` rather than `<`.
#[cfg(not(feature = "html5ever"))]
fn decode_html_entities(s: &str) -> String {
    // Longest supported reference body is `#x10FFFF` (8 bytes); bounding the
    // search keeps a stray `&` from scanning the rest of the input.
    const MAX_NAME_LEN: usize = 10;

    /// Resolve the text between `&` and `;` to the character it denotes.
    fn resolve(name: &str) -> Option<char> {
        match name {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            "nbsp" => Some('\u{00A0}'),
            _ => {
                let num = name.strip_prefix('#')?;
                let (digits, radix) = match num.strip_prefix('x').or_else(|| num.strip_prefix('X'))
                {
                    Some(hex) => (hex, 16),
                    None => (num, 10),
                };
                // `from_str_radix` accepts a leading `+`, which is not valid here.
                if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                    return None;
                }
                u32::from_str_radix(digits, radix)
                    .ok()
                    .and_then(char::from_u32)
                    .filter(|&c| c != '\0')
            }
        }
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // `;` is ASCII, so its byte position is always a char boundary.
        let hit = after
            .bytes()
            .take(MAX_NAME_LEN + 1)
            .position(|b| b == b';')
            .and_then(|semi| resolve(&after[..semi]).map(|ch| (ch, semi)));
        match hit {
            Some((ch, semi)) => {
                out.push(ch);
                rest = &after[semi + 1..];
            }
            None => {
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
452
453#[cfg(not(feature = "html5ever"))]
454fn parse_tag(s: &str) -> (&str, Vec<(String, String)>) {
455    let mut parts = s.splitn(2, char::is_whitespace);
456    let name = parts.next().unwrap_or("").trim_end_matches('/');
457    let attrs = parse_attrs(parts.next().unwrap_or(""));
458    (name, attrs)
459}
460
/// Parse an HTML attribute list such as `href="x" class='a b' id=z` into
/// `(name, value)` pairs, in source order.
///
/// * Values may be double-quoted, single-quoted or unquoted (ending at the next
///   whitespace). Values are returned verbatim, without entity decoding.
/// * An unterminated quoted value extends to the end of the input.
/// * Attribute names are lower-cased. Valueless attributes are skipped; parsing
///   stops when no further `=` is found.
#[cfg(not(feature = "html5ever"))]
fn parse_attrs(s: &str) -> Vec<(String, String)> {
    let mut result = Vec::new();
    let mut rem = s.trim();
    while let Some(eq) = rem.find('=') {
        // Only the last token before `=` is the key; earlier tokens are
        // valueless attributes (e.g. `disabled href=x`).
        let key = rem[..eq]
            .split_whitespace()
            .next_back()
            .unwrap_or("")
            .to_ascii_lowercase();
        let after = rem[eq + 1..].trim_start();

        let quoted = after
            .strip_prefix('"')
            .map(|body| (body, '"'))
            .or_else(|| after.strip_prefix('\'').map(|body| (body, '\'')));
        let (val, rest) = match quoted {
            Some((body, quote)) => match body.find(quote) {
                Some(end) => (&body[..end], &body[end + 1..]),
                // No closing quote: take everything that is left.
                None => (body, ""),
            },
            None => {
                let end = after.find(char::is_whitespace).unwrap_or(after.len());
                (&after[..end], &after[end..])
            }
        };
        result.push((key, val.to_string()));
        rem = rest.trim_start();
    }
    result
}
487
/// An open HTML element tracked by the hand-rolled parser.
#[cfg(not(feature = "html5ever"))]
#[allow(dead_code)]
#[derive(Clone, Debug)]
enum HtmlTag {
    /// Bold text.
    Bold,
    /// Italic text.
    Italic,
    /// Underlined text.
    Underline,
    /// Struck-through text.
    Strike,
    /// Spoiler text.
    Spoiler,
    /// Inline code.
    Code,
    /// Preformatted block.
    Pre,
    /// Link carrying its target URL.
    Link(String),
    /// Custom emoji carrying its numeric id.
    CustomEmoji(i64),
    /// Any tag the parser does not recognise.
    Unknown,
}
503
504#[cfg(not(feature = "html5ever"))]
505impl HtmlTag {
506    fn name(&self) -> &str {
507        match self {
508            Self::Bold => "b",
509            Self::Italic => "i",
510            Self::Underline => "u",
511            Self::Strike => "s",
512            Self::Spoiler => "tg-spoiler",
513            Self::Code => "code",
514            Self::Pre => "pre",
515            Self::Link(_) => "a",
516            Self::CustomEmoji(_) => "tg-emoji",
517            Self::Unknown => "",
518        }
519    }
520}
521
522// HTML parser: html5ever backend
523// Compiled when `html5ever` feature IS active; overrides the built-in parser.
524
/// Parse a Telegram-compatible HTML string into (plain_text, entities).
///
/// Uses the [`html5ever`] spec-compliant tokenizer.  Enable the `html5ever`
/// Cargo feature to activate this implementation.
///
/// An entity is pushed with `length == 0` when its start tag is seen and gets
/// its real length when the matching end tag arrives; "open" therefore means
/// "length is still zero". End tags only ever close an *open* entity, so stray
/// or repeated end tags cannot alter entities that are already closed.
/// Zero-length elements and elements that are never closed are dropped from
/// the result. Offsets and lengths are UTF-16 code units of the returned text.
#[cfg(feature = "html5ever")]
#[cfg_attr(docsrs, doc(cfg(feature = "html5ever")))]
pub fn parse_html(html: &str) -> (String, Vec<tl::enums::MessageEntity>) {
    use html5ever::tendril::StrTendril;
    use html5ever::tokenizer::{
        BufferQueue, Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer,
    };
    use std::cell::Cell;

    struct Sink {
        text: Cell<String>,
        entities: Cell<Vec<tl::enums::MessageEntity>>,
        offset: Cell<i32>,
    }

    impl TokenSink for Sink {
        type Handle = ();

        fn process_token(&self, token: Token, _line: u64) -> TokenSinkResult<()> {
            // The sink is only reachable through `&self`, so state is moved out
            // of the cells, mutated locally and written back at the end.
            let mut text = self.text.take();
            let mut entities = self.entities.take();
            let mut offset = self.offset.get();

            // Close the most-recent open entity of `$kind` (open = length==0).
            // Removes the entity if start == end (zero-length element).
            macro_rules! close_ent {
                ($kind:ident) => {{
                    if let Some(idx) = entities.iter().rposition(|e| {
                        matches!(e, tl::enums::MessageEntity::$kind(inner) if inner.length == 0)
                    }) {
                        let closed_len = {
                            if let tl::enums::MessageEntity::$kind(ref mut inner) = entities[idx] {
                                inner.length = offset - inner.offset;
                                inner.length
                            } else {
                                unreachable!()
                            }
                        };
                        if closed_len == 0 {
                            entities.remove(idx);
                        }
                    }
                }};
            }

            match token {
                // Start tags
                Token::TagToken(Tag {
                    kind: TagKind::StartTag,
                    name,
                    attrs,
                    ..
                }) => {
                    let len0 = 0i32;
                    match name.as_ref() {
                        "b" | "strong" => entities.push(tl::enums::MessageEntity::Bold(
                            tl::types::MessageEntityBold {
                                offset,
                                length: len0,
                            },
                        )),
                        "i" | "em" => entities.push(tl::enums::MessageEntity::Italic(
                            tl::types::MessageEntityItalic {
                                offset,
                                length: len0,
                            },
                        )),
                        "u" => entities.push(tl::enums::MessageEntity::Underline(
                            tl::types::MessageEntityUnderline {
                                offset,
                                length: len0,
                            },
                        )),
                        "s" | "del" | "strike" => entities.push(tl::enums::MessageEntity::Strike(
                            tl::types::MessageEntityStrike {
                                offset,
                                length: len0,
                            },
                        )),
                        "tg-spoiler" => entities.push(tl::enums::MessageEntity::Spoiler(
                            tl::types::MessageEntitySpoiler {
                                offset,
                                length: len0,
                            },
                        )),
                        "code" => {
                            // Inside an open <pre>? Annotate language on the pre entity.
                            let in_pre = entities.last().map_or(
                                false,
                                |e| matches!(e, tl::enums::MessageEntity::Pre(p) if p.length == 0),
                            );
                            if in_pre {
                                let lang = attrs
                                    .iter()
                                    .find(|a| a.name.local.as_ref() == "class")
                                    .and_then(|a| {
                                        let v: &str = a.value.as_ref();
                                        v.strip_prefix("language-")
                                    })
                                    .map(|s| s.to_string())
                                    .unwrap_or_default();
                                if let Some(tl::enums::MessageEntity::Pre(ref mut p)) =
                                    entities.last_mut()
                                {
                                    p.language = lang;
                                }
                            } else {
                                entities.push(tl::enums::MessageEntity::Code(
                                    tl::types::MessageEntityCode {
                                        offset,
                                        length: len0,
                                    },
                                ));
                            }
                        }
                        "pre" => entities.push(tl::enums::MessageEntity::Pre(
                            tl::types::MessageEntityPre {
                                offset,
                                length: len0,
                                language: String::new(),
                            },
                        )),
                        "a" => {
                            let href = attrs
                                .iter()
                                .find(|a| a.name.local.as_ref() == "href")
                                .map(|a| {
                                    let v: &str = a.value.as_ref();
                                    v.to_string()
                                })
                                .unwrap_or_default();
                            const MENTION_PFX: &str = "tg://user?id=";
                            if let Some(id) = href.strip_prefix(MENTION_PFX) {
                                // A mention link with a non-numeric id yields no entity.
                                if let Ok(uid) = id.parse::<i64>() {
                                    entities.push(tl::enums::MessageEntity::MentionName(
                                        tl::types::MessageEntityMentionName {
                                            offset,
                                            length: len0,
                                            user_id: uid,
                                        },
                                    ));
                                }
                            } else {
                                entities.push(tl::enums::MessageEntity::TextUrl(
                                    tl::types::MessageEntityTextUrl {
                                        offset,
                                        length: len0,
                                        url: href,
                                    },
                                ));
                            }
                        }
                        "tg-emoji" => {
                            let doc_id = attrs
                                .iter()
                                .find(|a| a.name.local.as_ref() == "emoji-id")
                                .and_then(|a| {
                                    let v: &str = a.value.as_ref();
                                    v.parse::<i64>().ok()
                                })
                                .unwrap_or(0);
                            entities.push(tl::enums::MessageEntity::CustomEmoji(
                                tl::types::MessageEntityCustomEmoji {
                                    offset,
                                    length: len0,
                                    document_id: doc_id,
                                },
                            ));
                        }
                        "br" => {
                            text.push('\n');
                            offset += 1;
                        }
                        _ => {}
                    }
                }

                // End tags
                Token::TagToken(Tag {
                    kind: TagKind::EndTag,
                    name,
                    ..
                }) => {
                    match name.as_ref() {
                        "b" | "strong" => close_ent!(Bold),
                        "i" | "em" => close_ent!(Italic),
                        "u" => close_ent!(Underline),
                        "s" | "del" | "strike" => close_ent!(Strike),
                        "tg-spoiler" => close_ent!(Spoiler),
                        "code" => {
                            // Inside open <pre>: pre absorbs the code tag.
                            let in_pre = entities.last().map_or(
                                false,
                                |e| matches!(e, tl::enums::MessageEntity::Pre(p) if p.length == 0),
                            );
                            if !in_pre {
                                close_ent!(Code);
                            }
                        }
                        "pre" => close_ent!(Pre),
                        "a" => {
                            // The link may have nested entities pushed after it, so
                            // look for the most recent *open* link of either kind
                            // rather than only at the last entity.
                            let open_is_mention = entities.iter().rev().find_map(|e| match e {
                                tl::enums::MessageEntity::MentionName(m) if m.length == 0 => {
                                    Some(true)
                                }
                                tl::enums::MessageEntity::TextUrl(t) if t.length == 0 => {
                                    Some(false)
                                }
                                _ => None,
                            });
                            match open_is_mention {
                                Some(true) => close_ent!(MentionName),
                                Some(false) => close_ent!(TextUrl),
                                None => {}
                            }
                        }
                        "tg-emoji" => close_ent!(CustomEmoji),
                        _ => {}
                    }
                }

                // Text content
                Token::CharacterTokens(s) => {
                    let s_str: &str = s.as_ref();
                    offset += s_str.encode_utf16().count() as i32;
                    text.push_str(s_str);
                }

                _ => {}
            }

            self.text.replace(text);
            self.entities.replace(entities);
            self.offset.replace(offset);
            TokenSinkResult::Continue
        }
    }

    let mut input = BufferQueue::default();
    input.push_back(StrTendril::from_slice(html).try_reinterpret().unwrap());

    let tok = Tokenizer::new(
        Sink {
            text: Cell::new(String::with_capacity(html.len())),
            entities: Cell::new(Vec::new()),
            offset: Cell::new(0),
        },
        Default::default(),
    );
    let _ = tok.feed(&mut input);
    tok.end();

    let Sink { text, entities, .. } = tok.sink;
    let mut entities = entities.take();
    // Anything still at length zero belongs to a tag that was never closed.
    entities.retain(|e| {
        use tl::enums::MessageEntity as ME;
        let length = match e {
            ME::Bold(x) => x.length,
            ME::Italic(x) => x.length,
            ME::Underline(x) => x.length,
            ME::Strike(x) => x.length,
            ME::Spoiler(x) => x.length,
            ME::Code(x) => x.length,
            ME::Pre(x) => x.length,
            ME::TextUrl(x) => x.length,
            ME::MentionName(x) => x.length,
            ME::CustomEmoji(x) => x.length,
            _ => return true,
        };
        length > 0
    });
    (text.take(), entities)
}
775
776// HTML generator (always available, no html5ever dependency)
777
778/// Generate Telegram-compatible HTML from plain text + entities.
779pub fn generate_html(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
780    use tl::enums::MessageEntity as ME;
781
782    let mut markers: Vec<(i32, bool, String)> = Vec::new();
783
784    for ent in entities {
785        let (off, len, open, close) = match ent {
786            ME::Bold(e) => (e.offset, e.length, "<b>".into(), "</b>".into()),
787            ME::Italic(e) => (e.offset, e.length, "<i>".into(), "</i>".into()),
788            ME::Underline(e) => (e.offset, e.length, "<u>".into(), "</u>".into()),
789            ME::Strike(e) => (e.offset, e.length, "<s>".into(), "</s>".into()),
790            ME::Spoiler(e) => (
791                e.offset,
792                e.length,
793                "<tg-spoiler>".into(),
794                "</tg-spoiler>".into(),
795            ),
796            ME::Code(e) => (e.offset, e.length, "<code>".into(), "</code>".into()),
797            ME::Pre(e) => {
798                let lang = if e.language.is_empty() {
799                    String::new()
800                } else {
801                    format!(" class=\"language-{}\"", e.language)
802                };
803                (
804                    e.offset,
805                    e.length,
806                    format!("<pre><code{lang}>"),
807                    "</code></pre>".into(),
808                )
809            }
810            ME::TextUrl(e) => (
811                e.offset,
812                e.length,
813                format!("<a href=\"{}\">", escape_html(&e.url)),
814                "</a>".into(),
815            ),
816            ME::MentionName(e) => (
817                e.offset,
818                e.length,
819                format!("<a href=\"tg://user?id={}\">", e.user_id),
820                "</a>".into(),
821            ),
822            ME::CustomEmoji(e) => (
823                e.offset,
824                e.length,
825                format!("<tg-emoji emoji-id=\"{}\">", e.document_id),
826                "</tg-emoji>".into(),
827            ),
828            _ => continue,
829        };
830        markers.push((off, true, open));
831        markers.push((off + len, false, close));
832    }
833
834    markers.sort_by(|(a_pos, a_open, _), (b_pos, b_open, _)| {
835        a_pos.cmp(b_pos).then_with(|| b_open.cmp(a_open))
836    });
837
838    let mut result =
839        String::with_capacity(text.len() + markers.iter().map(|(_, _, s)| s.len()).sum::<usize>());
840    let mut marker_idx = 0;
841    let mut utf16_pos: i32 = 0;
842
843    for ch in text.chars() {
844        while marker_idx < markers.len() && markers[marker_idx].0 <= utf16_pos {
845            result.push_str(&markers[marker_idx].2);
846            marker_idx += 1;
847        }
848        match ch {
849            '&' => result.push_str("&amp;"),
850            '<' => result.push_str("&lt;"),
851            '>' => result.push_str("&gt;"),
852            '"' => result.push_str("&quot;"),
853            c => result.push(c),
854        }
855        utf16_pos += ch.len_utf16() as i32;
856    }
857    while marker_idx < markers.len() {
858        result.push_str(&markers[marker_idx].2);
859        marker_idx += 1;
860    }
861
862    result
863}
864
/// Escape the HTML-significant characters `&`, `<`, `>` and `"` in `s`.
///
/// Each of the four characters is replaced by its named entity (`&amp;`,
/// `&lt;`, `&gt;`, `&quot;`); every other character is copied unchanged.
/// Input that already contains entities is escaped again, so `&amp;`
/// becomes `&amp;amp;`.
fn escape_html(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
871
872// Tests
873
#[cfg(test)]
mod tests {
    use super::*;

    /// `**…**` yields one bold entity covering the wrapped word.
    #[test]
    fn markdown_bold() {
        let (plain, entities) = parse_markdown("Hello **world**!");
        assert_eq!(plain, "Hello world!");
        assert_eq!(entities.len(), 1);
        match &entities[0] {
            tl::enums::MessageEntity::Bold(bold) => {
                assert_eq!(bold.offset, 6);
                assert_eq!(bold.length, 5);
            }
            _ => panic!("expected bold"),
        }
    }

    /// Backticks are stripped and the first entity is inline code.
    #[test]
    fn markdown_inline_code() {
        let (plain, entities) = parse_markdown("Use `foo()` to do it");
        assert_eq!(plain, "Use foo() to do it");
        assert!(matches!(
            entities.first(),
            Some(tl::enums::MessageEntity::Code(_))
        ));
    }

    /// Two sibling tags produce two entities and tag-free text.
    #[test]
    fn html_bold_italic() {
        let (plain, entities) = parse_html("<b>bold</b> and <i>italic</i>");
        assert_eq!(plain, "bold and italic");
        assert_eq!(entities.len(), 2);
    }

    /// An anchor becomes a text-URL entity carrying the `href` value.
    #[test]
    fn html_link() {
        let (plain, entities) = parse_html("<a href=\"https://example.com\">click</a>");
        assert_eq!(plain, "click");
        match &entities[0] {
            tl::enums::MessageEntity::TextUrl(link) => {
                assert_eq!(link.url, "https://example.com");
            }
            _ => panic!("expected text url"),
        }
    }

    // HTML entity decoding is a hand-rolled-only feature; html5ever handles it natively.
    #[cfg(not(feature = "html5ever"))]
    #[test]
    fn html_entities_decoded() {
        let (plain, _) = parse_html("A &amp; B &lt;3&gt;");
        assert_eq!(plain, "A & B <3>");
    }

    /// Generated HTML parses back to the original text and entity count.
    #[test]
    fn generate_html_roundtrip() {
        let original = "Hello world";
        let bold = tl::types::MessageEntityBold {
            offset: 0,
            length: 5,
        };
        let entities = [tl::enums::MessageEntity::Bold(bold)];

        let html = generate_html(original, &entities);
        assert_eq!(html, "<b>Hello</b> world");

        let (round_tripped, parsed) = parse_html(&html);
        assert_eq!(round_tripped, original);
        assert_eq!(parsed.len(), 1);
    }
}