Skip to main content

layer_client/
parsers.rs

//! Text formatting parsers: HTML and Markdown ↔ Telegram [`MessageEntity`]
//!
//! # Markdown (Telegram-flavoured)
//! Supported: `**bold**`, `__italic__`, `~~strike~~`, `||spoiler||`, `` `code` ``,
//! ` ```lang\npre``` `, `[text](url)`, `[text](tg://user?id=123)`
//!
//! # HTML
//! Supported tags: `<b>`, `<strong>`, `<i>`, `<em>`, `<u>`, `<s>`, `<del>`, `<strike>`,
//! `<code>`, `<pre>`, `<tg-spoiler>`, `<br>`, `<a href="url">`,
//! `<tg-emoji emoji-id="id">text</tg-emoji>`
11
12use layer_tl_types as tl;
13
14// ─── Markdown ─────────────────────────────────────────────────────────────────
15
16/// Parse Telegram-flavoured markdown into (plain_text, entities).
17///
18/// Supported syntax:
19/// - `**bold**`
20/// - `__italic__`
21/// - `~~strikethrough~~`
22/// - `||spoiler||`
23/// - `` `inline code` ``
24/// - ` ```lang\ncode\n``` `
25/// - `[text](url)` or `[text](tg://user?id=123)`
26pub fn parse_markdown(text: &str) -> (String, Vec<tl::enums::MessageEntity>) {
27    let mut out   = String::with_capacity(text.len());
28    let mut ents  = Vec::new();
29    let chars: Vec<char> = text.chars().collect();
30    let n = chars.len();
31    let mut i = 0;
32
33    // Stack of open tags: (variant, byte_offset_in_utf16)
34    // We track utf-16 offsets because Telegram uses them.
35    let mut open_stack: Vec<(MarkdownTag, i32)> = Vec::new();
36
37    // Current output utf-16 offset
38    let mut utf16_off: i32 = 0;
39
40    macro_rules! push_char {
41        ($c:expr) => {{
42            let c: char = $c;
43            out.push(c);
44            utf16_off += c.len_utf16() as i32;
45        }};
46    }
47
48    while i < n {
49        // ── code block ```lang\n...``` ──────────────────────────────────────
50        if i + 2 < n && chars[i] == '`' && chars[i+1] == '`' && chars[i+2] == '`' {
51            let start = i + 3;
52            // find closing ```
53            let mut j = start;
54            while j + 2 < n {
55                if chars[j] == '`' && chars[j+1] == '`' && chars[j+2] == '`' { break; }
56                j += 1;
57            }
58            if j + 2 < n {
59                // extract optional lang on first line
60                let block: String = chars[start..j].iter().collect();
61                let (lang, code) = if let Some(nl) = block.find('\n') {
62                    (block[..nl].trim().to_string(), block[nl+1..].to_string())
63                } else {
64                    (String::new(), block)
65                };
66                let code_off = utf16_off;
67                let code_utf16: i32 = code.encode_utf16().count() as i32;
68                ents.push(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre {
69                    offset: code_off, length: code_utf16, language: lang,
70                }));
71                for c in code.chars() { push_char!(c); }
72                i = j + 3;
73                continue;
74            }
75        }
76
77        // ── inline code ─────────────────────────────────────────────────────
78        if chars[i] == '`' {
79            let start = i + 1;
80            let mut j = start;
81            while j < n && chars[j] != '`' { j += 1; }
82            if j < n {
83                let code: String = chars[start..j].iter().collect();
84                let code_off = utf16_off;
85                let code_utf16: i32 = code.encode_utf16().count() as i32;
86                ents.push(tl::enums::MessageEntity::Code(tl::types::MessageEntityCode {
87                    offset: code_off, length: code_utf16,
88                }));
89                for c in code.chars() { push_char!(c); }
90                i = j + 1;
91                continue;
92            }
93        }
94
95        // ── [text](url) ─────────────────────────────────────────────────────
96        if chars[i] == '[' {
97            let text_start = i + 1;
98            let mut j = text_start;
99            let mut depth = 1i32;
100            while j < n {
101                if chars[j] == '[' { depth += 1; }
102                if chars[j] == ']' { depth -= 1; if depth == 0 { break; } }
103                j += 1;
104            }
105            if j < n && j + 1 < n && chars[j+1] == '(' {
106                let link_start = j + 2;
107                let mut k = link_start;
108                while k < n && chars[k] != ')' { k += 1; }
109                if k < n {
110                    let inner_text: String = chars[text_start..j].iter().collect();
111                    let url: String = chars[link_start..k].iter().collect();
112                    const MENTION_PFX: &str = "tg://user?id=";
113                    let ent_off = utf16_off;
114                    for c in inner_text.chars() { push_char!(c); }
115                    let ent_len = utf16_off - ent_off;
116                    if url.starts_with(MENTION_PFX) {
117                        if let Ok(uid) = url[MENTION_PFX.len()..].parse::<i64>() {
118                            ents.push(tl::enums::MessageEntity::MentionName(
119                                tl::types::MessageEntityMentionName { offset: ent_off, length: ent_len, user_id: uid }
120                            ));
121                        }
122                    } else {
123                        ents.push(tl::enums::MessageEntity::TextUrl(
124                            tl::types::MessageEntityTextUrl { offset: ent_off, length: ent_len, url }
125                        ));
126                    }
127                    i = k + 1;
128                    continue;
129                }
130            }
131        }
132
133        // ── two-char delimiters ──────────────────────────────────────────────
134        let two: Option<(&str, MarkdownTag)> = if i + 1 < n {
135            let pair = [chars[i], chars[i+1]];
136            match pair {
137                ['*','*'] => Some(("**", MarkdownTag::Bold)),
138                ['_','_'] => Some(("__", MarkdownTag::Italic)),
139                ['~','~'] => Some(("~~", MarkdownTag::Strike)),
140                ['|','|'] => Some(("||", MarkdownTag::Spoiler)),
141                _ => None,
142            }
143        } else { None };
144
145        if let Some((_delim, tag)) = two {
146            // check if closing
147            if let Some(pos) = open_stack.iter().rposition(|(t, _)| *t == tag) {
148                let (_, start_off) = open_stack.remove(pos);
149                let length = utf16_off - start_off;
150                let entity = match tag {
151                    MarkdownTag::Bold    => tl::enums::MessageEntity::Bold(tl::types::MessageEntityBold { offset: start_off, length }),
152                    MarkdownTag::Italic  => tl::enums::MessageEntity::Italic(tl::types::MessageEntityItalic { offset: start_off, length }),
153                    MarkdownTag::Strike  => tl::enums::MessageEntity::Strike(tl::types::MessageEntityStrike { offset: start_off, length }),
154                    MarkdownTag::Spoiler => tl::enums::MessageEntity::Spoiler(tl::types::MessageEntitySpoiler { offset: start_off, length }),
155                };
156                if length > 0 { ents.push(entity); }
157            } else {
158                open_stack.push((tag, utf16_off));
159            }
160            i += 2;
161            continue;
162        }
163
164        // ── regular character ────────────────────────────────────────────────
165        push_char!(chars[i]);
166        i += 1;
167    }
168
169    (out, ents)
170}
171
/// Inline style kinds that are written with a two-character delimiter.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MarkdownTag {
    /// Bold text.
    Bold,
    /// Italic text.
    Italic,
    /// Struck-through text.
    Strike,
    /// Spoiler (hidden until revealed) text.
    Spoiler,
}
174
175/// Generate Telegram markdown from plain text + entities.
176pub fn generate_markdown(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
177    use tl::enums::MessageEntity as ME;
178
179    // collect (utf16_pos, marker_str) insertions
180    let mut insertions: Vec<(i32, &'static str)> = Vec::new();
181
182    for ent in entities {
183        match ent {
184            ME::Bold(e)    => { insertions.push((e.offset, "**")); insertions.push((e.offset+e.length, "**")); }
185            ME::Italic(e)  => { insertions.push((e.offset, "__")); insertions.push((e.offset+e.length, "__")); }
186            ME::Strike(e)  => { insertions.push((e.offset, "~~")); insertions.push((e.offset+e.length, "~~")); }
187            ME::Spoiler(e) => { insertions.push((e.offset, "||")); insertions.push((e.offset+e.length, "||")); }
188            ME::Code(e)    => { insertions.push((e.offset, "`"));  insertions.push((e.offset+e.length, "`")); }
189            _ => {} // complex types handled separately
190        }
191    }
192    insertions.sort_by_key(|&(pos, _)| pos);
193
194    // Insert markers at utf-16 positions
195    let mut result = String::with_capacity(text.len() + insertions.len() * 4);
196    let mut ins_idx = 0;
197    let mut utf16_pos: i32 = 0;
198
199    for ch in text.chars() {
200        while ins_idx < insertions.len() && insertions[ins_idx].0 <= utf16_pos {
201            result.push_str(insertions[ins_idx].1);
202            ins_idx += 1;
203        }
204        result.push(ch);
205        utf16_pos += ch.len_utf16() as i32;
206    }
207    while ins_idx < insertions.len() {
208        result.push_str(insertions[ins_idx].1);
209        ins_idx += 1;
210    }
211
212    // Handle pre/code blocks and links (append as-is for now – complex nesting handled by callers)
213    for ent in entities {
214        match ent {
215            tl::enums::MessageEntity::Pre(_) | tl::enums::MessageEntity::TextUrl(_) |
216            tl::enums::MessageEntity::MentionName(_) => {
217                // These require more complex insertion logic; callers should use parse_markdown
218                // for round-trip use cases.
219            }
220            _ => {}
221        }
222    }
223
224    result
225}
226
227// ─── HTML parser ──────────────────────────────────────────────────────────────
228
229/// Parse a Telegram-compatible HTML string into (plain_text, entities).
230///
231/// Supported tags:
232/// - `<b>` / `<strong>` → Bold
233/// - `<i>` / `<em>` → Italic
234/// - `<u>` → Underline
235/// - `<s>` / `<del>` / `<strike>` → Strikethrough
236/// - `<code>` → Code (inside `<pre>` → Pre with language)
237/// - `<pre>` → Pre block
238/// - `<tg-spoiler>` → Spoiler
239/// - `<a href="...">` → TextUrl or MentionName
240/// - `<tg-emoji emoji-id="...">` → CustomEmoji
241pub fn parse_html(html: &str) -> (String, Vec<tl::enums::MessageEntity>) {
242    let mut out    = String::with_capacity(html.len());
243    let mut ents   = Vec::new();
244    // stack of (tag_name, utf16_start)
245    let mut stack: Vec<(HtmlTag, i32, Option<String>)> = Vec::new();
246    let mut utf16_off: i32 = 0;
247
248    let bytes = html.as_bytes();
249    let len = bytes.len();
250    let mut i = 0;
251
252    while i < len {
253        if bytes[i] == b'<' {
254            // find end of tag
255            let tag_start = i + 1;
256            let mut j = tag_start;
257            while j < len && bytes[j] != b'>' { j += 1; }
258            let tag_content = &html[tag_start..j];
259            i = j + 1;
260
261            let is_close = tag_content.starts_with('/');
262            let tag_str = if is_close { tag_content[1..].trim() } else { tag_content.trim() };
263
264            // parse tag name and attributes
265            let (tag_name, attrs) = parse_tag(tag_str);
266
267            if is_close {
268                // pop from stack
269                if let Some(pos) = stack.iter().rposition(|(t, _, _)| t.name() == tag_name) {
270                    let (htag, start_off, extra) = stack.remove(pos);
271                    let length = utf16_off - start_off;
272                    if length > 0 {
273                        let entity = match htag {
274                            HtmlTag::Bold    => Some(tl::enums::MessageEntity::Bold(tl::types::MessageEntityBold { offset: start_off, length })),
275                            HtmlTag::Italic  => Some(tl::enums::MessageEntity::Italic(tl::types::MessageEntityItalic { offset: start_off, length })),
276                            HtmlTag::Underline => Some(tl::enums::MessageEntity::Underline(tl::types::MessageEntityUnderline { offset: start_off, length })),
277                            HtmlTag::Strike  => Some(tl::enums::MessageEntity::Strike(tl::types::MessageEntityStrike { offset: start_off, length })),
278                            HtmlTag::Spoiler => Some(tl::enums::MessageEntity::Spoiler(tl::types::MessageEntitySpoiler { offset: start_off, length })),
279                            HtmlTag::Code    => {
280                                // check if inside <pre>: if so, enrich parent pre with language
281                                Some(tl::enums::MessageEntity::Code(tl::types::MessageEntityCode { offset: start_off, length }))
282                            }
283                            HtmlTag::Pre     => {
284                                let lang = extra.unwrap_or_default();
285                                Some(tl::enums::MessageEntity::Pre(tl::types::MessageEntityPre { offset: start_off, length, language: lang }))
286                            }
287                            HtmlTag::Link(url) => {
288                                const PFX: &str = "tg://user?id=";
289                                if url.starts_with(PFX) {
290                                    if let Ok(uid) = url[PFX.len()..].parse::<i64>() {
291                                        Some(tl::enums::MessageEntity::MentionName(tl::types::MessageEntityMentionName { offset: start_off, length, user_id: uid }))
292                                    } else { None }
293                                } else {
294                                    Some(tl::enums::MessageEntity::TextUrl(tl::types::MessageEntityTextUrl { offset: start_off, length, url }))
295                                }
296                            }
297                            HtmlTag::CustomEmoji(id) => {
298                                Some(tl::enums::MessageEntity::CustomEmoji(tl::types::MessageEntityCustomEmoji { offset: start_off, length, document_id: id }))
299                            }
300                            HtmlTag::Unknown => None,
301                        };
302                        if let Some(e) = entity { ents.push(e); }
303                    }
304                }
305            } else {
306                // open tag
307                let htag = match tag_name {
308                    "b" | "strong" => HtmlTag::Bold,
309                    "i" | "em"     => HtmlTag::Italic,
310                    "u"            => HtmlTag::Underline,
311                    "s" | "del" | "strike" => HtmlTag::Strike,
312                    "tg-spoiler"   => HtmlTag::Spoiler,
313                    "code"         => HtmlTag::Code,
314                    "pre"          => HtmlTag::Pre,
315                    "a"            => {
316                        let href = attrs.iter()
317                            .find(|(k, _)| k == "href")
318                            .map(|(_, v)| v.clone())
319                            .unwrap_or_default();
320                        HtmlTag::Link(href)
321                    }
322                    "tg-emoji" => {
323                        let id = attrs.iter()
324                            .find(|(k, _)| k == "emoji-id")
325                            .and_then(|(_, v)| v.parse::<i64>().ok())
326                            .unwrap_or(0);
327                        HtmlTag::CustomEmoji(id)
328                    }
329                    "br" => {
330                        // Self-closing — emit newline
331                        out.push('\n');
332                        utf16_off += 1;
333                        continue;
334                    }
335                    _ => HtmlTag::Unknown,
336                };
337                stack.push((htag, utf16_off, None));
338            }
339        } else {
340            // Text (handle entities)
341            let text_start = i;
342            while i < len && bytes[i] != b'<' { i += 1; }
343            let raw_text = &html[text_start..i];
344            let decoded  = decode_html_entities(raw_text);
345            for ch in decoded.chars() {
346                out.push(ch);
347                utf16_off += ch.len_utf16() as i32;
348            }
349        }
350    }
351
352    (out, ents)
353}
354
/// Decode HTML character references in `s`.
///
/// Recognised: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;` and
/// numeric references in decimal (`&#39;`) or hexadecimal (`&#x27;`) form.
/// Anything else that starts with `&` is copied through unchanged.
///
/// The input is decoded in a single left-to-right pass, so the result of one
/// replacement is never decoded again: `&amp;lt;` yields `&lt;`, not `<`.
fn decode_html_entities(s: &str) -> String {
    /// Longest reference body (between `&` and `;`) that is looked for.
    /// Bounding the search keeps the scan linear on text full of bare `&`.
    const MAX_NAME_LEN: usize = 10;

    /// Decode the body of a numeric reference such as `#39` or `#x27`.
    fn numeric(name: &str) -> Option<char> {
        let body = name.strip_prefix('#')?;
        let (radix, digits) = match body.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex) => (16, hex),
            None => (10, body),
        };
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32)
            .filter(|&c| c != '\0')
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        rest = &rest[amp..]; // `rest` now starts with '&'

        let semi = rest.as_bytes()[1..]
            .iter()
            .take(MAX_NAME_LEN)
            .position(|&b| b == b';');

        let decoded = semi.and_then(|semi| {
            // '&' and ';' are ASCII, so these indices are char boundaries.
            let name = &rest[1..1 + semi];
            let ch = match name {
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "quot" => Some('"'),
                "apos" => Some('\''),
                "nbsp" => Some('\u{00A0}'),
                _ => numeric(name),
            };
            ch.map(|c| (c, semi + 2))
        });

        match decoded {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &rest[consumed..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally.
                out.push('&');
                rest = &rest[1..];
            }
        }
    }

    out.push_str(rest);
    out
}
363
364fn parse_tag(s: &str) -> (&str, Vec<(String, String)>) {
365    let mut parts = s.splitn(2, char::is_whitespace);
366    let name = parts.next().unwrap_or("").trim_end_matches('/');
367    let rest = parts.next().unwrap_or("");
368    let attrs = parse_attrs(rest);
369    (name, attrs)
370}
371
/// Parse the attribute part of a tag into `(name, value)` pairs, in source order.
///
/// Values may be double-quoted, single-quoted or unquoted (ending at the next
/// whitespace). Attribute names are lower-cased. A quoted value with no
/// closing quote extends to the end of the input. Attributes written without
/// `=` are skipped. Values are returned as written; character references are
/// not decoded here.
fn parse_attrs(s: &str) -> Vec<(String, String)> {
    let mut result = Vec::new();
    let mut rem = s.trim();

    while let Some(eq) = rem.find('=') {
        // Only the token directly before `=` is the name; anything earlier is
        // a valueless attribute (e.g. `disabled href=...`).
        let key = rem[..eq]
            .split_whitespace()
            .last()
            .unwrap_or("")
            .to_ascii_lowercase();
        rem = rem[eq + 1..].trim_start();

        let (val, rest) = match rem.chars().next() {
            Some(quote @ ('"' | '\'')) => {
                let body = &rem[1..];
                match body.find(quote) {
                    Some(end) => (&body[..end], &body[end + 1..]),
                    // Unterminated quote: take everything that is left.
                    None => (body, ""),
                }
            }
            _ => {
                let end = rem.find(char::is_whitespace).unwrap_or(rem.len());
                (&rem[..end], &rem[end..])
            }
        };

        result.push((key, val.to_string()));
        rem = rest.trim_start();
    }

    result
}
398
/// An HTML element recognised by the parser, together with whatever
/// attribute data it carries.
#[allow(dead_code)]
#[derive(Debug, Clone)]
enum HtmlTag {
    /// Bold text.
    Bold,
    /// Italic text.
    Italic,
    /// Underlined text.
    Underline,
    /// Struck-through text.
    Strike,
    /// Spoiler text.
    Spoiler,
    /// Inline code.
    Code,
    /// Preformatted block.
    Pre,
    /// Hyperlink; the payload is the link target.
    Link(String),
    /// Custom emoji; the payload is its numeric id.
    CustomEmoji(i64),
    /// Any element that is not recognised.
    Unknown,
}
413
414impl HtmlTag {
415    fn name(&self) -> &str {
416        match self {
417            Self::Bold        => "b",
418            Self::Italic      => "i",
419            Self::Underline   => "u",
420            Self::Strike      => "s",
421            Self::Spoiler     => "tg-spoiler",
422            Self::Code        => "code",
423            Self::Pre         => "pre",
424            Self::Link(_)     => "a",
425            Self::CustomEmoji(_) => "tg-emoji",
426            Self::Unknown     => "",
427        }
428    }
429}
430
431/// Generate Telegram-compatible HTML from plain text + entities.
432pub fn generate_html(text: &str, entities: &[tl::enums::MessageEntity]) -> String {
433    use tl::enums::MessageEntity as ME;
434
435    // Build list of (utf16_pos, is_open, html_fragment)
436    let mut markers: Vec<(i32, bool, String)> = Vec::new();
437
438    for ent in entities {
439        let (off, len, open, close) = match ent {
440            ME::Bold(e)      => (e.offset, e.length, "<b>".into(), "</b>".into()),
441            ME::Italic(e)    => (e.offset, e.length, "<i>".into(), "</i>".into()),
442            ME::Underline(e) => (e.offset, e.length, "<u>".into(), "</u>".into()),
443            ME::Strike(e)    => (e.offset, e.length, "<s>".into(), "</s>".into()),
444            ME::Spoiler(e)   => (e.offset, e.length, "<tg-spoiler>".into(), "</tg-spoiler>".into()),
445            ME::Code(e)      => (e.offset, e.length, "<code>".into(), "</code>".into()),
446            ME::Pre(e)       => {
447                let lang = if e.language.is_empty() { String::new() }
448                           else { format!(" class=\"language-{}\"", e.language) };
449                (e.offset, e.length, format!("<pre><code{lang}>"), "</code></pre>".into())
450            }
451            ME::TextUrl(e)   => (e.offset, e.length, format!("<a href=\"{}\">", escape_html(&e.url)), "</a>".into()),
452            ME::MentionName(e) => (e.offset, e.length, format!("<a href=\"tg://user?id={}\">", e.user_id), "</a>".into()),
453            ME::CustomEmoji(e) => (e.offset, e.length, format!("<tg-emoji emoji-id=\"{}\">", e.document_id), "</tg-emoji>".into()),
454            _ => continue,
455        };
456        markers.push((off,       true,  open));
457        markers.push((off + len, false, close));
458    }
459
460    // Sort: opens before closes at same position
461    markers.sort_by(|(a_pos, a_open, _), (b_pos, b_open, _)| {
462        a_pos.cmp(b_pos).then_with(|| b_open.cmp(a_open)) // open=true sorts before close=false
463    });
464
465    let mut result = String::with_capacity(text.len() + markers.iter().map(|(_, _, s)| s.len()).sum::<usize>());
466    let mut marker_idx = 0;
467    let mut utf16_pos: i32 = 0;
468
469    for ch in text.chars() {
470        while marker_idx < markers.len() && markers[marker_idx].0 <= utf16_pos {
471            result.push_str(&markers[marker_idx].2);
472            marker_idx += 1;
473        }
474        // Escape the character
475        match ch {
476            '&' => result.push_str("&amp;"),
477            '<' => result.push_str("&lt;"),
478            '>' => result.push_str("&gt;"),
479            '"' => result.push_str("&quot;"),
480            c   => result.push(c),
481        }
482        utf16_pos += ch.len_utf16() as i32;
483    }
484    while marker_idx < markers.len() {
485        result.push_str(&markers[marker_idx].2);
486        marker_idx += 1;
487    }
488
489    result
490}
491
/// Escape `&`, `<`, `>` and `"` so that `s` can be embedded in HTML text or
/// in a double-quoted attribute value.
fn escape_html(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
495
496// ─── Tests ────────────────────────────────────────────────────────────────────
497
#[cfg(test)]
mod tests {
    use super::*;

    /// `**…**` is stripped from the text and reported as one bold entity.
    #[test]
    fn markdown_bold() {
        let (text, ents) = parse_markdown("Hello **world**!");
        assert_eq!(text, "Hello world!");
        assert_eq!(ents.len(), 1);
        match ents.first() {
            Some(tl::enums::MessageEntity::Bold(b)) => {
                assert_eq!(b.offset, 6);
                assert_eq!(b.length, 5);
            }
            _ => panic!("expected bold"),
        }
    }

    /// Backticks are stripped and the first entity is inline code.
    #[test]
    fn markdown_inline_code() {
        let (text, ents) = parse_markdown("Use `foo()` to do it");
        assert_eq!(text, "Use foo() to do it");
        assert!(matches!(ents[0], tl::enums::MessageEntity::Code(_)));
    }

    /// Two sibling elements give two entities and tag-free text.
    #[test]
    fn html_bold_italic() {
        let (text, ents) = parse_html("<b>bold</b> and <i>italic</i>");
        assert_eq!(text, "bold and italic");
        assert_eq!(ents.len(), 2);
    }

    /// An anchor becomes a text-URL entity carrying the `href` value.
    #[test]
    fn html_link() {
        let (text, ents) = parse_html("<a href=\"https://example.com\">click</a>");
        assert_eq!(text, "click");
        match &ents[0] {
            tl::enums::MessageEntity::TextUrl(e) => assert_eq!(e.url, "https://example.com"),
            _ => panic!("expected text url"),
        }
    }

    /// Character references in text are decoded.
    #[test]
    fn html_entities_decoded() {
        let (text, _) = parse_html("A &amp; B &lt;3&gt;");
        assert_eq!(text, "A & B <3>");
    }

    /// Generated HTML parses back to the original text and entity count.
    #[test]
    fn generate_html_roundtrip() {
        let original = "Hello world";
        let bold = tl::types::MessageEntityBold { offset: 0, length: 5 };
        let entities = vec![tl::enums::MessageEntity::Bold(bold)];

        let html = generate_html(original, &entities);
        assert_eq!(html, "<b>Hello</b> world");

        let (back, ents2) = parse_html(&html);
        assert_eq!(back, original);
        assert_eq!(ents2.len(), 1);
    }
}