tg_flows/types/
message_entity.rs

1use std::{cmp, ops::Range};
2
3use serde::{Deserialize, Serialize};
4
5use crate::types::{User, UserId};
6
/// This object represents one special entity in a text message.
///
/// For example, hashtags, usernames, URLs, etc.
///
/// `offset` and `length` are measured in UTF-16 code units, not in bytes of a
/// Rust (UTF-8) string; use [`MessageEntityRef::parse`] to obtain byte ranges.
///
/// [The official docs](https://core.telegram.org/bots/api#messageentity).
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct MessageEntity {
    /// What kind of entity this is, plus any kind-specific data.
    ///
    /// Flattened by serde: the `type` tag and the payload of the kind live at
    /// the same level as `offset` and `length`, e.g.
    /// `{"type":"pre","offset":1,"length":2,"language":"rust"}`.
    #[serde(flatten)]
    pub kind: MessageEntityKind,

    /// Offset in UTF-16 code units to the start of the entity.
    pub offset: usize,

    /// Length of the entity in UTF-16 code units.
    pub length: usize,
}
23
/// A "parsed" [`MessageEntity`].
///
/// [`MessageEntity`] has offsets in UTF-**16** code units, but in Rust we
/// mostly work with UTF-**8**. In order to use an entity we need to convert
/// UTF-16 offsets to UTF-8 ones. This type represents a message entity with
/// converted offsets and a reference to the text.
///
/// You can get [`MessageEntityRef`]s by calling [`parse_entities`] and
/// [`parse_caption_entities`] methods of [`Message`] or by calling
/// [`MessageEntityRef::parse`].
///
/// [`parse_entities`]: crate::types::Message::parse_entities
/// [`parse_caption_entities`]: crate::types::Message::parse_caption_entities
/// [`Message`]: crate::types::Message
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct MessageEntityRef<'a> {
    /// The full text of the message the entity belongs to.
    message: &'a str,
    /// Byte (UTF-8) range of the entity inside `message`.
    range: Range<usize>,
    /// Kind of the entity, borrowed from the originating [`MessageEntity`].
    kind: &'a MessageEntityKind,
}
44
45impl MessageEntity {
46    #[must_use]
47    pub const fn new(kind: MessageEntityKind, offset: usize, length: usize) -> Self {
48        Self {
49            kind,
50            offset,
51            length,
52        }
53    }
54
55    /// Create a message entity representing a bold text.
56    #[must_use]
57    pub const fn bold(offset: usize, length: usize) -> Self {
58        Self {
59            kind: MessageEntityKind::Bold,
60            offset,
61            length,
62        }
63    }
64
65    /// Create a message entity representing an italic text.
66    #[must_use]
67    pub const fn italic(offset: usize, length: usize) -> Self {
68        Self {
69            kind: MessageEntityKind::Italic,
70            offset,
71            length,
72        }
73    }
74
75    /// Create a message entity representing an underline text.
76    #[must_use]
77    pub const fn underline(offset: usize, length: usize) -> Self {
78        Self {
79            kind: MessageEntityKind::Underline,
80            offset,
81            length,
82        }
83    }
84
85    /// Create a message entity representing a strikethrough text.
86    #[must_use]
87    pub const fn strikethrough(offset: usize, length: usize) -> Self {
88        Self {
89            kind: MessageEntityKind::Strikethrough,
90            offset,
91            length,
92        }
93    }
94
95    /// Create a message entity representing a spoiler text.
96    #[must_use]
97    pub const fn spoiler(offset: usize, length: usize) -> Self {
98        Self {
99            kind: MessageEntityKind::Spoiler,
100            offset,
101            length,
102        }
103    }
104
105    /// Create a message entity representing a monowidth text.
106    #[must_use]
107    pub const fn code(offset: usize, length: usize) -> Self {
108        Self {
109            kind: MessageEntityKind::Code,
110            offset,
111            length,
112        }
113    }
114
115    /// Create a message entity representing a monowidth block.
116    #[must_use]
117    pub const fn pre(language: Option<String>, offset: usize, length: usize) -> Self {
118        Self {
119            kind: MessageEntityKind::Pre { language },
120            offset,
121            length,
122        }
123    }
124
125    /// Create a message entity representing a clickable text URL.
126    #[must_use]
127    pub const fn text_link(url: url::Url, offset: usize, length: usize) -> Self {
128        Self {
129            kind: MessageEntityKind::TextLink { url },
130            offset,
131            length,
132        }
133    }
134
135    /// Create a message entity representing a text mention.
136    ///
137    /// # Note
138    ///
139    /// If you don't have a complete [`User`] value, please use
140    /// [`MessageEntity::text_mention_id`] instead.
141    #[must_use]
142    pub const fn text_mention(user: User, offset: usize, length: usize) -> Self {
143        Self {
144            kind: MessageEntityKind::TextMention { user },
145            offset,
146            length,
147        }
148    }
149
150    /// Create a message entity representing a text link in the form of
151    /// `tg://user/?id=...` that mentions user with `user_id`.
152    #[must_use]
153    pub fn text_mention_id(user_id: UserId, offset: usize, length: usize) -> Self {
154        Self {
155            kind: MessageEntityKind::TextLink { url: user_id.url() },
156            offset,
157            length,
158        }
159    }
160
161    /// Create a message entity representing a custom emoji.
162    #[must_use]
163    pub const fn custom_emoji(custom_emoji_id: String, offset: usize, length: usize) -> Self {
164        Self {
165            kind: MessageEntityKind::CustomEmoji { custom_emoji_id },
166            offset,
167            length,
168        }
169    }
170
171    #[must_use]
172    pub fn kind(mut self, val: MessageEntityKind) -> Self {
173        self.kind = val;
174        self
175    }
176
177    #[must_use]
178    pub const fn offset(mut self, val: usize) -> Self {
179        self.offset = val;
180        self
181    }
182
183    #[must_use]
184    pub const fn length(mut self, val: usize) -> Self {
185        self.length = val;
186        self
187    }
188}
189
190impl<'a> MessageEntityRef<'a> {
191    /// Returns kind of this entity.
192    #[must_use]
193    pub fn kind(&self) -> &'a MessageEntityKind {
194        self.kind
195    }
196
197    /// Returns the text that this entity is related to.
198    #[must_use]
199    pub fn text(&self) -> &'a str {
200        &self.message[self.range.clone()]
201    }
202
203    /// Returns range that this entity is related to.
204    ///
205    /// The range is in bytes for UTF-8 encoding i.e. you can use it with common
206    /// Rust strings.
207    #[must_use]
208    pub fn range(&self) -> Range<usize> {
209        self.range.clone()
210    }
211
212    /// Returns the offset (in bytes, for UTF-8) to the start of this entity in
213    /// the original message.
214    #[must_use]
215    pub fn start(&self) -> usize {
216        self.range.start
217    }
218
219    /// Returns the offset (in bytes, for UTF-8) to the end of this entity in
220    /// the original message.
221    #[must_use]
222    pub fn end(&self) -> usize {
223        self.range.end
224    }
225
226    /// Returns the length of this entity in bytes for UTF-8 encoding.
227    #[allow(clippy::len_without_is_empty)]
228    #[must_use]
229    pub fn len(&self) -> usize {
230        self.range.len()
231    }
232
233    /// Returns the full text of the original message.
234    #[must_use]
235    pub fn message_text(&self) -> &'a str {
236        self.message
237    }
238
239    /// Parses telegram [`MessageEntity`]s converting offsets to UTF-8.
240    #[must_use]
241    pub fn parse(text: &'a str, entities: &'a [MessageEntity]) -> Vec<Self> {
242        // This creates entities with **wrong** offsets (UTF-16) that we later patch.
243        let mut entities: Vec<_> = entities
244            .iter()
245            .map(|e| Self {
246                message: text,
247                range: e.offset..e.offset + e.length,
248                kind: &e.kind,
249            })
250            .collect();
251
252        // Convert offsets
253
254        // References to all offsets that need patching
255        let mut offsets: Vec<&mut usize> = entities
256            .iter_mut()
257            .flat_map(
258                |Self {
259                     range: Range { start, end },
260                     ..
261                 }| [start, end],
262            )
263            .collect();
264
265        // Sort in decreasing order, so the smallest elements are at the end and can be
266        // removed more easily
267        offsets.sort_unstable_by_key(|&&mut offset| cmp::Reverse(offset));
268
269        let _ = text
270            .chars()
271            .chain(['\0']) // this is needed to process offset pointing at the end of the string
272            .try_fold((0, 0), |(len_utf8, len_utf16), c| {
273                // Stop if there are no more offsets to patch
274                if offsets.is_empty() {
275                    return None;
276                }
277
278                // Patch all offsets that can be patched
279                while offsets
280                    .last()
281                    .map(|&&mut offset| offset <= len_utf16)
282                    .unwrap_or(false)
283                {
284                    let offset = offsets.pop().unwrap();
285                    assert_eq!(*offset, len_utf16, "Invalid utf-16 offset");
286
287                    // Patch the offset to be UTF-8
288                    *offset = len_utf8;
289                }
290
291                // Update "running" length
292                Some((len_utf8 + c.len_utf8(), len_utf16 + c.len_utf16()))
293            });
294
295        entities
296    }
297}
298
/// The kind of a [`MessageEntity`], together with any kind-specific data.
///
/// (De)serialized as an internally tagged enum: the variant name in
/// `snake_case` is stored in the `type` field (e.g. `"type":"text_link"`) and
/// the variant's own fields sit next to it. Fields that are `None` are
/// omitted when serializing.
#[serde_with_macros::skip_serializing_none]
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[serde(tag = "type")]
pub enum MessageEntityKind {
    /// Tagged `"mention"`.
    Mention,
    /// Tagged `"hashtag"`.
    Hashtag,
    /// Tagged `"cashtag"`.
    Cashtag,
    /// Tagged `"bot_command"`.
    BotCommand,
    /// Tagged `"url"`.
    Url,
    /// Tagged `"email"`.
    Email,
    /// Tagged `"phone_number"`.
    PhoneNumber,
    /// Bold text.
    Bold,
    /// Italic text.
    Italic,
    /// Underlined text.
    Underline,
    /// Strikethrough text.
    Strikethrough,
    /// Spoiler text.
    Spoiler,
    /// Monowidth text.
    Code,
    /// Monowidth block, with an optional `language`.
    Pre { language: Option<String> },
    /// Clickable text that opens `url`.
    TextLink { url: url::Url },
    /// Text that mentions `user`.
    TextMention { user: User },
    /// Custom emoji identified by `custom_emoji_id`.
    CustomEmoji { custom_emoji_id: String }, // FIXME(waffle): newtype this
}
322
#[cfg(test)]
mod tests {
    use serde_json::{from_str, to_string};

    use super::*;
    use MessageEntityKind::*;

    /// Flattens parsed entities into `(text, kind)` pairs, keeping their order,
    /// so a whole parse result can be compared with a single `assert_eq!`.
    fn texts_and_kinds<'a>(
        parsed: &[MessageEntityRef<'a>],
    ) -> Vec<(&'a str, &'a MessageEntityKind)> {
        parsed
            .iter()
            .map(|entity| (entity.text(), entity.kind()))
            .collect()
    }

    /// The flattened `kind` is deserialized together with its payload.
    #[test]
    fn recursive_kind() {
        let expected =
            MessageEntity::text_link(url::Url::parse("https://example.com").unwrap(), 1, 2);
        let actual: MessageEntity =
            from_str(r#"{"type":"text_link","url":"https://example.com","offset":1,"length":2}"#)
                .unwrap();

        assert_eq!(expected, actual);
    }

    /// A `pre` entity picks up its `language` regardless of key order.
    #[test]
    fn pre() {
        let expected = MessageEntity::pre(Some("rust".to_string()), 1, 2);
        let actual: MessageEntity =
            from_str(r#"{"type":"pre","offset":1,"length":2,"language":"rust"}"#).unwrap();

        assert_eq!(expected, actual);
    }

    // https://github.com/teloxide/teloxide-core/pull/145
    /// A missing language must not be serialized at all.
    #[test]
    fn pre_with_none_language() {
        let json = to_string(&MessageEntity::pre(None, 1, 2)).unwrap();

        assert_eq!(json.find("language"), None);
    }

    /// Two-byte (Cyrillic) characters: one UTF-16 unit each, two UTF-8 bytes.
    #[test]
    fn parse_быба() {
        let entities = [
            MessageEntity::new(Strikethrough, 0, 1),
            MessageEntity::new(Bold, 1, 1),
            MessageEntity::new(Italic, 2, 1),
            MessageEntity::new(Code, 3, 1),
        ];
        let parsed = MessageEntityRef::parse("быба", &entities);

        assert_eq!(
            texts_and_kinds(&parsed),
            [
                ("б", &Strikethrough),
                ("ы", &Bold),
                ("б", &Italic),
                ("а", &Code),
            ]
        );
    }

    /// A three-byte character before the entity shifts the UTF-8 offsets.
    #[test]
    fn parse_symbol_24bit() {
        let entities = [MessageEntity::new(Hashtag, 5, 3)];
        let parsed = MessageEntityRef::parse("xx আ #tt", &entities);

        assert_eq!(texts_and_kinds(&parsed), [("#tt", &Hashtag)]);
    }

    /// Overlapping entities that share a start offset.
    #[test]
    fn parse_enclosed() {
        // For some reason this is how telegram encodes <b>b <i>i</i> b</b>
        let entities = [
            MessageEntity::new(Bold, 0, 2),
            MessageEntity::new(Bold, 2, 3),
            MessageEntity::new(Italic, 2, 1),
        ];
        let parsed = MessageEntityRef::parse("b i b", &entities);

        assert_eq!(
            texts_and_kinds(&parsed),
            [("b ", &Bold), ("i b", &Bold), ("i", &Italic)]
        );
    }

    /// No entities in, no entities out.
    #[test]
    fn parse_nothing() {
        let parsed = MessageEntityRef::parse("a", &[]);

        assert!(parsed.is_empty());
    }

    /// Zero-length entities on an empty text.
    #[test]
    fn parse_empty() {
        // It should be impossible for this to be returned from telegram, but just to be
        // sure
        let entities = [MessageEntity::new(Bold, 0, 0), MessageEntity::new(Italic, 0, 0)];
        let parsed = MessageEntityRef::parse("", &entities);

        assert_eq!(texts_and_kinds(&parsed), [("", &Bold), ("", &Italic)]);
    }
}