Skip to main content

proto_blue_api/
rich_text.rs

1//! Rich text with facet annotations (mentions, links, tags).
2//!
3//! Facets use UTF-8 byte offsets, which align naturally with Rust's `&str`.
4
5use regex::Regex;
6use std::sync::LazyLock;
7use unicode_segmentation::UnicodeSegmentation;
8
/// A facet annotation on a sub-string of rich text.
///
/// Pairs a byte range (`index`) with the list of features that apply to
/// that range. Field names are serialized in camelCase.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Facet {
    /// Byte range of the annotated text.
    pub index: ByteSlice,
    /// Annotations attached to that range.
    pub features: Vec<FacetFeature>,
}
16
/// UTF-8 byte range [start, end).
///
/// Serialized with camelCase keys (`byteStart`, `byteEnd`).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ByteSlice {
    /// Offset of the first byte in the range (inclusive).
    pub byte_start: usize,
    /// Offset one past the last byte in the range (exclusive).
    pub byte_end: usize,
}
24
/// A facet feature — what kind of annotation this is.
///
/// Serialized as an internally tagged enum: the variant is carried in a
/// `$type` field whose value is the string given in each `rename` below.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(tag = "$type")]
pub enum FacetFeature {
    /// A mention of an account. Facet detection in this file stores the
    /// handle text (without the `@`) in `did`; it does not resolve it.
    #[serde(rename = "app.bsky.richtext.facet#mention")]
    Mention { did: String },
    /// A hyperlink; `uri` is the link target.
    #[serde(rename = "app.bsky.richtext.facet#link")]
    Link { uri: String },
    /// A hashtag; facet detection in this file stores `tag` without the
    /// leading hash marker.
    #[serde(rename = "app.bsky.richtext.facet#tag")]
    Tag { tag: String },
}
36
/// A segment of rich text — either plain or annotated.
#[derive(Debug, Clone)]
pub struct RichTextSegment {
    /// The text covered by this segment.
    pub text: String,
    /// The facet covering `text`, or `None` for a plain segment.
    pub facet: Option<Facet>,
}
43
44impl RichTextSegment {
45    pub fn is_mention(&self) -> bool {
46        self.facet.as_ref().is_some_and(|f| {
47            f.features
48                .iter()
49                .any(|feat| matches!(feat, FacetFeature::Mention { .. }))
50        })
51    }
52
53    pub fn is_link(&self) -> bool {
54        self.facet.as_ref().is_some_and(|f| {
55            f.features
56                .iter()
57                .any(|feat| matches!(feat, FacetFeature::Link { .. }))
58        })
59    }
60
61    pub fn is_tag(&self) -> bool {
62        self.facet.as_ref().is_some_and(|f| {
63            f.features
64                .iter()
65                .any(|feat| matches!(feat, FacetFeature::Tag { .. }))
66        })
67    }
68}
69
/// Rich text with facet annotations.
///
/// Text is stored as a UTF-8 string. Facet indices are UTF-8 byte offsets,
/// which is Rust's native string indexing — no conversion needed.
#[derive(Debug, Clone)]
pub struct RichText {
    /// The raw text.
    text: String,
    /// Facets annotating `text`; the constructor stores them sorted by
    /// `byte_start` with empty ranges removed.
    facets: Vec<Facet>,
}
79
80impl RichText {
81    /// Create a new RichText from text, optionally with pre-detected facets.
82    pub fn new(text: impl Into<String>, facets: Option<Vec<Facet>>) -> Self {
83        let text = text.into();
84        let mut facets = facets.unwrap_or_default();
85        // Filter invalid facets and sort by byte_start
86        facets.retain(|f| f.index.byte_start < f.index.byte_end);
87        facets.sort_by_key(|f| f.index.byte_start);
88        RichText { text, facets }
89    }
90
91    /// The raw text.
92    pub fn text(&self) -> &str {
93        &self.text
94    }
95
96    /// The facets.
97    pub fn facets(&self) -> &[Facet] {
98        &self.facets
99    }
100
101    /// UTF-8 byte length.
102    pub fn len(&self) -> usize {
103        self.text.len()
104    }
105
106    /// Whether the text is empty.
107    pub fn is_empty(&self) -> bool {
108        self.text.is_empty()
109    }
110
111    /// Grapheme cluster count (user-perceived characters).
112    pub fn grapheme_length(&self) -> usize {
113        self.text.graphemes(true).count()
114    }
115
116    /// Detect facets (mentions, links, tags) in the text.
117    /// This does NOT resolve @mentions to DIDs — use `detect_facets_with_resolver`
118    /// for that. Mentions will have `did` set to the handle text.
119    pub fn detect_facets(&mut self) {
120        self.facets = detect_facets(&self.text);
121    }
122
123    /// Insert text at a UTF-8 byte offset, adjusting facets.
124    pub fn insert(&mut self, index: usize, insert_text: &str) {
125        let added = insert_text.len();
126        self.text.insert_str(index, insert_text);
127
128        for facet in &mut self.facets {
129            if index <= facet.index.byte_start {
130                // Insert before facet: shift both
131                facet.index.byte_start += added;
132                facet.index.byte_end += added;
133            } else if index < facet.index.byte_end {
134                // Insert inside facet: expand end
135                facet.index.byte_end += added;
136            }
137            // Insert after: no change
138        }
139    }
140
141    /// Delete a byte range [start, end), adjusting facets.
142    pub fn delete(&mut self, start: usize, end: usize) {
143        let removed = end - start;
144
145        // Replace the range in the string
146        self.text.replace_range(start..end, "");
147
148        for facet in &mut self.facets {
149            let fs = facet.index.byte_start;
150            let fe = facet.index.byte_end;
151
152            if start <= fs && end >= fe {
153                // A: Deletion spans entire facet → collapse
154                facet.index.byte_start = start;
155                facet.index.byte_end = start;
156            } else if start >= fe {
157                // B: Deletion entirely after facet → no change
158            } else if start > fs && end >= fe {
159                // C: Deletion overlaps end → truncate
160                facet.index.byte_end = start;
161            } else if start > fs && end < fe {
162                // D: Deletion entirely inside facet → shrink
163                facet.index.byte_end -= removed;
164            } else if start <= fs && end > fs && end < fe {
165                // E: Deletion overlaps start → shift start, shrink
166                facet.index.byte_start = start;
167                facet.index.byte_end -= removed;
168            } else if end <= fs {
169                // F: Deletion entirely before facet → shift both
170                facet.index.byte_start -= removed;
171                facet.index.byte_end -= removed;
172            }
173        }
174
175        // Remove collapsed facets
176        self.facets
177            .retain(|f| f.index.byte_start < f.index.byte_end);
178    }
179
180    /// Iterate over segments of the rich text.
181    pub fn segments(&self) -> Vec<RichTextSegment> {
182        if self.facets.is_empty() {
183            return vec![RichTextSegment {
184                text: self.text.clone(),
185                facet: None,
186            }];
187        }
188
189        let mut segments = Vec::new();
190        let mut cursor = 0;
191
192        for facet in &self.facets {
193            let start = facet.index.byte_start;
194            let end = facet.index.byte_end.min(self.text.len());
195
196            // Plain text before this facet
197            if cursor < start {
198                segments.push(RichTextSegment {
199                    text: self.text[cursor..start].to_string(),
200                    facet: None,
201                });
202            }
203
204            // The faceted segment
205            let seg_text = &self.text[start..end];
206            if !seg_text.trim().is_empty() {
207                segments.push(RichTextSegment {
208                    text: seg_text.to_string(),
209                    facet: Some(facet.clone()),
210                });
211            } else {
212                segments.push(RichTextSegment {
213                    text: seg_text.to_string(),
214                    facet: None,
215                });
216            }
217
218            cursor = end;
219        }
220
221        // Remaining text after last facet
222        if cursor < self.text.len() {
223            segments.push(RichTextSegment {
224                text: self.text[cursor..].to_string(),
225                facet: None,
226            });
227        }
228
229        segments
230    }
231}
232
233// --- Facet detection ---
234
/// Matches an `@handle` mention that follows start-of-text, whitespace, or `(`.
///
/// Group 1 is the `@` itself. Group 2 is the handle: it starts with an ASCII
/// alphanumeric, may continue with alphanumerics, `.` or `-` (ending on an
/// alphanumeric), and finishes with `.` plus at least two ASCII letters.
/// The leading delimiter is matched but not captured.
static MENTION_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?:^|\s|\()(@)([a-zA-Z0-9]([a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,})")
        .expect("mention regex")
});
239
/// Matches an `http://` or `https://` URL that follows start-of-text,
/// whitespace, or `(`.
///
/// Group 1 is the URL: the scheme plus every following non-whitespace
/// character, so trailing punctuation is included in the raw match.
static URL_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:^|\s|\()(https?://[\S]+)").expect("url regex"));
242
/// Matches a hashtag that follows start-of-text or whitespace.
///
/// Group 1 is the tag body after the hash marker: a run of characters that
/// are neither whitespace nor one of U+00AD, U+2060, U+200A, U+200B, U+200C,
/// U+200D, containing at least one character that is also not a digit and
/// not Unicode punctuation (so a purely numeric tag does not match).
///
/// NOTE(review): the marker class `[##]` lists `#` twice. Presumably the
/// second entry was meant to be a different hash character (e.g. the
/// fullwidth `＃`, U+FF03) and was lost in transcription — confirm against
/// the original source, and check every consumer's offset arithmetic,
/// before changing it.
static TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?:^|\s)[##]([^\s\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}]*[^\d\s\p{Punctuation}\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}]+[^\s\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}]*)")
        .expect("tag regex")
});
247
248/// Detect facets in text without DID resolution.
249pub fn detect_facets(text: &str) -> Vec<Facet> {
250    let mut facets = Vec::new();
251
252    // Detect mentions: @handle.domain
253    for cap in MENTION_RE.captures_iter(text) {
254        let handle_match = cap.get(2).unwrap();
255        let handle = handle_match.as_str();
256
257        // byte_start from the '@', not from any leading whitespace
258        let at_match = cap.get(1).unwrap();
259        let byte_start = at_match.start();
260        let byte_end = handle_match.end();
261
262        facets.push(Facet {
263            index: ByteSlice {
264                byte_start,
265                byte_end,
266            },
267            features: vec![FacetFeature::Mention {
268                did: handle.to_string(),
269            }],
270        });
271    }
272
273    // Detect URLs
274    for cap in URL_RE.captures_iter(text) {
275        let url_match = cap.get(1).unwrap();
276        let mut uri = url_match.as_str().to_string();
277        let byte_start = url_match.start();
278        let mut byte_end = url_match.end();
279
280        // Strip trailing punctuation
281        while uri.ends_with(['.', ',', ';', ':', '!', '?']) {
282            uri.pop();
283            byte_end -= 1;
284        }
285
286        // Strip trailing ')' if no '(' in URL
287        if uri.ends_with(')') && !uri.contains('(') {
288            uri.pop();
289            byte_end -= 1;
290        }
291
292        facets.push(Facet {
293            index: ByteSlice {
294                byte_start,
295                byte_end,
296            },
297            features: vec![FacetFeature::Link { uri }],
298        });
299    }
300
301    // Detect hashtags: #tag
302    for cap in TAG_RE.captures_iter(text) {
303        let tag_match = cap.get(1).unwrap();
304        let tag = tag_match.as_str();
305
306        // Limit tags to 64 chars
307        if tag.is_empty() || tag.len() > 64 {
308            continue;
309        }
310
311        // Strip trailing punctuation from tag
312        let tag_trimmed = tag.trim_end_matches(|c: char| c.is_ascii_punctuation());
313        if tag_trimmed.is_empty() {
314            continue;
315        }
316
317        // The full match includes the '#', find its byte position
318        let full_match = cap.get(0).unwrap();
319        // Find the '#' or '#' in the full match
320        let hash_pos = full_match
321            .as_str()
322            .find('#')
323            .or_else(|| full_match.as_str().find('#'))
324            .unwrap_or(0);
325        let byte_start = full_match.start() + hash_pos;
326        let byte_end = byte_start + 1 + tag_trimmed.len(); // '#' + tag text
327
328        // Clamp to not exceed text bounds
329        let byte_end = byte_end.min(text.len());
330
331        facets.push(Facet {
332            index: ByteSlice {
333                byte_start,
334                byte_end,
335            },
336            features: vec![FacetFeature::Tag {
337                tag: tag_trimmed.to_string(),
338            }],
339        });
340    }
341
342    facets.sort_by_key(|f| f.index.byte_start);
343    facets
344}
345
// Unit tests for construction, facet detection, segmentation, and the
// facet-adjusting insert/delete operations.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn basic_text_no_facets() {
        let rt = RichText::new("Hello, world!", None);
        assert_eq!(rt.text(), "Hello, world!");
        assert!(rt.facets().is_empty());
        assert_eq!(rt.len(), 13);
        assert_eq!(rt.grapheme_length(), 13);
    }

    #[test]
    fn detect_mention() {
        let mut rt = RichText::new("Hello @alice.bsky.social!", None);
        rt.detect_facets();
        assert_eq!(rt.facets().len(), 1);
        let f = &rt.facets()[0];
        // Unresolved mentions carry the handle text in `did`.
        assert!(
            matches!(&f.features[0], FacetFeature::Mention { did } if did == "alice.bsky.social")
        );
        // The facet range covers the '@' but not the trailing '!'.
        assert_eq!(
            &rt.text()[f.index.byte_start..f.index.byte_end],
            "@alice.bsky.social"
        );
    }

    #[test]
    fn detect_url() {
        let mut rt = RichText::new("Check https://example.com/path here", None);
        rt.detect_facets();
        assert_eq!(rt.facets().len(), 1);
        let f = &rt.facets()[0];
        assert!(
            matches!(&f.features[0], FacetFeature::Link { uri } if uri == "https://example.com/path")
        );
    }

    #[test]
    fn detect_url_strips_trailing_punctuation() {
        let mut rt = RichText::new("Visit https://example.com.", None);
        rt.detect_facets();
        assert_eq!(rt.facets().len(), 1);
        let f = &rt.facets()[0];
        assert!(
            matches!(&f.features[0], FacetFeature::Link { uri } if uri == "https://example.com")
        );
    }

    #[test]
    fn detect_url_strips_trailing_paren_without_open() {
        let mut rt = RichText::new("(see https://example.com/page)", None);
        rt.detect_facets();
        assert_eq!(rt.facets().len(), 1);
        let f = &rt.facets()[0];
        // URL doesn't contain '(' so trailing ')' is stripped
        assert!(
            matches!(&f.features[0], FacetFeature::Link { uri } if uri == "https://example.com/page")
        );
    }

    #[test]
    fn detect_hashtag() {
        let mut rt = RichText::new("Hello #atproto world", None);
        rt.detect_facets();
        assert_eq!(rt.facets().len(), 1);
        let f = &rt.facets()[0];
        // The stored tag excludes the leading '#'.
        assert!(matches!(&f.features[0], FacetFeature::Tag { tag } if tag == "atproto"));
    }

    #[test]
    fn detect_multiple_facets() {
        let mut rt = RichText::new("@alice.test posted https://example.com #cool", None);
        rt.detect_facets();
        // Facets come back sorted by byte_start: mention, link, tag.
        assert_eq!(rt.facets().len(), 3);
        assert!(
            rt.facets()[0]
                .features
                .iter()
                .any(|f| matches!(f, FacetFeature::Mention { .. }))
        );
        assert!(
            rt.facets()[1]
                .features
                .iter()
                .any(|f| matches!(f, FacetFeature::Link { .. }))
        );
        assert!(
            rt.facets()[2]
                .features
                .iter()
                .any(|f| matches!(f, FacetFeature::Tag { .. }))
        );
    }

    #[test]
    fn segments_no_facets() {
        let rt = RichText::new("Hello world", None);
        let segs = rt.segments();
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Hello world");
        assert!(segs[0].facet.is_none());
    }

    #[test]
    fn segments_with_facets() {
        let mut rt = RichText::new("Hello @alice.test world", None);
        rt.detect_facets();
        let segs = rt.segments();
        // plain / mention / plain
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[0].text, "Hello ");
        assert!(segs[0].facet.is_none());
        assert_eq!(segs[1].text, "@alice.test");
        assert!(segs[1].is_mention());
        assert_eq!(segs[2].text, " world");
        assert!(segs[2].facet.is_none());
    }

    #[test]
    fn insert_before_facet() {
        let facets = vec![Facet {
            index: ByteSlice {
                byte_start: 6,
                byte_end: 11,
            },
            features: vec![FacetFeature::Tag {
                tag: "test".to_string(),
            }],
        }];
        let mut rt = RichText::new("Hello #test", Some(facets));
        rt.insert(0, "Hey ");
        assert_eq!(rt.text(), "Hey Hello #test");
        // Both ends shift by the 4 inserted bytes.
        assert_eq!(rt.facets()[0].index.byte_start, 10);
        assert_eq!(rt.facets()[0].index.byte_end, 15);
    }

    #[test]
    fn insert_inside_facet() {
        let facets = vec![Facet {
            index: ByteSlice {
                byte_start: 0,
                byte_end: 5,
            },
            features: vec![FacetFeature::Link {
                uri: "https://example.com".to_string(),
            }],
        }];
        let mut rt = RichText::new("Hello world", Some(facets));
        rt.insert(3, "XX");
        assert_eq!(rt.text(), "HelXXlo world");
        // Start stays put; the end grows by the 2 inserted bytes.
        assert_eq!(rt.facets()[0].index.byte_start, 0);
        assert_eq!(rt.facets()[0].index.byte_end, 7);
    }

    #[test]
    fn delete_before_facet() {
        let facets = vec![Facet {
            index: ByteSlice {
                byte_start: 6,
                byte_end: 11,
            },
            features: vec![FacetFeature::Tag {
                tag: "test".to_string(),
            }],
        }];
        let mut rt = RichText::new("Hello #test", Some(facets));
        rt.delete(0, 6);
        assert_eq!(rt.text(), "#test");
        // Both ends shift left by the 6 removed bytes.
        assert_eq!(rt.facets()[0].index.byte_start, 0);
        assert_eq!(rt.facets()[0].index.byte_end, 5);
    }

    #[test]
    fn delete_spanning_facet_removes_it() {
        let facets = vec![Facet {
            index: ByteSlice {
                byte_start: 6,
                byte_end: 11,
            },
            features: vec![FacetFeature::Tag {
                tag: "test".to_string(),
            }],
        }];
        let mut rt = RichText::new("Hello #test world", Some(facets));
        // The deleted range [5, 12) fully covers the facet [6, 11).
        rt.delete(5, 12);
        assert_eq!(rt.text(), "Helloworld");
        assert!(rt.facets().is_empty());
    }

    #[test]
    fn grapheme_length_emoji() {
        let rt = RichText::new("Hi 👋🏽", None);
        // "Hi " = 3 graphemes, waving hand + skin-tone modifier = 1 grapheme
        assert_eq!(rt.grapheme_length(), 4);
        // But byte length is much longer (emoji is multi-byte)
        assert!(rt.len() > 4);
    }

    #[test]
    fn utf8_byte_offsets_work_natively() {
        // In Rust, string indexing is already UTF-8 bytes
        let text = "Héllo @alice.test";
        let mut rt = RichText::new(text, None);
        rt.detect_facets();
        assert_eq!(rt.facets().len(), 1);
        let f = &rt.facets()[0];
        // The two-byte 'é' before the mention must not skew the offsets.
        assert_eq!(
            &rt.text()[f.index.byte_start..f.index.byte_end],
            "@alice.test"
        );
    }

    #[test]
    fn empty_text() {
        let rt = RichText::new("", None);
        assert!(rt.is_empty());
        assert_eq!(rt.len(), 0);
        assert_eq!(rt.grapheme_length(), 0);
        // Even empty text yields a single (empty) plain segment.
        let segs = rt.segments();
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "");
    }

    #[test]
    fn facet_feature_serde_roundtrip() {
        let facet = Facet {
            index: ByteSlice {
                byte_start: 0,
                byte_end: 5,
            },
            features: vec![FacetFeature::Mention {
                did: "did:plc:abc123".to_string(),
            }],
        };
        let json = serde_json::to_string(&facet).unwrap();
        // The variant is written as its `$type` tag string.
        assert!(json.contains("app.bsky.richtext.facet#mention"));
        let parsed: Facet = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.index.byte_start, 0);
        assert!(
            matches!(&parsed.features[0], FacetFeature::Mention { did } if did == "did:plc:abc123")
        );
    }
}
585        assert!(
586            matches!(&parsed.features[0], FacetFeature::Mention { did } if did == "did:plc:abc123")
587        );
588    }
589}