Skip to main content

proto_blue_api/
rich_text.rs

1//! Rich text with facet annotations (mentions, links, tags).
2//!
3//! Facets use UTF-8 byte offsets, which align naturally with Rust's `&str`.
4
5use regex::Regex;
6use std::sync::LazyLock;
7use unicode_segmentation::UnicodeSegmentation;
8
9/// A facet annotation on a sub-string of rich text.
10#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
11#[serde(rename_all = "camelCase")]
12pub struct Facet {
13    pub index: ByteSlice,
14    pub features: Vec<FacetFeature>,
15}
16
17/// UTF-8 byte range [start, end).
18#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
19#[serde(rename_all = "camelCase")]
20pub struct ByteSlice {
21    pub byte_start: usize,
22    pub byte_end: usize,
23}
24
25/// A facet feature — what kind of annotation this is.
26#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
27#[serde(tag = "$type")]
28pub enum FacetFeature {
29    #[serde(rename = "app.bsky.richtext.facet#mention")]
30    Mention { did: String },
31    #[serde(rename = "app.bsky.richtext.facet#link")]
32    Link { uri: String },
33    #[serde(rename = "app.bsky.richtext.facet#tag")]
34    Tag { tag: String },
35}
36
37/// A segment of rich text — either plain or annotated.
38#[derive(Debug, Clone)]
39pub struct RichTextSegment {
40    pub text: String,
41    pub facet: Option<Facet>,
42}
43
44impl RichTextSegment {
45    #[must_use]
46    pub fn is_mention(&self) -> bool {
47        self.facet.as_ref().is_some_and(|f| {
48            f.features
49                .iter()
50                .any(|feat| matches!(feat, FacetFeature::Mention { .. }))
51        })
52    }
53
54    #[must_use]
55    pub fn is_link(&self) -> bool {
56        self.facet.as_ref().is_some_and(|f| {
57            f.features
58                .iter()
59                .any(|feat| matches!(feat, FacetFeature::Link { .. }))
60        })
61    }
62
63    #[must_use]
64    pub fn is_tag(&self) -> bool {
65        self.facet.as_ref().is_some_and(|f| {
66            f.features
67                .iter()
68                .any(|feat| matches!(feat, FacetFeature::Tag { .. }))
69        })
70    }
71}
72
73/// Rich text with facet annotations.
74///
75/// Text is stored as a UTF-8 string. Facet indices are UTF-8 byte offsets,
76/// which is Rust's native string indexing — no conversion needed.
77#[derive(Debug, Clone)]
78pub struct RichText {
79    text: String,
80    facets: Vec<Facet>,
81}
82
83impl RichText {
84    /// Create a new `RichText` from text, optionally with pre-detected facets.
85    pub fn new(text: impl Into<String>, facets: Option<Vec<Facet>>) -> Self {
86        let text = text.into();
87        let mut facets = facets.unwrap_or_default();
88        // Filter invalid facets and sort by byte_start
89        facets.retain(|f| f.index.byte_start < f.index.byte_end);
90        facets.sort_by_key(|f| f.index.byte_start);
91        Self { text, facets }
92    }
93
94    /// The raw text.
95    #[must_use]
96    pub fn text(&self) -> &str {
97        &self.text
98    }
99
100    /// The facets.
101    #[must_use]
102    pub fn facets(&self) -> &[Facet] {
103        &self.facets
104    }
105
106    /// UTF-8 byte length.
107    #[must_use]
108    pub fn len(&self) -> usize {
109        self.text.len()
110    }
111
112    /// Whether the text is empty.
113    #[must_use]
114    pub fn is_empty(&self) -> bool {
115        self.text.is_empty()
116    }
117
118    /// Grapheme cluster count (user-perceived characters).
119    #[must_use]
120    pub fn grapheme_length(&self) -> usize {
121        self.text.graphemes(true).count()
122    }
123
124    /// Detect facets (mentions, links, tags) in the text.
125    /// This does NOT resolve @mentions to DIDs — use `detect_facets_with_resolver`
126    /// for that. Mentions will have `did` set to the handle text.
127    pub fn detect_facets(&mut self) {
128        self.facets = detect_facets(&self.text);
129    }
130
131    /// Insert text at a UTF-8 byte offset, adjusting facets.
132    pub fn insert(&mut self, index: usize, insert_text: &str) {
133        let added = insert_text.len();
134        self.text.insert_str(index, insert_text);
135
136        for facet in &mut self.facets {
137            if index <= facet.index.byte_start {
138                // Insert before facet: shift both
139                facet.index.byte_start += added;
140                facet.index.byte_end += added;
141            } else if index < facet.index.byte_end {
142                // Insert inside facet: expand end
143                facet.index.byte_end += added;
144            }
145            // Insert after: no change
146        }
147    }
148
149    /// Delete a byte range [start, end), adjusting facets.
150    pub fn delete(&mut self, start: usize, end: usize) {
151        let removed = end - start;
152
153        // Replace the range in the string
154        self.text.replace_range(start..end, "");
155
156        for facet in &mut self.facets {
157            let fs = facet.index.byte_start;
158            let fe = facet.index.byte_end;
159
160            if start <= fs && end >= fe {
161                // A: Deletion spans entire facet → collapse
162                facet.index.byte_start = start;
163                facet.index.byte_end = start;
164            } else if start >= fe {
165                // B: Deletion entirely after facet → no change
166            } else if start > fs && end >= fe {
167                // C: Deletion overlaps end → truncate
168                facet.index.byte_end = start;
169            } else if start > fs && end < fe {
170                // D: Deletion entirely inside facet → shrink
171                facet.index.byte_end -= removed;
172            } else if start <= fs && end > fs && end < fe {
173                // E: Deletion overlaps start → shift start, shrink
174                facet.index.byte_start = start;
175                facet.index.byte_end -= removed;
176            } else if end <= fs {
177                // F: Deletion entirely before facet → shift both
178                facet.index.byte_start -= removed;
179                facet.index.byte_end -= removed;
180            }
181        }
182
183        // Remove collapsed facets
184        self.facets
185            .retain(|f| f.index.byte_start < f.index.byte_end);
186    }
187
188    /// Iterate over segments of the rich text.
189    #[must_use]
190    pub fn segments(&self) -> Vec<RichTextSegment> {
191        if self.facets.is_empty() {
192            return vec![RichTextSegment {
193                text: self.text.clone(),
194                facet: None,
195            }];
196        }
197
198        let mut segments = Vec::new();
199        let mut cursor = 0;
200
201        for facet in &self.facets {
202            let start = facet.index.byte_start;
203            let end = facet.index.byte_end.min(self.text.len());
204
205            // Plain text before this facet
206            if cursor < start {
207                segments.push(RichTextSegment {
208                    text: self.text[cursor..start].to_string(),
209                    facet: None,
210                });
211            }
212
213            // The faceted segment
214            let seg_text = &self.text[start..end];
215            if seg_text.trim().is_empty() {
216                segments.push(RichTextSegment {
217                    text: seg_text.to_string(),
218                    facet: None,
219                });
220            } else {
221                segments.push(RichTextSegment {
222                    text: seg_text.to_string(),
223                    facet: Some(facet.clone()),
224                });
225            }
226
227            cursor = end;
228        }
229
230        // Remaining text after last facet
231        if cursor < self.text.len() {
232            segments.push(RichTextSegment {
233                text: self.text[cursor..].to_string(),
234                facet: None,
235            });
236        }
237
238        segments
239    }
240}
241
242// --- Facet detection ---
243
244static MENTION_RE: LazyLock<Regex> = LazyLock::new(|| {
245    Regex::new(r"(?:^|\s|\()(@)([a-zA-Z0-9]([a-zA-Z0-9.-]*[a-zA-Z0-9])?\.[a-zA-Z]{2,})")
246        .expect("mention regex")
247});
248
249static URL_RE: LazyLock<Regex> =
250    LazyLock::new(|| Regex::new(r"(?:^|\s|\()(https?://[\S]+)").expect("url regex"));
251
252static TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
253    Regex::new(r"(?:^|\s)[##]([^\s\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}]*[^\d\s\p{Punctuation}\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}]+[^\s\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}]*)")
254        .expect("tag regex")
255});
256
257/// Detect facets in text without DID resolution.
258pub fn detect_facets(text: &str) -> Vec<Facet> {
259    let mut facets = Vec::new();
260
261    // Detect mentions: @handle.domain
262    for cap in MENTION_RE.captures_iter(text) {
263        let handle_match = cap.get(2).unwrap();
264        let handle = handle_match.as_str();
265
266        // byte_start from the '@', not from any leading whitespace
267        let at_match = cap.get(1).unwrap();
268        let byte_start = at_match.start();
269        let byte_end = handle_match.end();
270
271        facets.push(Facet {
272            index: ByteSlice {
273                byte_start,
274                byte_end,
275            },
276            features: vec![FacetFeature::Mention {
277                did: handle.to_string(),
278            }],
279        });
280    }
281
282    // Detect URLs
283    for cap in URL_RE.captures_iter(text) {
284        let url_match = cap.get(1).unwrap();
285        let mut uri = url_match.as_str().to_string();
286        let byte_start = url_match.start();
287        let mut byte_end = url_match.end();
288
289        // Strip trailing punctuation
290        while uri.ends_with(['.', ',', ';', ':', '!', '?']) {
291            uri.pop();
292            byte_end -= 1;
293        }
294
295        // Strip trailing ')' if no '(' in URL
296        if uri.ends_with(')') && !uri.contains('(') {
297            uri.pop();
298            byte_end -= 1;
299        }
300
301        facets.push(Facet {
302            index: ByteSlice {
303                byte_start,
304                byte_end,
305            },
306            features: vec![FacetFeature::Link { uri }],
307        });
308    }
309
310    // Detect hashtags: #tag
311    for cap in TAG_RE.captures_iter(text) {
312        let tag_match = cap.get(1).unwrap();
313        let tag = tag_match.as_str();
314
315        // Limit tags to 64 chars
316        if tag.is_empty() || tag.len() > 64 {
317            continue;
318        }
319
320        // Strip trailing punctuation from tag
321        let tag_trimmed = tag.trim_end_matches(|c: char| c.is_ascii_punctuation());
322        if tag_trimmed.is_empty() {
323            continue;
324        }
325
326        // The full match includes the '#', find its byte position
327        let full_match = cap.get(0).unwrap();
328        // Find the '#' or '#' in the full match
329        let hash_pos = full_match
330            .as_str()
331            .find('#')
332            .or_else(|| full_match.as_str().find('#'))
333            .unwrap_or(0);
334        let byte_start = full_match.start() + hash_pos;
335        let byte_end = byte_start + 1 + tag_trimmed.len(); // '#' + tag text
336
337        // Clamp to not exceed text bounds
338        let byte_end = byte_end.min(text.len());
339
340        facets.push(Facet {
341            index: ByteSlice {
342                byte_start,
343                byte_end,
344            },
345            features: vec![FacetFeature::Tag {
346                tag: tag_trimmed.to_string(),
347            }],
348        });
349    }
350
351    facets.sort_by_key(|f| f.index.byte_start);
352    facets
353}
354
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `RichText` from `text` and run facet detection on it.
    fn detected(text: &str) -> RichText {
        let mut rt = RichText::new(text, None);
        rt.detect_facets();
        rt
    }

    /// A facet list holding one facet over `[start, end)` with `feature`.
    fn single_facet(start: usize, end: usize, feature: FacetFeature) -> Vec<Facet> {
        vec![Facet {
            index: ByteSlice {
                byte_start: start,
                byte_end: end,
            },
            features: vec![feature],
        }]
    }

    /// The `#test` tag facet used by the insert/delete tests.
    fn test_tag_facet() -> Vec<Facet> {
        single_facet(
            6,
            11,
            FacetFeature::Tag {
                tag: "test".to_string(),
            },
        )
    }

    /// The `(byte_start, byte_end)` pair of the first facet.
    fn first_range(rt: &RichText) -> (usize, usize) {
        let index = &rt.facets()[0].index;
        (index.byte_start, index.byte_end)
    }

    #[test]
    fn basic_text_no_facets() {
        let rt = RichText::new("Hello, world!", None);
        assert_eq!(rt.text(), "Hello, world!");
        assert!(rt.facets().is_empty());
        assert_eq!(rt.len(), 13);
        assert_eq!(rt.grapheme_length(), 13);
    }

    #[test]
    fn detect_mention() {
        let rt = detected("Hello @alice.bsky.social!");
        let [f] = rt.facets() else {
            panic!("expected exactly one facet");
        };
        match &f.features[0] {
            FacetFeature::Mention { did } => assert_eq!(did, "alice.bsky.social"),
            other => panic!("expected a mention, got {other:?}"),
        }
        let covered = &rt.text()[f.index.byte_start..f.index.byte_end];
        assert_eq!(covered, "@alice.bsky.social");
    }

    #[test]
    fn detect_url() {
        let rt = detected("Check https://example.com/path here");
        let [f] = rt.facets() else {
            panic!("expected exactly one facet");
        };
        match &f.features[0] {
            FacetFeature::Link { uri } => assert_eq!(uri, "https://example.com/path"),
            other => panic!("expected a link, got {other:?}"),
        }
    }

    #[test]
    fn detect_url_strips_trailing_punctuation() {
        let rt = detected("Visit https://example.com.");
        let [f] = rt.facets() else {
            panic!("expected exactly one facet");
        };
        match &f.features[0] {
            FacetFeature::Link { uri } => assert_eq!(uri, "https://example.com"),
            other => panic!("expected a link, got {other:?}"),
        }
    }

    #[test]
    fn detect_url_strips_trailing_paren_without_open() {
        let rt = detected("(see https://example.com/page)");
        let [f] = rt.facets() else {
            panic!("expected exactly one facet");
        };
        // The URL contains no '(' so the trailing ')' is not part of it.
        match &f.features[0] {
            FacetFeature::Link { uri } => assert_eq!(uri, "https://example.com/page"),
            other => panic!("expected a link, got {other:?}"),
        }
    }

    #[test]
    fn detect_hashtag() {
        let rt = detected("Hello #atproto world");
        let [f] = rt.facets() else {
            panic!("expected exactly one facet");
        };
        match &f.features[0] {
            FacetFeature::Tag { tag } => assert_eq!(tag, "atproto"),
            other => panic!("expected a tag, got {other:?}"),
        }
    }

    #[test]
    fn detect_multiple_facets() {
        let rt = detected("@alice.test posted https://example.com #cool");
        let [mention, link, tag] = rt.facets() else {
            panic!("expected exactly three facets");
        };
        assert!(
            mention
                .features
                .iter()
                .any(|f| matches!(f, FacetFeature::Mention { .. }))
        );
        assert!(
            link.features
                .iter()
                .any(|f| matches!(f, FacetFeature::Link { .. }))
        );
        assert!(
            tag.features
                .iter()
                .any(|f| matches!(f, FacetFeature::Tag { .. }))
        );
    }

    #[test]
    fn segments_no_facets() {
        let segs = RichText::new("Hello world", None).segments();
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Hello world");
        assert!(segs[0].facet.is_none());
    }

    #[test]
    fn segments_with_facets() {
        let segs = detected("Hello @alice.test world").segments();
        let [before, mention, after] = segs.as_slice() else {
            panic!("expected exactly three segments");
        };
        assert_eq!(before.text, "Hello ");
        assert!(before.facet.is_none());
        assert_eq!(mention.text, "@alice.test");
        assert!(mention.is_mention());
        assert_eq!(after.text, " world");
        assert!(after.facet.is_none());
    }

    #[test]
    fn insert_before_facet() {
        let mut rt = RichText::new("Hello #test", Some(test_tag_facet()));
        rt.insert(0, "Hey ");
        assert_eq!(rt.text(), "Hey Hello #test");
        assert_eq!(first_range(&rt), (10, 15));
    }

    #[test]
    fn insert_inside_facet() {
        let link = single_facet(
            0,
            5,
            FacetFeature::Link {
                uri: "https://example.com".to_string(),
            },
        );
        let mut rt = RichText::new("Hello world", Some(link));
        rt.insert(3, "XX");
        assert_eq!(rt.text(), "HelXXlo world");
        assert_eq!(first_range(&rt), (0, 7));
    }

    #[test]
    fn delete_before_facet() {
        let mut rt = RichText::new("Hello #test", Some(test_tag_facet()));
        rt.delete(0, 6);
        assert_eq!(rt.text(), "#test");
        assert_eq!(first_range(&rt), (0, 5));
    }

    #[test]
    fn delete_spanning_facet_removes_it() {
        let mut rt = RichText::new("Hello #test world", Some(test_tag_facet()));
        rt.delete(5, 12);
        assert_eq!(rt.text(), "Helloworld");
        assert!(rt.facets().is_empty());
    }

    #[test]
    fn grapheme_length_emoji() {
        let rt = RichText::new("Hi 👋🏽", None);
        // "Hi " is 3 graphemes; the waving hand plus its skin-tone modifier
        // form a single grapheme cluster.
        assert_eq!(rt.grapheme_length(), 4);
        // The byte length is larger because the emoji is multi-byte.
        assert!(rt.len() > 4);
    }

    #[test]
    fn utf8_byte_offsets_work_natively() {
        // 'é' takes two bytes, so byte offsets and char offsets differ here.
        let rt = detected("Héllo @alice.test");
        let [f] = rt.facets() else {
            panic!("expected exactly one facet");
        };
        let covered = &rt.text()[f.index.byte_start..f.index.byte_end];
        assert_eq!(covered, "@alice.test");
    }

    #[test]
    fn empty_text() {
        let rt = RichText::new("", None);
        assert!(rt.is_empty());
        assert_eq!(rt.len(), 0);
        assert_eq!(rt.grapheme_length(), 0);
        let segs = rt.segments();
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "");
    }

    #[test]
    fn facet_feature_serde_roundtrip() {
        let mut facets = single_facet(
            0,
            5,
            FacetFeature::Mention {
                did: "did:plc:abc123".to_string(),
            },
        );
        let facet = facets.remove(0);

        let json = serde_json::to_string(&facet).unwrap();
        assert!(json.contains("app.bsky.richtext.facet#mention"));

        let parsed: Facet = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.index.byte_start, 0);
        match &parsed.features[0] {
            FacetFeature::Mention { did } => assert_eq!(did, "did:plc:abc123"),
            other => panic!("expected a mention, got {other:?}"),
        }
    }
}