saorsa_core/
segment.rs

1//! Segment type — the fundamental rendering unit.
2
3use crate::style::Style;
4use unicode_segmentation::UnicodeSegmentation;
5use unicode_width::UnicodeWidthStr;
6
7/// A piece of styled text, the fundamental rendering unit.
8///
9/// Every widget's render method produces lines of segments.
10#[derive(Clone, Debug, PartialEq, Eq)]
11pub struct Segment {
12    /// The text content.
13    pub text: String,
14    /// The style applied to this segment.
15    pub style: Style,
16    /// Whether this is a control sequence (not visible text).
17    pub is_control: bool,
18}
19
20impl Segment {
21    /// Create a new segment with default style.
22    pub fn new(text: impl Into<String>) -> Self {
23        Self {
24            text: text.into(),
25            style: Style::default(),
26            is_control: false,
27        }
28    }
29
30    /// Create a new segment with the given style.
31    pub fn styled(text: impl Into<String>, style: Style) -> Self {
32        Self {
33            text: text.into(),
34            style,
35            is_control: false,
36        }
37    }
38
39    /// Create a control segment (not rendered as visible text).
40    pub fn control(text: impl Into<String>) -> Self {
41        Self {
42            text: text.into(),
43            style: Style::default(),
44            is_control: true,
45        }
46    }
47
48    /// Create a blank segment (spaces) of the given width.
49    pub fn blank(width: u16) -> Self {
50        Self {
51            text: " ".repeat(width as usize),
52            style: Style::default(),
53            is_control: false,
54        }
55    }
56
57    /// Display width in terminal cells.
58    pub fn width(&self) -> usize {
59        if self.is_control {
60            return 0;
61        }
62        UnicodeWidthStr::width(self.text.as_str())
63    }
64
65    /// Display width in terminal cells (alias for width()).
66    pub fn display_width(&self) -> usize {
67        self.width()
68    }
69
70    /// Returns true if the segment has no text.
71    pub fn is_empty(&self) -> bool {
72        self.text.is_empty()
73    }
74
75    /// Returns each grapheme cluster in this segment together with its display width.
76    ///
77    /// Combining marks (zero-width) are grouped with their base character into
78    /// a single grapheme cluster by the Unicode segmentation algorithm.
79    pub fn grapheme_widths(&self) -> Vec<(String, usize)> {
80        if self.is_control {
81            return Vec::new();
82        }
83        self.text
84            .graphemes(true)
85            .map(|g| (g.to_string(), UnicodeWidthStr::width(g)))
86            .collect()
87    }
88
89    /// Returns the number of grapheme clusters in this segment.
90    ///
91    /// This counts user-perceived characters, so a base character followed by
92    /// combining diacritics counts as one.
93    pub fn char_count(&self) -> usize {
94        if self.is_control {
95            return 0;
96        }
97        self.text.graphemes(true).count()
98    }
99
100    /// Truncate this segment to at most `max_width` display columns.
101    ///
102    /// If the segment is already within `max_width`, returns an identical segment.
103    /// If a wide character straddles the boundary, it is excluded (the result may
104    /// be slightly shorter than `max_width`).
105    pub fn truncate_to_width(&self, max_width: usize) -> Segment {
106        self.split_at(max_width).0
107    }
108
109    /// Pad this segment with trailing spaces to reach `target_width` display columns.
110    ///
111    /// If the segment is already at or wider than `target_width`, returns unchanged.
112    pub fn pad_to_width(&self, target_width: usize) -> Segment {
113        let current = self.width();
114        if current >= target_width {
115            return self.clone();
116        }
117        let padding = target_width - current;
118        let mut text = self.text.clone();
119        for _ in 0..padding {
120            text.push(' ');
121        }
122        Segment::styled(text, self.style.clone())
123    }
124
125    /// Split this segment at the given display-width offset.
126    ///
127    /// Returns (left, right) where left has the specified display width.
128    /// If the offset falls in the middle of a wide character, the left side
129    /// is padded with a space and the right side gets a leading space.
130    ///
131    /// Combining marks (zero-width diacritics) are kept attached to their
132    /// base character: if the split point falls between a base character and
133    /// its combining marks, the combining marks travel with the base.
134    pub fn split_at(&self, offset: usize) -> (Segment, Segment) {
135        if offset == 0 {
136            return (
137                Segment::styled(String::new(), self.style.clone()),
138                self.clone(),
139            );
140        }
141        if offset >= self.width() {
142            return (
143                self.clone(),
144                Segment::styled(String::new(), self.style.clone()),
145            );
146        }
147
148        // Collect graphemes with their widths
149        let graphemes: Vec<(&str, usize)> = self
150            .text
151            .graphemes(true)
152            .map(|g| (g, UnicodeWidthStr::width(g)))
153            .collect();
154
155        let mut left = String::new();
156        let mut current_width = 0;
157        let mut split_idx = 0; // index of first grapheme that goes to right side
158        let mut need_left_pad = false;
159
160        for (i, &(grapheme, gw)) in graphemes.iter().enumerate() {
161            if current_width + gw > offset {
162                // This grapheme would exceed the offset.
163                if current_width < offset && gw > 1 {
164                    // Wide char straddles the boundary — pad left with space
165                    left.push(' ');
166                    need_left_pad = true;
167                }
168                split_idx = i;
169                break;
170            }
171            left.push_str(grapheme);
172            current_width += gw;
173            if current_width == offset {
174                // Check if the next grapheme(s) are zero-width combining marks
175                // that should stay with the current base character
176                let mut j = i + 1;
177                while j < graphemes.len() && graphemes[j].1 == 0 {
178                    left.push_str(graphemes[j].0);
179                    j += 1;
180                }
181                split_idx = j;
182                break;
183            }
184        }
185
186        // Build right side from remaining graphemes
187        let mut right = String::new();
188        if need_left_pad {
189            // The wide char was split; put a space on the right as placeholder
190            right.push(' ');
191            // Skip the straddled grapheme
192            for &(grapheme, _) in &graphemes[split_idx + 1..] {
193                right.push_str(grapheme);
194            }
195        } else {
196            for &(grapheme, _) in &graphemes[split_idx..] {
197                right.push_str(grapheme);
198            }
199        }
200
201        (
202            Segment::styled(left, self.style.clone()),
203            Segment::styled(right, self.style.clone()),
204        )
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// CJK ideograph 世 (2 cells wide).
    const SHI: &str = "\u{4e16}";
    /// Grinning face emoji (2 cells wide).
    const GRIN: &str = "\u{1f600}";
    /// ZWJ family emoji: man + ZWJ + woman + ZWJ + girl (one grapheme cluster).
    const FAMILY: &str = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}";

    // --- Basic width and split behaviour ---

    #[test]
    fn ascii_width() {
        let seg = Segment::new("hello");
        assert_eq!(seg.width(), 5);
    }

    #[test]
    fn empty_width() {
        let seg = Segment::new("");
        assert_eq!(seg.width(), 0);
    }

    #[test]
    fn control_width_is_zero() {
        let seg = Segment::control("ESC[1m");
        assert_eq!(seg.width(), 0);
    }

    #[test]
    fn cjk_width() {
        // Two CJK characters (世界), each 2 cells wide.
        let seg = Segment::new("\u{4e16}\u{754c}");
        assert_eq!(seg.width(), 4);
    }

    #[test]
    fn split_ascii() {
        let (head, tail) = Segment::new("hello").split_at(3);
        assert_eq!(head.text, "hel");
        assert_eq!(tail.text, "lo");
    }

    #[test]
    fn split_at_zero() {
        let (head, tail) = Segment::new("hello").split_at(0);
        assert_eq!(head.text, "");
        assert_eq!(tail.text, "hello");
    }

    #[test]
    fn split_at_end() {
        let (head, tail) = Segment::new("hello").split_at(5);
        assert_eq!(head.text, "hello");
        assert_eq!(tail.text, "");
    }

    #[test]
    fn split_beyond_end() {
        let (head, tail) = Segment::new("hi").split_at(100);
        assert_eq!(head.text, "hi");
        assert_eq!(tail.text, "");
    }

    #[test]
    fn is_empty() {
        let blank = Segment::new("");
        let filled = Segment::new("x");
        assert!(blank.is_empty());
        assert!(!filled.is_empty());
    }

    #[test]
    fn styled_preserves_style_on_split() {
        let bold = Segment::styled("hello", Style::new().bold(true));
        let (head, tail) = bold.split_at(2);
        assert!(head.style.bold);
        assert!(tail.style.bold);
    }

    // --- Unicode edge cases ---

    #[test]
    fn emoji_width_is_two() {
        // Most emoji occupy 2 columns.
        let seg = Segment::new(GRIN);
        assert_eq!(seg.width(), 2);
    }

    #[test]
    fn emoji_at_split_boundary() {
        // "A" (1) + emoji (2) + "B" (1) = width 4
        let seg = Segment::new(format!("A{}B", GRIN));
        assert_eq!(seg.width(), 4);

        // Offset 1 lies just before the emoji.
        let (head, tail) = seg.split_at(1);
        assert_eq!(head.text, "A");
        assert_eq!(tail.text, format!("{}B", GRIN));

        // The emoji spans columns 1..3, so offset 2 lands inside it.
        let (head, tail) = seg.split_at(2);
        // Left: "A" plus one space standing in for the straddled emoji.
        assert_eq!(head.text, "A ");
        assert_eq!(head.width(), 2);
        // Right: one placeholder space, then "B".
        assert_eq!(tail.text, " B");
    }

    #[test]
    fn combining_diacritics_width() {
        // Decomposed é: 'e' + combining acute accent (U+0301) forms one
        // grapheme cluster that is one cell wide.
        let seg = Segment::new("e\u{0301}");
        assert_eq!(seg.width(), 1);
        assert_eq!(seg.char_count(), 1);
    }

    #[test]
    fn mixed_ascii_emoji_cjk() {
        // "Hi" (2) + emoji (2) + 世 (2) = width 6
        let seg = Segment::new(format!("Hi{}{}", GRIN, SHI));
        assert_eq!(seg.width(), 6);
        // Graphemes: H, i, emoji, 世
        assert_eq!(seg.char_count(), 4);
    }

    #[test]
    fn grapheme_widths_returns_correct_values() {
        let seg = Segment::new(format!("A{}B", SHI));
        let expected = vec![
            ("A".to_string(), 1),
            (SHI.to_string(), 2),
            ("B".to_string(), 1),
        ];
        assert_eq!(seg.grapheme_widths(), expected);
    }

    #[test]
    fn char_count_returns_grapheme_cluster_count() {
        // Five ASCII letters are five grapheme clusters.
        assert_eq!(Segment::new("Hello").char_count(), 5);
        // No text, no clusters.
        assert_eq!(Segment::new("").char_count(), 0);
        // Each CJK character is its own cluster.
        assert_eq!(Segment::new("\u{4e16}\u{754c}").char_count(), 2);
        // Control segments always report 0.
        assert_eq!(Segment::control("ESC").char_count(), 0);
    }

    #[test]
    fn split_preserves_combining_marks() {
        // "a" + "e\u{0301}" + "b": 3 graphemes, width 3
        let seg = Segment::new("ae\u{0301}b");
        assert_eq!(seg.width(), 3);
        assert_eq!(seg.char_count(), 3);

        // Offset 1 falls between "a" and "e\u{0301}"; the accent must stay
        // attached to the "e" on the right.
        let (head, tail) = seg.split_at(1);
        assert_eq!(head.text, "a");
        assert_eq!(tail.text, "e\u{0301}b");

        // Offset 2 falls between "e\u{0301}" and "b".
        let (head, tail) = seg.split_at(2);
        assert_eq!(head.text, "ae\u{0301}");
        assert_eq!(tail.text, "b");
    }

    #[test]
    fn empty_segment_grapheme_operations() {
        let seg = Segment::new("");
        assert_eq!(seg.grapheme_widths().len(), 0);
        assert_eq!(seg.char_count(), 0);
        let (head, tail) = seg.split_at(0);
        assert_eq!(head.text, "");
        assert_eq!(tail.text, "");
    }

    #[test]
    fn grapheme_widths_empty_for_control() {
        let seg = Segment::control("\x1b[1m");
        assert!(seg.grapheme_widths().is_empty());
    }

    // --- truncate_to_width and pad_to_width ---

    #[test]
    fn truncate_to_width_ascii_exact_fit() {
        let cut = Segment::new("hello").truncate_to_width(5);
        assert_eq!(cut.text, "hello");
        assert_eq!(cut.width(), 5);
    }

    #[test]
    fn truncate_to_width_cuts_before_wide_char_at_boundary() {
        // "A" (1) + "世" (2) + "B" (1) = width 4
        let seg = Segment::new(format!("A{}B", SHI));
        assert_eq!(seg.width(), 4);
        // The wide char spans columns 1..3, so max_width = 2 lands inside it
        // and split_at pads the left half with a space.
        let cut = seg.truncate_to_width(2);
        assert_eq!(cut.width(), 2);
        assert_eq!(cut.text, "A ");
    }

    #[test]
    fn truncate_to_width_zero_gives_empty() {
        let cut = Segment::new("hello").truncate_to_width(0);
        assert_eq!(cut.text, "");
        assert_eq!(cut.width(), 0);
    }

    #[test]
    fn truncate_to_width_beyond_length_unchanged() {
        let cut = Segment::new("hi").truncate_to_width(100);
        assert_eq!(cut.text, "hi");
        assert_eq!(cut.width(), 2);
    }

    #[test]
    fn pad_to_width_adds_trailing_spaces() {
        let wide = Segment::new("AB").pad_to_width(5);
        assert_eq!(wide.text, "AB   ");
        assert_eq!(wide.width(), 5);
    }

    #[test]
    fn pad_to_width_already_at_target_unchanged() {
        let same = Segment::new("hello").pad_to_width(5);
        assert_eq!(same.text, "hello");
    }

    #[test]
    fn pad_to_width_already_wider_unchanged() {
        let same = Segment::new("hello world").pad_to_width(5);
        assert_eq!(same.text, "hello world");
    }

    #[test]
    fn style_preserved_through_truncation_and_padding() {
        let bold = Style::new().bold(true);
        let seg = Segment::styled("hello world", bold.clone());

        let cut = seg.truncate_to_width(5);
        assert!(cut.style.bold);
        assert_eq!(cut.style, bold);

        let wide = seg.pad_to_width(20);
        assert!(wide.style.bold);
        assert_eq!(wide.style, bold);
    }

    // --- Multi-codepoint emoji ---

    #[test]
    fn zwj_family_emoji_width() {
        // The whole ZWJ sequence renders as one 2-column grapheme.
        let seg = Segment::new(FAMILY);
        assert_eq!(seg.width(), 2);
    }

    #[test]
    fn zwj_family_emoji_grapheme_widths() {
        let clusters = Segment::new(FAMILY).grapheme_widths();
        // One grapheme cluster...
        assert_eq!(clusters.len(), 1);
        // ...that is 2 cells wide.
        assert_eq!(clusters[0].1, 2);
    }

    #[test]
    fn flag_emoji_width() {
        // US flag: regional indicator U + regional indicator S
        let seg = Segment::new("\u{1F1FA}\u{1F1F8}");
        assert_eq!(seg.width(), 2);
    }

    #[test]
    fn skin_tone_emoji_width() {
        // Thumbs up + medium skin tone modifier
        let seg = Segment::new("\u{1F44D}\u{1F3FD}");
        assert_eq!(seg.width(), 2);
    }

    #[test]
    fn split_segment_at_zwj_emoji_boundary() {
        // "A" (1) + family emoji (2) + "B" (1) = width 4
        let seg = Segment::new(format!("A{}B", FAMILY));
        assert_eq!(seg.width(), 4);

        // Offset 1 is right after "A" and before the emoji.
        let (head, tail) = seg.split_at(1);
        assert_eq!(head.text, "A");
        assert_eq!(head.width(), 1);
        // The right half keeps the emoji (2) and "B" (1).
        assert_eq!(tail.width(), 3);
    }

    #[test]
    fn char_count_with_complex_emoji() {
        // The ZWJ family sequence is a single grapheme cluster.
        let seg = Segment::new(FAMILY);
        assert_eq!(seg.char_count(), 1);
    }

    #[test]
    fn mixed_ascii_zwj_emoji_cjk() {
        // "Hi" (2) + family emoji (2) + 世 (2) + "!" (1) = 7
        let seg = Segment::new(format!("Hi{}{}!", FAMILY, SHI));
        assert_eq!(seg.width(), 7);
        // Graphemes: H, i, family, 世, !
        assert_eq!(seg.char_count(), 5);
    }

    #[test]
    fn keycap_sequence_handling() {
        // Keycap "#": '#' + VS16 + combining enclosing keycap
        let seg = Segment::new("#\u{FE0F}\u{20E3}");
        // The sequence forms one grapheme cluster.
        assert_eq!(seg.char_count(), 1);
        // The exact width depends on the unicode-width crate version, so only
        // require a plausible value.
        let cells = seg.width();
        assert!(matches!(cells, 1 | 2));
    }
}
saorsa_core/segment.rs

saorsa_core/
segment.rs