runefix_core/grapheme/
basic.rs

//! Basic grapheme-aware width processing functions.
//!
//! This module provides the core, always-available APIs for:
//!
//! - Unicode grapheme segmentation
//! - Terminal-style display width measurement
//! - Safe truncation and line wrapping
//!
//! These functions use a default [`terminal`](crate::policy::WidthPolicy::terminal) layout strategy,
//! without requiring any additional features.
//!
//! See [`policy_ext`](crate::grapheme::policy_ext) for configurable width behavior.
13
14use crate::width::get_display_width;
15use unicode_segmentation::UnicodeSegmentation;
16
17/// Returns all Unicode grapheme clusters in the input string, following UAX #29.
18///
19/// A **grapheme cluster** is the smallest unit of text that a user perceives as a single character.
20/// This function implements [Unicodeยฎ Standard Annex #29](https://unicode.org/reports/tr29/),
21/// including support for extended grapheme clusters such as:
22///
23/// - Emoji ZWJ sequences (e.g., ๐Ÿ‘ฉโ€โค๏ธโ€๐Ÿ’‹โ€๐Ÿ‘จ)
24/// - Hangul syllables
25/// - Combining accents (e.g., eฬ)
26///
27/// This API is Unicode-compliant and suitable for user-facing string segmentation.
28///
29/// # Arguments
30///
31/// * `s` โ€“ The input string to split.
32///
33/// # Returns
34///
35/// A `Vec<&str>` where each item is a Unicode grapheme cluster.
36///
37/// # Example
38///
39/// ```rust
40/// use runefix_core::graphemes;
41///
42/// let clusters = graphemes("Love๐Ÿ‘ฉโ€โค๏ธโ€๐Ÿ’‹โ€๐Ÿ‘จ็ˆฑ");
43/// assert_eq!(clusters, vec!["L", "o", "v", "e", "๐Ÿ‘ฉโ€โค๏ธโ€๐Ÿ’‹โ€๐Ÿ‘จ", "็ˆฑ"]);
44/// ```
45pub fn graphemes(s: &str) -> Vec<&str> {
46    UnicodeSegmentation::graphemes(s, true).collect()
47}
48
49/// Returns the total display width (in columns) of a string, based on grapheme clusters.
50///
51/// This function segments the input string into Unicode grapheme clusters and sums
52/// the display width of each one using [`display_width`]. The result reflects
53/// how much horizontal space the entire string occupies in a monospace terminal,
54/// accounting for wide characters such as CJK ideographs and emoji.
55///
56/// # Arguments
57///
58/// * `s` - The input string to measure
59///
60/// # Returns
61///
62/// The total display width of the string in terminal columns.
63///
64/// # Example
65///
66/// ```rust
67/// use runefix_core::display_width;
68///
69/// let width = display_width("Hi๏ผŒไธ–็•Œ");
70/// assert_eq!(width, 8); // 1 + 1 + 2 + 2 + 2
71/// ```
72pub fn display_width(s: &str) -> usize {
73    UnicodeSegmentation::graphemes(s, true)
74        .map(get_display_width)
75        .sum()
76}
77
78/// Returns the display width (in columns) of each grapheme cluster in the input string.
79///
80/// This function segments the input string into Unicode grapheme clusters and computes
81/// the display width of each one individually. It is useful for scenarios like monospace
82/// text layout, visual alignment, or rendering terminals where East Asian characters
83/// and emoji take more than one column.
84///
85/// # Arguments
86///
87/// * `s` - The input string to analyze
88///
89/// # Returns
90///
91/// A vector of display widths (`usize`) for each grapheme cluster in order.
92///
93/// # Example
94///
95/// ```rust
96/// use runefix_core::display_widths;
97///
98/// let widths = display_widths("Hi๏ผŒไธ–็•Œ");
99/// assert_eq!(widths, vec![1, 1, 2, 2, 2]);
100/// ```
101pub fn display_widths(s: &str) -> Vec<usize> {
102    UnicodeSegmentation::graphemes(s, true)
103        .map(get_display_width)
104        .collect()
105}
106
107/// Returns the display width of each grapheme cluster in the input string.
108///
109/// This function splits the string into Unicode grapheme clusters and pairs
110/// each one with its terminal display width (in columns). This is useful for
111/// visually aligned rendering, layout calculation, and Unicode debugging,
112/// especially with complex emoji or East Asian characters.
113///
114/// # Arguments
115///
116/// * `s` - The input string to analyze
117///
118/// # Returns
119///
120/// A vector of tuples, where each item is a grapheme cluster and its
121/// corresponding display width: `(&str, usize)`
122///
123/// # Example
124///
125/// ```rust
126/// use runefix_core::grapheme_widths;
127///
128/// let result = grapheme_widths("Hi๏ผŒไธ–็•Œ");
129/// assert_eq!(
130///     result,
131///     vec![("H", 1), ("i", 1), ("๏ผŒ", 2), ("ไธ–", 2), ("็•Œ", 2)]
132/// );
133/// ```
134pub fn grapheme_widths(s: &str) -> Vec<(&str, usize)> {
135    UnicodeSegmentation::graphemes(s, true)
136        .map(|g| (g, get_display_width(g)))
137        .collect()
138}
139
140/// Truncates a string by display width while preserving grapheme cluster boundaries.
141///
142/// This function ensures that wide characters such as emoji or CJK ideographs are
143/// never split in the middle. It safely cuts off the string so that its total
144/// display width does not exceed the given `max_width`, making it ideal for
145/// terminal or TUI rendering.
146///
147/// # Arguments
148///
149/// * `s` - The input string to truncate
150/// * `max_width` - Maximum allowed display width in terminal columns
151///
152/// # Returns
153///
154/// A string slice that fits within the specified display width without cutting graphemes.
155///
156/// # Example
157///
158/// ```rust
159/// use runefix_core::truncate_by_width;
160///
161/// let s = "Hi ๐Ÿ‘‹๏ผŒไธ–็•Œ";
162/// let short = truncate_by_width(s, 6);
163/// assert_eq!(short, "Hi ๐Ÿ‘‹");
164/// ```
165pub fn truncate_by_width(s: &str, max_width: usize) -> &str {
166    let mut total_width = 0;
167    let mut end_byte = 0;
168
169    for g in UnicodeSegmentation::graphemes(s, true) {
170        let w: usize = get_display_width(g);
171
172        if total_width + w > max_width {
173            break;
174        }
175
176        total_width += w;
177        end_byte += g.len(); // Byte offset to cut safely
178    }
179
180    &s[..end_byte]
181}
182
183/// Splits a string into lines based on display width, preserving grapheme boundaries.
184///
185/// This function ensures that wide characters such as emoji, CJK ideographs, or
186/// fullwidth punctuation are not split mid-grapheme. It breaks the input string
187/// into a sequence of lines, each with a total display width that does not exceed
188/// the given `max_width`. Ideal for terminal word wrapping and monospace layout.
189///
190/// # Arguments
191///
192/// * `s` - The input string to wrap
193/// * `max_width` - Maximum display width (in columns) for each line
194///
195/// # Returns
196///
197/// A vector of strings, each representing a wrapped line within the given width.
198///
199/// # Example
200///
201/// ```rust
202/// use runefix_core::split_by_width;
203///
204/// let lines = split_by_width("Hello ๐Ÿ‘‹ ไธ–็•Œ๏ผ", 5);
205/// assert_eq!(lines, vec!["Hello", " ๐Ÿ‘‹ ", "ไธ–็•Œ", "๏ผ"]);
206/// ```
207pub fn split_by_width(s: &str, max_width: usize) -> Vec<String> {
208    let mut result = Vec::new();
209    let mut current_line = String::new();
210    let mut current_width = 0;
211
212    for g in UnicodeSegmentation::graphemes(s, true) {
213        let w: usize = get_display_width(g);
214
215        if current_width + w > max_width && !current_line.is_empty() {
216            result.push(current_line.clone());
217            current_line.clear();
218            current_width = 0;
219        }
220
221        current_line.push_str(g);
222        current_width += w;
223    }
224
225    if !current_line.is_empty() {
226        result.push(current_line);
227    }
228
229    result
230}