runefix_core/grapheme/basic.rs
1//! Basic grapheme-aware width processing functions.
2//!
3//! This module provides the core, always-available APIs for:
4//!
5//! - Unicode grapheme segmentation
6//! - Terminal-style display width measurement
7//! - Safe truncation and line wrapping
8//!
9//! These functions use a default [`terminal`](crate::policy::WidthPolicy::terminal) layout strategy,
10//! without requiring any additional features.
11//!
12//! See [`policy_ext`](crate::grapheme::policy_ext) for configurable width behavior.
13
14use crate::width::get_display_width;
15use unicode_segmentation::UnicodeSegmentation;
16
17/// Returns all Unicode grapheme clusters in the input string, following UAX #29.
18///
19/// A **grapheme cluster** is the smallest unit of text that a user perceives as a single character.
20/// This function implements [Unicodeยฎ Standard Annex #29](https://unicode.org/reports/tr29/),
21/// including support for extended grapheme clusters such as:
22///
23/// - Emoji ZWJ sequences (e.g., ๐ฉโโค๏ธโ๐โ๐จ)
24/// - Hangul syllables
25/// - Combining accents (e.g., eฬ)
26///
27/// This API is Unicode-compliant and suitable for user-facing string segmentation.
28///
29/// # Arguments
30///
31/// * `s` โ The input string to split.
32///
33/// # Returns
34///
35/// A `Vec<&str>` where each item is a Unicode grapheme cluster.
36///
37/// # Example
38///
39/// ```rust
40/// use runefix_core::graphemes;
41///
42/// let clusters = graphemes("Love๐ฉโโค๏ธโ๐โ๐จ็ฑ");
43/// assert_eq!(clusters, vec!["L", "o", "v", "e", "๐ฉโโค๏ธโ๐โ๐จ", "็ฑ"]);
44/// ```
45pub fn graphemes(s: &str) -> Vec<&str> {
46 UnicodeSegmentation::graphemes(s, true).collect()
47}
48
49/// Returns the total display width (in columns) of a string, based on grapheme clusters.
50///
51/// This function segments the input string into Unicode grapheme clusters and sums
52/// the display width of each one using [`display_width`]. The result reflects
53/// how much horizontal space the entire string occupies in a monospace terminal,
54/// accounting for wide characters such as CJK ideographs and emoji.
55///
56/// # Arguments
57///
58/// * `s` - The input string to measure
59///
60/// # Returns
61///
62/// The total display width of the string in terminal columns.
63///
64/// # Example
65///
66/// ```rust
67/// use runefix_core::display_width;
68///
69/// let width = display_width("Hi๏ผไธ็");
70/// assert_eq!(width, 8); // 1 + 1 + 2 + 2 + 2
71/// ```
72pub fn display_width(s: &str) -> usize {
73 UnicodeSegmentation::graphemes(s, true)
74 .map(get_display_width)
75 .sum()
76}
77
78/// Returns the display width (in columns) of each grapheme cluster in the input string.
79///
80/// This function segments the input string into Unicode grapheme clusters and computes
81/// the display width of each one individually. It is useful for scenarios like monospace
82/// text layout, visual alignment, or rendering terminals where East Asian characters
83/// and emoji take more than one column.
84///
85/// # Arguments
86///
87/// * `s` - The input string to analyze
88///
89/// # Returns
90///
91/// A vector of display widths (`usize`) for each grapheme cluster in order.
92///
93/// # Example
94///
95/// ```rust
96/// use runefix_core::display_widths;
97///
98/// let widths = display_widths("Hi๏ผไธ็");
99/// assert_eq!(widths, vec![1, 1, 2, 2, 2]);
100/// ```
101pub fn display_widths(s: &str) -> Vec<usize> {
102 UnicodeSegmentation::graphemes(s, true)
103 .map(get_display_width)
104 .collect()
105}
106
107/// Returns the display width of each grapheme cluster in the input string.
108///
109/// This function splits the string into Unicode grapheme clusters and pairs
110/// each one with its terminal display width (in columns). This is useful for
111/// visually aligned rendering, layout calculation, and Unicode debugging,
112/// especially with complex emoji or East Asian characters.
113///
114/// # Arguments
115///
116/// * `s` - The input string to analyze
117///
118/// # Returns
119///
120/// A vector of tuples, where each item is a grapheme cluster and its
121/// corresponding display width: `(&str, usize)`
122///
123/// # Example
124///
125/// ```rust
126/// use runefix_core::grapheme_widths;
127///
128/// let result = grapheme_widths("Hi๏ผไธ็");
129/// assert_eq!(
130/// result,
131/// vec![("H", 1), ("i", 1), ("๏ผ", 2), ("ไธ", 2), ("็", 2)]
132/// );
133/// ```
134pub fn grapheme_widths(s: &str) -> Vec<(&str, usize)> {
135 UnicodeSegmentation::graphemes(s, true)
136 .map(|g| (g, get_display_width(g)))
137 .collect()
138}
139
140/// Truncates a string by display width while preserving grapheme cluster boundaries.
141///
142/// This function ensures that wide characters such as emoji or CJK ideographs are
143/// never split in the middle. It safely cuts off the string so that its total
144/// display width does not exceed the given `max_width`, making it ideal for
145/// terminal or TUI rendering.
146///
147/// # Arguments
148///
149/// * `s` - The input string to truncate
150/// * `max_width` - Maximum allowed display width in terminal columns
151///
152/// # Returns
153///
154/// A string slice that fits within the specified display width without cutting graphemes.
155///
156/// # Example
157///
158/// ```rust
159/// use runefix_core::truncate_by_width;
160///
161/// let s = "Hi ๐๏ผไธ็";
162/// let short = truncate_by_width(s, 6);
163/// assert_eq!(short, "Hi ๐");
164/// ```
165pub fn truncate_by_width(s: &str, max_width: usize) -> &str {
166 let mut total_width = 0;
167 let mut end_byte = 0;
168
169 for g in UnicodeSegmentation::graphemes(s, true) {
170 let w: usize = get_display_width(g);
171
172 if total_width + w > max_width {
173 break;
174 }
175
176 total_width += w;
177 end_byte += g.len(); // Byte offset to cut safely
178 }
179
180 &s[..end_byte]
181}
182
183/// Splits a string into lines based on display width, preserving grapheme boundaries.
184///
185/// This function ensures that wide characters such as emoji, CJK ideographs, or
186/// fullwidth punctuation are not split mid-grapheme. It breaks the input string
187/// into a sequence of lines, each with a total display width that does not exceed
188/// the given `max_width`. Ideal for terminal word wrapping and monospace layout.
189///
190/// # Arguments
191///
192/// * `s` - The input string to wrap
193/// * `max_width` - Maximum display width (in columns) for each line
194///
195/// # Returns
196///
197/// A vector of strings, each representing a wrapped line within the given width.
198///
199/// # Example
200///
201/// ```rust
202/// use runefix_core::split_by_width;
203///
204/// let lines = split_by_width("Hello ๐ ไธ็๏ผ", 5);
205/// assert_eq!(lines, vec!["Hello", " ๐ ", "ไธ็", "๏ผ"]);
206/// ```
207pub fn split_by_width(s: &str, max_width: usize) -> Vec<String> {
208 let mut result = Vec::new();
209 let mut current_line = String::new();
210 let mut current_width = 0;
211
212 for g in UnicodeSegmentation::graphemes(s, true) {
213 let w: usize = get_display_width(g);
214
215 if current_width + w > max_width && !current_line.is_empty() {
216 result.push(current_line.clone());
217 current_line.clear();
218 current_width = 0;
219 }
220
221 current_line.push_str(g);
222 current_width += w;
223 }
224
225 if !current_line.is_empty() {
226 result.push(current_line);
227 }
228
229 result
230}