string_width/width_calculation.rs
//! Main width calculation logic
//!
//! This module contains the core string width calculation functionality,
//! bringing together emoji detection, character classification, and
//! East Asian width handling.
6use east_asian_width::east_asian_width;
7use regex::Regex;
8use std::sync::OnceLock;
9use unicode_segmentation::UnicodeSegmentation;
10
11use crate::character_classification::{find_main_character_optimized, is_halfwidth_fullwidth};
12use crate::emoji::is_rgi_emoji;
13use crate::options::{AmbiguousWidthTreatment, StringWidthOptions};
14
/// Regex source matching text made up solely of zero-width code points
///
/// The pattern is anchored at both ends (`^[...]+$`), so it only matches a
/// non-empty input in which every character falls into one of these ranges:
/// - C0 controls (U+0000-U+001F)
/// - DEL and C1 controls (U+007F-U+009F)
/// - Soft hyphen (U+00AD)
/// - Combining diacritical marks (U+0300-U+036F)
/// - Zero-width spaces and joiners (U+200B-U+200F)
/// - Bidirectional formatting (U+202A-U+202E)
/// - Word joiner and other format characters (U+2060-U+206F)
/// - Variation selectors (U+FE00-U+FE0F)
/// - Combining half marks (U+FE20-U+FE2F)
/// - Zero width no-break space / byte order mark (U+FEFF)
/// - Interlinear annotation (U+FFF9-U+FFFB)
/// - Tag characters (U+E0000-U+E007F)
const ZERO_WIDTH_PATTERN: &str = concat!(
    "^[",
    r"\u{0000}-\u{001F}\u{007F}-\u{009F}", // C0 controls, DEL and C1 controls
    r"\u{00AD}",                           // soft hyphen
    r"\u{0300}-\u{036F}",                  // combining diacriticals
    r"\u{200B}-\u{200F}\u{202A}-\u{202E}", // zero-width spaces/joiners, bidi formatting
    r"\u{2060}-\u{206F}",                  // word joiner, etc.
    r"\u{FE00}-\u{FE0F}\u{FE20}-\u{FE2F}", // variation selectors, combining half marks
    r"\u{FEFF}",                           // zero width no-break space
    r"\u{FFF9}-\u{FFFB}",                  // interlinear annotation
    r"\u{E0000}-\u{E007F}",                // tag characters
    "]+$"
);
43
// Code point bounds of the control characters that the ANSI-aware width
// calculation counts as one column each.

/// Highest C0 control code point (U+001F)
const CONTROL_CHAR_MAX: u32 = 0x001F;
/// Code point of DEL (U+007F), the lower bound of the DEL/C1 control range
const DEL_CHAR: u32 = 0x007F;
/// Highest C1 control code point (U+009F)
const C1_CONTROL_MAX: u32 = 0x009F;
51
52/// Compiled regex for zero-width and non-printing characters
53///
54/// This static regex is compiled once and reused for all zero-width checks.
55static ZERO_WIDTH_CLUSTER_REGEX: OnceLock<Regex> = OnceLock::new();
56
57/// Get the compiled zero-width regex, initializing it if necessary
58///
59/// # Returns
60///
61/// A reference to the compiled regex for zero-width character detection
62///
63/// # Panics
64///
65/// Panics if the regex pattern is invalid (should never happen with our constant pattern)
66fn get_zero_width_regex() -> &'static Regex {
67 ZERO_WIDTH_CLUSTER_REGEX
68 .get_or_init(|| Regex::new(ZERO_WIDTH_PATTERN).expect("Zero-width regex should be valid"))
69}
70
71/// Checks if a grapheme cluster consists entirely of zero-width characters
72///
73/// Uses a compiled regex that is guaranteed to be valid at compile time.
74///
75/// # Arguments
76///
77/// * `segment` - The grapheme cluster to check
78///
79/// # Returns
80///
81/// `true` if the entire segment consists of zero-width characters, `false` otherwise
82fn is_zero_width_cluster(segment: &str) -> bool {
83 get_zero_width_regex().is_match(segment)
84}
85
86/// Calculates additional width from trailing Halfwidth and Fullwidth Forms
87///
88/// This function processes characters after the first character in a grapheme
89/// cluster to account for additional width from combining characters in the
90/// Halfwidth and Fullwidth Forms block.
91///
92/// # Arguments
93///
94/// * `segment` - The grapheme cluster to analyze
95/// * `ambiguous_as_wide` - Whether to treat ambiguous characters as wide
96///
97/// # Returns
98///
99/// The additional width contributed by trailing characters
100fn calculate_trailing_width(segment: &str, ambiguous_as_wide: bool) -> usize {
101 if segment.len() <= 1 {
102 return 0;
103 }
104
105 segment
106 .chars()
107 .skip(1)
108 .filter(|&ch| is_halfwidth_fullwidth(ch))
109 .map(|ch| east_asian_width((ch as u32, ambiguous_as_wide)).as_usize())
110 .sum()
111}
112
113/// Main function that calculates the display width of a string
114///
115/// This is the core width calculation function that processes a string
116/// grapheme by grapheme, handling:
117/// - ANSI escape sequences (optionally)
118/// - Zero-width character clusters
119/// - Emoji sequences
120/// - East Asian character widths
121/// - Combining characters and modifiers
122///
123/// # Arguments
124///
125/// * `input` - The string to measure
126/// * `options` - Configuration options for the calculation
127///
128/// # Returns
129///
130/// The display width of the string in terminal columns
131///
132/// # Examples
133///
134/// ```rust
135/// use string_width::{string_width_with_options, StringWidthOptions};
136///
137/// let options = StringWidthOptions::default();
138/// assert_eq!(string_width_with_options("Hello", options.clone()), 5);
139/// assert_eq!(string_width_with_options("๐", options), 2);
140/// ```
141pub fn string_width_with_options(input: &str, options: StringWidthOptions) -> usize {
142 if input.is_empty() {
143 return 0;
144 }
145
146 let processed_input = prepare_input(input, options.count_ansi);
147 calculate_grapheme_widths(&processed_input, &options)
148}
149
150/// Prepares input string by optionally stripping ANSI escape sequences
151///
152/// # Arguments
153///
154/// * `input` - The input string
155/// * `count_ansi` - Whether to preserve ANSI sequences
156///
157/// # Returns
158///
159/// Either a borrowed reference to the original string (if preserving ANSI)
160/// or an owned string with ANSI sequences stripped
161fn prepare_input(input: &str, count_ansi: bool) -> std::borrow::Cow<'_, str> {
162 if count_ansi {
163 std::borrow::Cow::Borrowed(input)
164 } else {
165 let stripped_bytes = strip_ansi_escapes::strip(input);
166 let stripped = String::from_utf8_lossy(&stripped_bytes).into_owned();
167 std::borrow::Cow::Owned(stripped)
168 }
169}
170
171/// Calculates the total width by processing grapheme clusters
172///
173/// # Arguments
174///
175/// * `input` - The string to process
176/// * `options` - Configuration options
177///
178/// # Returns
179///
180/// The total display width
181fn calculate_grapheme_widths(input: &str, options: &StringWidthOptions) -> usize {
182 if input.is_empty() {
183 return 0;
184 }
185
186 let ambiguous_as_wide = options.ambiguous_width == AmbiguousWidthTreatment::Wide;
187
188 input
189 .graphemes(true)
190 .map(|segment| calculate_segment_width(segment, options, ambiguous_as_wide))
191 .sum()
192}
193
194/// Calculates the width of a single grapheme segment
195///
196/// # Arguments
197///
198/// * `segment` - The grapheme cluster to measure
199/// * `options` - Configuration options
200/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
201///
202/// # Returns
203///
204/// The display width of the segment
205fn calculate_segment_width(
206 segment: &str,
207 options: &StringWidthOptions,
208 ambiguous_as_wide: bool,
209) -> usize {
210 if !options.count_ansi && is_zero_width_cluster(segment) {
211 return 0;
212 }
213
214 if is_rgi_emoji(segment) {
215 return 2;
216 }
217
218 if options.count_ansi {
219 calculate_ansi_aware_width(segment, ambiguous_as_wide)
220 } else {
221 calculate_standard_width(segment, ambiguous_as_wide)
222 }
223}
224
225/// Calculates width when ANSI escape sequences should be counted
226///
227/// In this mode, control characters are given width 1 instead of being
228/// treated as zero-width.
229///
230/// # Arguments
231///
232/// * `segment` - The grapheme cluster to measure
233/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
234///
235/// # Returns
236///
237/// The display width including control characters
238fn calculate_ansi_aware_width(segment: &str, ambiguous_as_wide: bool) -> usize {
239 segment
240 .chars()
241 .map(|ch| {
242 let code_point = ch as u32;
243 if code_point <= CONTROL_CHAR_MAX || (DEL_CHAR..=C1_CONTROL_MAX).contains(&code_point) {
244 1 // Control characters count as width 1
245 } else {
246 east_asian_width((code_point, ambiguous_as_wide)).as_usize()
247 }
248 })
249 .sum()
250}
251
252/// Calculates width using standard East Asian width rules
253///
254/// This is the standard width calculation that ignores control characters
255/// and uses Unicode East Asian Width properties.
256///
257/// # Arguments
258///
259/// * `segment` - The grapheme cluster to measure
260/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
261///
262/// # Returns
263///
264/// The display width using standard rules
265fn calculate_standard_width(segment: &str, ambiguous_as_wide: bool) -> usize {
266 let main_char_info = find_main_character_optimized(segment);
267 match main_char_info {
268 Some(info) => {
269 let base_width = east_asian_width((info.code_point, ambiguous_as_wide)).as_usize();
270 let trailing_width = calculate_trailing_width(segment, ambiguous_as_wide);
271 base_width + trailing_width
272 }
273 None => 0,
274 }
275}
276
277/// Trait for types that can have their display width calculated
278///
279/// This trait provides a clean API for calculating string width
280/// with better naming than the original StringWidthInput.
281pub trait DisplayWidth {
282 /// Calculate the display width using default options
283 fn display_width(&self) -> usize;
284
285 /// Calculate the display width with custom options
286 fn display_width_with_options(&self, options: StringWidthOptions) -> usize;
287}
288
289impl DisplayWidth for str {
290 /// Calculate the display width using default options
291 ///
292 /// # Examples
293 ///
294 /// ```rust
295 /// use string_width::DisplayWidth;
296 ///
297 /// assert_eq!("Hello".display_width(), 5);
298 /// assert_eq!("๐".display_width(), 2);
299 /// ```
300 fn display_width(&self) -> usize {
301 string_width_with_options(self, StringWidthOptions::default())
302 }
303
304 /// Calculate the display width with custom options
305 ///
306 /// # Arguments
307 ///
308 /// * `options` - Configuration options for the calculation
309 ///
310 /// # Examples
311 ///
312 /// ```rust
313 /// use string_width::{DisplayWidth, StringWidthOptions};
314 ///
315 /// let options = StringWidthOptions::builder()
316 /// .count_ansi(true)
317 /// .build();
318 /// assert_eq!("\x1b[31mRed\x1b[0m".display_width_with_options(options), 12);
319 /// ```
320 fn display_width_with_options(&self, options: StringWidthOptions) -> usize {
321 string_width_with_options(self, options)
322 }
323}
324
325impl DisplayWidth for String {
326 /// Calculate the display width using default options
327 ///
328 /// # Examples
329 ///
330 /// ```rust
331 /// use string_width::DisplayWidth;
332 ///
333 /// let text = String::from("Hello ๐");
334 /// assert_eq!(text.display_width(), 8);
335 /// ```
336 fn display_width(&self) -> usize {
337 self.as_str().display_width()
338 }
339
340 /// Calculate the display width with custom options
341 ///
342 /// # Arguments
343 ///
344 /// * `options` - Configuration options for the calculation
345 ///
346 /// # Examples
347 ///
348 /// ```rust
349 /// use string_width::{DisplayWidth, StringWidthOptions};
350 ///
351 /// let text = String::from("ยฑรรท");
352 /// let options = StringWidthOptions::builder()
353 /// .ambiguous_as_wide()
354 /// .build();
355 /// assert_eq!(text.display_width_with_options(options), 6);
356 /// ```
357 fn display_width_with_options(&self, options: StringWidthOptions) -> usize {
358 self.as_str().display_width_with_options(options)
359 }
360}
361
362/// Convenience function for calculating string width with default options
363///
364/// This is the main public API for simple width calculations.
365/// It uses default options (no ANSI counting, ambiguous as narrow).
366///
367/// # Arguments
368///
369/// * `input` - The string to measure
370///
371/// # Returns
372///
373/// The display width of the string in terminal columns
374///
375/// # Examples
376///
377/// ```rust
378/// use string_width::string_width;
379///
380/// assert_eq!(string_width("Hello"), 5);
381/// assert_eq!(string_width("ไฝ ๅฅฝ"), 4); // Chinese characters
382/// assert_eq!(string_width("๐"), 2); // Emoji
383/// assert_eq!(string_width("\x1b[31mRed\x1b[0m"), 3); // ANSI stripped
384/// ```
385pub fn string_width(input: &str) -> usize {
386 input.display_width()
387}