string_width/
width_calculation.rs

//! Main width calculation logic
//!
//! This module contains the core string width calculation functionality,
//! bringing together emoji detection, character classification, and
//! East Asian width handling.
6use east_asian_width::east_asian_width;
7use regex::Regex;
8use std::sync::OnceLock;
9use unicode_segmentation::UnicodeSegmentation;
10
11use crate::character_classification::{find_main_character_optimized, is_halfwidth_fullwidth};
12use crate::emoji::is_rgi_emoji;
13use crate::options::{AmbiguousWidthTreatment, StringWidthOptions};
14
/// Regex source describing text made up solely of zero-width characters
///
/// The pattern is anchored at both ends (`^[...]+$`), so it only matches when
/// every character of the tested text falls in one of these ranges:
/// - C0 control characters (0000-001F)
/// - DEL and C1 control characters (007F-009F)
/// - Soft hyphen (00AD)
/// - Combining diacritical marks (0300-036F)
/// - Zero-width spaces and joiners (200B-200F)
/// - Bidirectional formatting (202A-202E)
/// - Word joiner and other format characters (2060-206F)
/// - Variation selectors (FE00-FE0F)
/// - Combining half marks (FE20-FE2F)
/// - Byte order mark / zero width no-break space (FEFF)
/// - Interlinear annotation (FFF9-FFFB)
/// - Tag characters (E0000-E007F)
const ZERO_WIDTH_PATTERN: &str = concat!(
    // Anchor at the start and open the character class.
    "^[",
    // Control characters: C0, then DEL through C1.
    r"\u{0000}-\u{001F}",
    r"\u{007F}-\u{009F}",
    // Soft hyphen.
    r"\u{00AD}",
    // Combining diacritical marks.
    r"\u{0300}-\u{036F}",
    // Invisible spacing, joining and directional format characters.
    r"\u{200B}-\u{200F}",
    r"\u{202A}-\u{202E}",
    r"\u{2060}-\u{206F}",
    // Variation selectors and combining half marks.
    r"\u{FE00}-\u{FE0F}",
    r"\u{FE20}-\u{FE2F}",
    // Byte order mark and interlinear annotation controls.
    r"\u{FEFF}",
    r"\u{FFF9}-\u{FFFB}",
    // Tag characters.
    r"\u{E0000}-\u{E007F}",
    // Close the class; require one or more characters up to the end.
    "]+$",
);
43
// Code point boundaries used to recognise control characters in the
// ANSI-aware width calculation.

/// Highest code point of the C0 control characters (U+001F)
const CONTROL_CHAR_MAX: u32 = 0x001F;
/// Code point of the DEL character (U+007F)
const DEL_CHAR: u32 = 0x007F;
/// Highest code point of the C1 control characters (U+009F)
const C1_CONTROL_MAX: u32 = 0x009F;
51
52/// Compiled regex for zero-width and non-printing characters
53///
54/// This static regex is compiled once and reused for all zero-width checks.
55static ZERO_WIDTH_CLUSTER_REGEX: OnceLock<Regex> = OnceLock::new();
56
57/// Get the compiled zero-width regex, initializing it if necessary
58///
59/// # Returns
60///
61/// A reference to the compiled regex for zero-width character detection
62///
63/// # Panics
64///
65/// Panics if the regex pattern is invalid (should never happen with our constant pattern)
66fn get_zero_width_regex() -> &'static Regex {
67    ZERO_WIDTH_CLUSTER_REGEX
68        .get_or_init(|| Regex::new(ZERO_WIDTH_PATTERN).expect("Zero-width regex should be valid"))
69}
70
71/// Checks if a grapheme cluster consists entirely of zero-width characters
72///
73/// Uses a compiled regex that is guaranteed to be valid at compile time.
74///
75/// # Arguments
76///
77/// * `segment` - The grapheme cluster to check
78///
79/// # Returns
80///
81/// `true` if the entire segment consists of zero-width characters, `false` otherwise
82fn is_zero_width_cluster(segment: &str) -> bool {
83    get_zero_width_regex().is_match(segment)
84}
85
86/// Calculates additional width from trailing Halfwidth and Fullwidth Forms
87///
88/// This function processes characters after the first character in a grapheme
89/// cluster to account for additional width from combining characters in the
90/// Halfwidth and Fullwidth Forms block.
91///
92/// # Arguments
93///
94/// * `segment` - The grapheme cluster to analyze
95/// * `ambiguous_as_wide` - Whether to treat ambiguous characters as wide
96///
97/// # Returns
98///
99/// The additional width contributed by trailing characters
100fn calculate_trailing_width(segment: &str, ambiguous_as_wide: bool) -> usize {
101    if segment.len() <= 1 {
102        return 0;
103    }
104
105    segment
106        .chars()
107        .skip(1)
108        .filter(|&ch| is_halfwidth_fullwidth(ch))
109        .map(|ch| east_asian_width((ch as u32, ambiguous_as_wide)).as_usize())
110        .sum()
111}
112
113/// Main function that calculates the display width of a string
114///
115/// This is the core width calculation function that processes a string
116/// grapheme by grapheme, handling:
117/// - ANSI escape sequences (optionally)
118/// - Zero-width character clusters
119/// - Emoji sequences
120/// - East Asian character widths
121/// - Combining characters and modifiers
122///
123/// # Arguments
124///
125/// * `input` - The string to measure
126/// * `options` - Configuration options for the calculation
127///
128/// # Returns
129///
130/// The display width of the string in terminal columns
131///
132/// # Examples
133///
134/// ```rust
135/// use string_width::{string_width_with_options, StringWidthOptions};
136///
137/// let options = StringWidthOptions::default();
138/// assert_eq!(string_width_with_options("Hello", options.clone()), 5);
139/// assert_eq!(string_width_with_options("๐Ÿ˜€", options), 2);
140/// ```
141pub fn string_width_with_options(input: &str, options: StringWidthOptions) -> usize {
142    if input.is_empty() {
143        return 0;
144    }
145
146    let processed_input = prepare_input(input, options.count_ansi);
147    calculate_grapheme_widths(&processed_input, &options)
148}
149
150/// Prepares input string by optionally stripping ANSI escape sequences
151///
152/// # Arguments
153///
154/// * `input` - The input string
155/// * `count_ansi` - Whether to preserve ANSI sequences
156///
157/// # Returns
158///
159/// Either a borrowed reference to the original string (if preserving ANSI)
160/// or an owned string with ANSI sequences stripped
161fn prepare_input(input: &str, count_ansi: bool) -> std::borrow::Cow<'_, str> {
162    if count_ansi {
163        std::borrow::Cow::Borrowed(input)
164    } else {
165        let stripped_bytes = strip_ansi_escapes::strip(input);
166        let stripped = String::from_utf8_lossy(&stripped_bytes).into_owned();
167        std::borrow::Cow::Owned(stripped)
168    }
169}
170
171/// Calculates the total width by processing grapheme clusters
172///
173/// # Arguments
174///
175/// * `input` - The string to process
176/// * `options` - Configuration options
177///
178/// # Returns
179///
180/// The total display width
181fn calculate_grapheme_widths(input: &str, options: &StringWidthOptions) -> usize {
182    if input.is_empty() {
183        return 0;
184    }
185
186    let ambiguous_as_wide = options.ambiguous_width == AmbiguousWidthTreatment::Wide;
187
188    input
189        .graphemes(true)
190        .map(|segment| calculate_segment_width(segment, options, ambiguous_as_wide))
191        .sum()
192}
193
194/// Calculates the width of a single grapheme segment
195///
196/// # Arguments
197///
198/// * `segment` - The grapheme cluster to measure
199/// * `options` - Configuration options
200/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
201///
202/// # Returns
203///
204/// The display width of the segment
205fn calculate_segment_width(
206    segment: &str,
207    options: &StringWidthOptions,
208    ambiguous_as_wide: bool,
209) -> usize {
210    if !options.count_ansi && is_zero_width_cluster(segment) {
211        return 0;
212    }
213
214    if is_rgi_emoji(segment) {
215        return 2;
216    }
217
218    if options.count_ansi {
219        calculate_ansi_aware_width(segment, ambiguous_as_wide)
220    } else {
221        calculate_standard_width(segment, ambiguous_as_wide)
222    }
223}
224
225/// Calculates width when ANSI escape sequences should be counted
226///
227/// In this mode, control characters are given width 1 instead of being
228/// treated as zero-width.
229///
230/// # Arguments
231///
232/// * `segment` - The grapheme cluster to measure
233/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
234///
235/// # Returns
236///
237/// The display width including control characters
238fn calculate_ansi_aware_width(segment: &str, ambiguous_as_wide: bool) -> usize {
239    segment
240        .chars()
241        .map(|ch| {
242            let code_point = ch as u32;
243            if code_point <= CONTROL_CHAR_MAX || (DEL_CHAR..=C1_CONTROL_MAX).contains(&code_point) {
244                1 // Control characters count as width 1
245            } else {
246                east_asian_width((code_point, ambiguous_as_wide)).as_usize()
247            }
248        })
249        .sum()
250}
251
252/// Calculates width using standard East Asian width rules
253///
254/// This is the standard width calculation that ignores control characters
255/// and uses Unicode East Asian Width properties.
256///
257/// # Arguments
258///
259/// * `segment` - The grapheme cluster to measure
260/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
261///
262/// # Returns
263///
264/// The display width using standard rules
265fn calculate_standard_width(segment: &str, ambiguous_as_wide: bool) -> usize {
266    let main_char_info = find_main_character_optimized(segment);
267    match main_char_info {
268        Some(info) => {
269            let base_width = east_asian_width((info.code_point, ambiguous_as_wide)).as_usize();
270            let trailing_width = calculate_trailing_width(segment, ambiguous_as_wide);
271            base_width + trailing_width
272        }
273        None => 0,
274    }
275}
276
277/// Trait for types that can have their display width calculated
278///
279/// This trait provides a clean API for calculating string width
280/// with better naming than the original StringWidthInput.
281pub trait DisplayWidth {
282    /// Calculate the display width using default options
283    fn display_width(&self) -> usize;
284
285    /// Calculate the display width with custom options
286    fn display_width_with_options(&self, options: StringWidthOptions) -> usize;
287}
288
289impl DisplayWidth for str {
290    /// Calculate the display width using default options
291    ///
292    /// # Examples
293    ///
294    /// ```rust
295    /// use string_width::DisplayWidth;
296    ///
297    /// assert_eq!("Hello".display_width(), 5);
298    /// assert_eq!("๐Ÿ˜€".display_width(), 2);
299    /// ```
300    fn display_width(&self) -> usize {
301        string_width_with_options(self, StringWidthOptions::default())
302    }
303
304    /// Calculate the display width with custom options
305    ///
306    /// # Arguments
307    ///
308    /// * `options` - Configuration options for the calculation
309    ///
310    /// # Examples
311    ///
312    /// ```rust
313    /// use string_width::{DisplayWidth, StringWidthOptions};
314    ///
315    /// let options = StringWidthOptions::builder()
316    ///     .count_ansi(true)
317    ///     .build();
318    /// assert_eq!("\x1b[31mRed\x1b[0m".display_width_with_options(options), 12);
319    /// ```
320    fn display_width_with_options(&self, options: StringWidthOptions) -> usize {
321        string_width_with_options(self, options)
322    }
323}
324
325impl DisplayWidth for String {
326    /// Calculate the display width using default options
327    ///
328    /// # Examples
329    ///
330    /// ```rust
331    /// use string_width::DisplayWidth;
332    ///
333    /// let text = String::from("Hello ๐ŸŒ");
334    /// assert_eq!(text.display_width(), 8);
335    /// ```
336    fn display_width(&self) -> usize {
337        self.as_str().display_width()
338    }
339
340    /// Calculate the display width with custom options
341    ///
342    /// # Arguments
343    ///
344    /// * `options` - Configuration options for the calculation
345    ///
346    /// # Examples
347    ///
348    /// ```rust
349    /// use string_width::{DisplayWidth, StringWidthOptions};
350    ///
351    /// let text = String::from("ยฑร—รท");
352    /// let options = StringWidthOptions::builder()
353    ///     .ambiguous_as_wide()
354    ///     .build();
355    /// assert_eq!(text.display_width_with_options(options), 6);
356    /// ```
357    fn display_width_with_options(&self, options: StringWidthOptions) -> usize {
358        self.as_str().display_width_with_options(options)
359    }
360}
361
362/// Convenience function for calculating string width with default options
363///
364/// This is the main public API for simple width calculations.
365/// It uses default options (no ANSI counting, ambiguous as narrow).
366///
367/// # Arguments
368///
369/// * `input` - The string to measure
370///
371/// # Returns
372///
373/// The display width of the string in terminal columns
374///
375/// # Examples
376///
377/// ```rust
378/// use string_width::string_width;
379///
380/// assert_eq!(string_width("Hello"), 5);
381/// assert_eq!(string_width("ไฝ ๅฅฝ"), 4);  // Chinese characters
382/// assert_eq!(string_width("๐Ÿ˜€"), 2);  // Emoji
383/// assert_eq!(string_width("\x1b[31mRed\x1b[0m"), 3);  // ANSI stripped
384/// ```
385pub fn string_width(input: &str) -> usize {
386    input.display_width()
387}