string-width 0.1.0

Accurate Unicode string width calculation for terminal applications, handling emoji, East Asian characters, combining marks, and ANSI escape sequences
Documentation
//! Main width calculation logic
//!
//! This module contains the core string width calculation functionality,
//! bringing together emoji detection, character classification, and
//! East Asian width handling.
use east_asian_width::east_asian_width;
use regex::Regex;
use std::sync::OnceLock;
use unicode_segmentation::UnicodeSegmentation;

use crate::character_classification::{find_main_character_optimized, is_halfwidth_fullwidth};
use crate::emoji::is_rgi_emoji;
use crate::options::{AmbiguousWidthTreatment, StringWidthOptions};

/// Regex source describing text made up solely of zero-width characters
///
/// The pattern is anchored at both ends (`^[...]+$`), so it only matches when
/// *every* character of the tested text falls into one of these ranges and
/// the text contains at least one character:
/// - C0 controls (0000-001F)
/// - DEL and C1 controls (007F-009F)
/// - Soft hyphen (00AD)
/// - Combining diacritical marks (0300-036F)
/// - Zero-width space/joiners and directional marks (200B-200F)
/// - Bidirectional formatting (202A-202E)
/// - Word joiner and other format characters (2060-206F)
/// - Variation selectors (FE00-FE0F)
/// - Combining half marks (FE20-FE2F)
/// - Byte order mark / zero width no-break space (FEFF)
/// - Interlinear annotation (FFF9-FFFB)
/// - Tag characters (E0000-E007F)
const ZERO_WIDTH_PATTERN: &str = concat!(
    r"^[\u{0000}-\u{001F}", // C0 controls
    r"\u{007F}-\u{009F}",   // DEL and C1 controls
    r"\u{00AD}",            // Soft hyphen
    r"\u{0300}-\u{036F}",   // Combining diacriticals
    r"\u{200B}-\u{200F}",   // Zero-width space/joiners, LRM/RLM
    r"\u{202A}-\u{202E}",   // Bidirectional formatting
    r"\u{2060}-\u{206F}",   // Word joiner, etc.
    r"\u{FE00}-\u{FE0F}",   // Variation selectors
    r"\u{FE20}-\u{FE2F}",   // Combining half marks
    r"\u{FEFF}",            // Zero width no-break space (BOM)
    r"\u{FFF9}-\u{FFFB}",   // Interlinear annotation
    r"\u{E0000}-\u{E007F}", // Tag characters
    r"]+$"
);

// Control character ranges for ANSI-aware processing.
//
// Together these describe the two ranges that `calculate_ansi_aware_width`
// counts as one column each: U+0000..=U+001F and U+007F..=U+009F.

/// Highest code point of the C0 control range (U+0000..=U+001F)
const CONTROL_CHAR_MAX: u32 = 0x1F;
/// DEL character code point; also the lower bound of the DEL + C1 range
const DEL_CHAR: u32 = 0x7F;
/// Highest code point of the C1 control range (upper bound of U+007F..=U+009F)
const C1_CONTROL_MAX: u32 = 0x9F;

/// Lazily initialised, compiled form of `ZERO_WIDTH_PATTERN`
///
/// The cell starts out empty and is filled by `get_zero_width_regex` on its
/// first call, so the pattern is compiled once and then reused for every
/// zero-width check.
static ZERO_WIDTH_CLUSTER_REGEX: OnceLock<Regex> = OnceLock::new();

/// Returns the shared zero-width regex, compiling it on the first call
///
/// The regex is built from `ZERO_WIDTH_PATTERN` and cached in
/// `ZERO_WIDTH_CLUSTER_REGEX`; subsequent calls return the cached instance
/// without recompiling.
///
/// # Returns
///
/// A reference to the compiled regex for zero-width character detection
///
/// # Panics
///
/// Panics on first use if `ZERO_WIDTH_PATTERN` fails to compile. The pattern
/// is a constant, so a panic here would point at a mistake in the pattern
/// itself, never at caller input.
fn get_zero_width_regex() -> &'static Regex {
    ZERO_WIDTH_CLUSTER_REGEX
        // The closure only runs while the cell is still empty.
        .get_or_init(|| Regex::new(ZERO_WIDTH_PATTERN).expect("Zero-width regex should be valid"))
}

/// Checks if a grapheme cluster consists entirely of zero-width characters
///
/// The check is a single match against the cached regex returned by
/// `get_zero_width_regex`. That regex is compiled (and validated) at runtime
/// on first use, not at compile time.
///
/// # Arguments
///
/// * `segment` - The grapheme cluster to check
///
/// # Returns
///
/// `true` if the entire segment consists of zero-width characters, `false`
/// otherwise. An empty `segment` yields `false`, because the pattern
/// requires at least one character.
fn is_zero_width_cluster(segment: &str) -> bool {
    get_zero_width_regex().is_match(segment)
}

/// Sums the width of Halfwidth and Fullwidth Forms that follow the first
/// character of a grapheme cluster
///
/// The first `char` of the cluster is never counted here. Every later `char`
/// accepted by `is_halfwidth_fullwidth` contributes its East Asian width;
/// all other trailing characters contribute nothing.
///
/// # Arguments
///
/// * `segment` - The grapheme cluster to analyze
/// * `ambiguous_as_wide` - Whether to treat ambiguous characters as wide
///
/// # Returns
///
/// The additional width contributed by trailing characters
fn calculate_trailing_width(segment: &str, ambiguous_as_wide: bool) -> usize {
    // Zero or one byte means at most one character, so nothing can trail it.
    if segment.len() <= 1 {
        return 0;
    }

    let mut extra_width: usize = 0;
    for trailing in segment.chars().skip(1) {
        if is_halfwidth_fullwidth(trailing) {
            extra_width += east_asian_width((trailing as u32, ambiguous_as_wide)).as_usize();
        }
    }
    extra_width
}

/// Computes the display width of a string under the given options
///
/// The input is first passed through `prepare_input`, which strips ANSI
/// escape sequences unless `options.count_ansi` is set. The result is then
/// measured one grapheme cluster at a time, which covers:
/// - Zero-width character clusters
/// - Emoji sequences
/// - East Asian character widths
/// - Combining characters and modifiers
///
/// # Arguments
///
/// * `input` - The string to measure
/// * `options` - Configuration options for the calculation
///
/// # Returns
///
/// The display width of the string in terminal columns; `0` for an empty
/// string
///
/// # Examples
///
/// ```rust
/// use string_width::{string_width_with_options, StringWidthOptions};
///
/// let options = StringWidthOptions::default();
/// assert_eq!(string_width_with_options("Hello", options.clone()), 5);
/// assert_eq!(string_width_with_options("😀", options), 2);
/// ```
pub fn string_width_with_options(input: &str, options: StringWidthOptions) -> usize {
    match input {
        // Nothing to measure; skip preprocessing entirely.
        "" => 0,
        text => {
            let prepared = prepare_input(text, options.count_ansi);
            calculate_grapheme_widths(&prepared, &options)
        }
    }
}

/// Produces the text that will actually be measured
///
/// When ANSI sequences are to be counted the input is handed back untouched;
/// otherwise the sequences are removed with `strip_ansi_escapes::strip`
/// first.
///
/// # Arguments
///
/// * `input` - The input string
/// * `count_ansi` - Whether to preserve ANSI sequences
///
/// # Returns
///
/// A borrowed `Cow` over the original string when `count_ansi` is `true`,
/// or an owned `Cow` holding the stripped text when it is `false`
fn prepare_input(input: &str, count_ansi: bool) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    if !count_ansi {
        // The stripper works on bytes, so rebuild a `String` from its output.
        let without_ansi = strip_ansi_escapes::strip(input);
        return Cow::Owned(String::from_utf8_lossy(&without_ansi).into_owned());
    }

    Cow::Borrowed(input)
}

/// Adds up the widths of all extended grapheme clusters in `input`
///
/// # Arguments
///
/// * `input` - The string to process
/// * `options` - Configuration options
///
/// # Returns
///
/// The total display width; `0` for an empty string
fn calculate_grapheme_widths(input: &str, options: &StringWidthOptions) -> usize {
    if input.is_empty() {
        return 0;
    }

    // Resolve the ambiguous-width policy once instead of once per cluster.
    let ambiguous_as_wide = options.ambiguous_width == AmbiguousWidthTreatment::Wide;

    let mut total_width: usize = 0;
    for cluster in input.graphemes(true) {
        total_width += calculate_segment_width(cluster, options, ambiguous_as_wide);
    }
    total_width
}

/// Calculates the width of a single grapheme segment
///
/// Rules are applied in this order:
/// 1. With `options.count_ansi` set, a cluster that contains a C0/C1 control
///    character (or DEL) is measured by `calculate_ansi_aware_width`, so the
///    bytes of an escape sequence each take up a column.
/// 2. A cluster made up only of zero-width characters has width 0.
/// 3. An RGI emoji cluster has width 2.
/// 4. Everything else is measured by `calculate_standard_width`.
///
/// Counting ANSI sequences only changes how control characters are treated.
/// Combining marks, joiners, variation selectors and similar zero-width
/// characters stay zero-width in both modes.
///
/// # Arguments
///
/// * `segment` - The grapheme cluster to measure
/// * `options` - Configuration options
/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
///
/// # Returns
///
/// The display width of the segment
fn calculate_segment_width(
    segment: &str,
    options: &StringWidthOptions,
    ambiguous_as_wide: bool,
) -> usize {
    // `char::is_control` covers exactly U+0000..=U+001F and U+007F..=U+009F,
    // the characters the ANSI-aware path counts as one column each. Limiting
    // that path to clusters which really contain such a character keeps
    // clusters like "e\u{0301}" or a lone zero-width space from gaining width
    // merely because ANSI counting is switched on.
    if options.count_ansi && segment.chars().any(char::is_control) {
        return calculate_ansi_aware_width(segment, ambiguous_as_wide);
    }

    if is_zero_width_cluster(segment) {
        return 0;
    }

    if is_rgi_emoji(segment) {
        return 2;
    }

    calculate_standard_width(segment, ambiguous_as_wide)
}

/// Measures a segment character by character, counting control characters
///
/// Every `char` of the segment contributes to the result: C0 controls, DEL
/// and C1 controls count as one column each, and any other character
/// contributes its East Asian width.
///
/// # Arguments
///
/// * `segment` - The grapheme cluster to measure
/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
///
/// # Returns
///
/// The display width including control characters
fn calculate_ansi_aware_width(segment: &str, ambiguous_as_wide: bool) -> usize {
    let mut width: usize = 0;
    for ch in segment.chars() {
        let code_point = ch as u32;
        width += match code_point {
            // Control characters count as width 1
            0..=CONTROL_CHAR_MAX | DEL_CHAR..=C1_CONTROL_MAX => 1,
            _ => east_asian_width((code_point, ambiguous_as_wide)).as_usize(),
        };
    }
    width
}

/// Measures a segment from its main character plus trailing fullwidth forms
///
/// The main character is whatever `find_main_character_optimized` reports
/// for the segment. Its East Asian width forms the base, and
/// `calculate_trailing_width` adds the width of any Halfwidth and Fullwidth
/// Forms that follow the first character.
///
/// # Arguments
///
/// * `segment` - The grapheme cluster to measure
/// * `ambiguous_as_wide` - Whether ambiguous characters are treated as wide
///
/// # Returns
///
/// The display width using standard rules, or `0` when no main character is
/// found
fn calculate_standard_width(segment: &str, ambiguous_as_wide: bool) -> usize {
    let Some(main_char) = find_main_character_optimized(segment) else {
        // Nothing in the segment qualifies as a main character.
        return 0;
    };

    let base = east_asian_width((main_char.code_point, ambiguous_as_wide)).as_usize();
    let trailing = calculate_trailing_width(segment, ambiguous_as_wide);
    base + trailing
}

/// Trait for types that can have their display width calculated
///
/// Implemented in this module for `str` and `String`, so the width of a
/// value can be queried with method syntax (`"text".display_width()`)
/// instead of calling the free functions.
pub trait DisplayWidth {
    /// Calculate the display width using default options
    ///
    /// Returns the width in terminal columns.
    fn display_width(&self) -> usize;

    /// Calculate the display width with custom options
    ///
    /// `options` is taken by value; returns the width in terminal columns.
    fn display_width_with_options(&self, options: StringWidthOptions) -> usize;
}

impl DisplayWidth for str {
    /// Measures the string slice with `StringWidthOptions::default()`
    ///
    /// # Examples
    ///
    /// ```rust
    /// use string_width::DisplayWidth;
    ///
    /// assert_eq!("Hello".display_width(), 5);
    /// assert_eq!("😀".display_width(), 2);
    /// ```
    fn display_width(&self) -> usize {
        self.display_width_with_options(StringWidthOptions::default())
    }

    /// Measures the string slice with caller-supplied options
    ///
    /// Forwards directly to `string_width_with_options`.
    ///
    /// # Arguments
    ///
    /// * `options` - Configuration options for the calculation
    ///
    /// # Examples
    ///
    /// ```rust
    /// use string_width::{DisplayWidth, StringWidthOptions};
    ///
    /// let options = StringWidthOptions::builder()
    ///     .count_ansi(true)
    ///     .build();
    /// assert_eq!("\x1b[31mRed\x1b[0m".display_width_with_options(options), 12);
    /// ```
    fn display_width_with_options(&self, options: StringWidthOptions) -> usize {
        string_width_with_options(self, options)
    }
}

impl DisplayWidth for String {
    /// Measures the owned string with default options
    ///
    /// Delegates to the `str` implementation.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use string_width::DisplayWidth;
    ///
    /// let text = String::from("Hello 🌍");
    /// assert_eq!(text.display_width(), 8);
    /// ```
    fn display_width(&self) -> usize {
        <str as DisplayWidth>::display_width(self.as_str())
    }

    /// Measures the owned string with caller-supplied options
    ///
    /// Delegates to the `str` implementation.
    ///
    /// # Arguments
    ///
    /// * `options` - Configuration options for the calculation
    ///
    /// # Examples
    ///
    /// ```rust
    /// use string_width::{DisplayWidth, StringWidthOptions};
    ///
    /// let text = String::from("±×÷");
    /// let options = StringWidthOptions::builder()
    ///     .ambiguous_as_wide()
    ///     .build();
    /// assert_eq!(text.display_width_with_options(options), 6);
    /// ```
    fn display_width_with_options(&self, options: StringWidthOptions) -> usize {
        <str as DisplayWidth>::display_width_with_options(self.as_str(), options)
    }
}

/// Convenience function for calculating string width with default options
///
/// This is the main public API for simple width calculations. It forwards
/// to [`DisplayWidth::display_width`] on `str`, which measures the input
/// with `StringWidthOptions::default()`. As the example below shows, ANSI
/// escape sequences are not counted under the defaults.
///
/// # Arguments
///
/// * `input` - The string to measure
///
/// # Returns
///
/// The display width of the string in terminal columns
///
/// # Examples
///
/// ```rust
/// use string_width::string_width;
///
/// assert_eq!(string_width("Hello"), 5);
/// assert_eq!(string_width("你好"), 4);  // Chinese characters
/// assert_eq!(string_width("😀"), 2);  // Emoji
/// assert_eq!(string_width("\x1b[31mRed\x1b[0m"), 3);  // ANSI stripped
/// ```
pub fn string_width(input: &str) -> usize {
    input.display_width()
}