string-width 0.1.0

Accurate Unicode string width calculation for terminal applications, handling emoji, East Asian characters, combining marks, and ANSI escape sequences
Documentation
//! Character classification utilities for string width calculation.
//!
//! This module provides functions to classify Unicode characters
//! into the categories needed for width calculation.
use crate::unicode_constants::{combining_marks, format_chars, halfwidth_fullwidth, prepend_chars};

/// Classification record for a single character, used during width calculation.
///
/// Holds the character's code point together with three boolean flags that
/// say whether it is a combining mark, a format character, or a prepend
/// character. Instances are normally produced by `CharacterInfo::new`, which
/// fills in every flag from the code point.
///
/// The type is a small plain-data value (one `u32` and three `bool`s), so it
/// is `Copy` and supports equality comparison and hashing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct CharacterInfo {
    /// The Unicode code point of the character
    pub code_point: u32,
    /// Whether this character is a combining mark
    pub is_combining: bool,
    /// Whether this character is a format character
    pub is_format: bool,
    /// Whether this character is a prepend character
    pub is_prepend: bool,
}

impl CharacterInfo {
    /// Build the classification record for `ch`.
    ///
    /// The character is converted to its code point once, and each of the
    /// three classification predicates is evaluated against that value.
    ///
    /// # Arguments
    ///
    /// * `ch` - The character to classify
    pub fn new(ch: char) -> Self {
        let code_point = u32::from(ch);
        let is_combining = Self::is_combining_mark(code_point);
        let is_format = Self::is_format_character(code_point);
        let is_prepend = Self::is_prepend_character(code_point);

        Self {
            code_point,
            is_combining,
            is_format,
            is_prepend,
        }
    }

    /// Report whether `code` is a combining mark.
    ///
    /// A code point counts as combining when it is contained in any of the
    /// `combining_marks` tables: `DIACRITICAL`, `DIACRITICAL_EXTENDED`,
    /// `DIACRITICAL_SUPPLEMENT`, `DIACRITICAL_SYMBOLS`, or `HALF_MARKS`.
    ///
    /// # Arguments
    ///
    /// * `code` - The Unicode code point to check
    ///
    /// # Returns
    ///
    /// `true` if the code point is in one of those tables, `false` otherwise
    fn is_combining_mark(code: u32) -> bool {
        if combining_marks::DIACRITICAL.contains(&code) {
            return true;
        }
        if combining_marks::DIACRITICAL_EXTENDED.contains(&code) {
            return true;
        }
        if combining_marks::DIACRITICAL_SUPPLEMENT.contains(&code) {
            return true;
        }
        if combining_marks::DIACRITICAL_SYMBOLS.contains(&code) {
            return true;
        }
        combining_marks::HALF_MARKS.contains(&code)
    }

    /// Report whether `code` is a format character.
    ///
    /// Format characters are invisible characters that influence formatting
    /// or layout. The accepted set is: the soft hyphen, the span from
    /// `ZERO_WIDTH_SPACE` through `RIGHT_TO_LEFT_MARK`, U+202A..=U+202E,
    /// U+2060..=U+206F, the zero-width no-break space, and U+FFF9..=U+FFFB.
    ///
    /// # Arguments
    ///
    /// * `code` - The Unicode code point to check
    ///
    /// # Returns
    ///
    /// `true` if the code point is in the set above, `false` otherwise
    fn is_format_character(code: u32) -> bool {
        let zero_width_through_rtl_mark =
            format_chars::ZERO_WIDTH_SPACE..=format_chars::RIGHT_TO_LEFT_MARK;
        // Bidirectional formatting
        let bidi_formatting = 0x202A_u32..=0x202E;
        // Various format characters
        let misc_format = 0x2060_u32..=0x206F;
        // Interlinear annotation characters
        let interlinear_annotation = 0xFFF9_u32..=0xFFFB;

        code == format_chars::SOFT_HYPHEN
            || code == format_chars::ZERO_WIDTH_NO_BREAK_SPACE
            || zero_width_through_rtl_mark.contains(&code)
            || bidi_formatting.contains(&code)
            || misc_format.contains(&code)
            || interlinear_annotation.contains(&code)
    }

    /// Report whether `code` is a prepend character.
    ///
    /// Prepend characters are treated as belonging to the grapheme cluster
    /// that follows them. A code point counts as prepend when it is contained
    /// in any of the `prepend_chars` tables: `ARABIC_PREPEND`,
    /// `ARABIC_DIACRITICS`, or `ARABIC_SUPPLEMENT`.
    ///
    /// # Arguments
    ///
    /// * `code` - The Unicode code point to check
    ///
    /// # Returns
    ///
    /// `true` if the code point is in one of those tables, `false` otherwise
    fn is_prepend_character(code: u32) -> bool {
        if prepend_chars::ARABIC_PREPEND.contains(&code) {
            return true;
        }
        if prepend_chars::ARABIC_DIACRITICS.contains(&code) {
            return true;
        }
        prepend_chars::ARABIC_SUPPLEMENT.contains(&code)
    }
}

/// Report whether `ch` belongs to the Halfwidth and Fullwidth Forms block.
///
/// Membership is decided by looking the character's code point up in
/// `halfwidth_fullwidth::BLOCK` (the Unicode block U+FF00-U+FFEF).
/// Characters in this block have specific width properties and may
/// contribute additional width to grapheme clusters.
///
/// # Arguments
///
/// * `ch` - The character to check
///
/// # Returns
///
/// `true` if the character's code point is contained in the block
pub fn is_halfwidth_fullwidth(ch: char) -> bool {
    let code_point = u32::from(ch);
    halfwidth_fullwidth::BLOCK.contains(&code_point)
}

/// Locate the character that determines the width of a grapheme cluster.
///
/// Characters are examined in order. The first one that is not a prepend
/// character, a combining mark, or a format character is taken as the base
/// character and returned. If every character in the segment falls into one
/// of those three categories, the first character of the segment is
/// returned instead.
///
/// # Arguments
///
/// * `segment` - The grapheme cluster to analyze
///
/// # Returns
///
/// `Some(CharacterInfo)` for the base character (or for the first character
/// when no base character exists); `None` only when `segment` is empty
pub fn find_main_character_optimized(segment: &str) -> Option<CharacterInfo> {
    // Remembered so the fallback does not need a second pass over the segment.
    let mut first_seen: Option<CharacterInfo> = None;

    for ch in segment.chars() {
        let info = CharacterInfo::new(ch);
        let is_modifier = info.is_prepend || info.is_combining || info.is_format;
        if !is_modifier {
            return Some(info);
        }
        if first_seen.is_none() {
            first_seen = Some(info);
        }
    }

    first_seen
}