string-width 0.1.0

Accurate Unicode string width calculation for terminal applications, handling emoji, East Asian characters, combining marks, and ANSI escape sequences
Documentation
//! Emoji detection and classification for string width calculation.
//!
//! This module handles the complex logic of determining whether
//! a grapheme cluster represents an emoji that should be displayed
//! with width 2.
use crate::unicode_constants::{
    emoji_ranges, emoji_with_vs16_ranges, keycap, regional_indicators, variation_selectors,
};

/// Newtype over a raw Unicode code point.
///
/// A `char` is converted once into this wrapper and then compared
/// numerically against the `u32` constants and ranges used by this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Codepoint(u32);

impl From<char> for Codepoint {
    /// Wraps the scalar value of `ch`.
    fn from(ch: char) -> Self {
        Codepoint(u32::from(ch))
    }
}

impl Codepoint {
    /// Returns the wrapped code point as a plain `u32`.
    fn value(self) -> u32 {
        let Codepoint(raw) = self;
        raw
    }

    /// Reports whether this code point lies inside `range`.
    ///
    /// # Arguments
    ///
    /// * `range` - Inclusive range of code points to test against
    ///
    /// # Returns
    ///
    /// `true` when `range` contains the code point, `false` otherwise
    fn is_in_range(self, range: &std::ops::RangeInclusive<u32>) -> bool {
        range.contains(&self.value())
    }
}

/// Decides whether a grapheme cluster should be treated as a width-2 emoji.
///
/// The checks run in this order:
///
/// 1. If the second character is a variation selector (VS15 for text,
///    VS16 for emoji), the selector handling alone decides the answer.
/// 2. Otherwise the segment is an emoji when it is a keycap sequence,
///    starts with a regional indicator pair (flag), or contains any
///    character with default emoji presentation.
///
/// An empty segment is never an emoji.
///
/// # Examples
///
/// ```
/// use string_width::string_width;
///
/// assert_eq!(string_width("😀"), 2); // default emoji presentation
/// assert_eq!(string_width("1\u{FE0F}\u{20E3}"), 2); // keycap sequence
/// assert_eq!(string_width("🇺🇸"), 2); // flag (regional indicator pair)
/// assert_eq!(string_width("\u{A9}\u{FE0F}"), 2); // copyright sign + VS16
/// assert_eq!(string_width("\u{A9}\u{FE0E}"), 1); // copyright sign + VS15 (text)
/// ```
pub fn is_rgi_emoji(segment: &str) -> bool {
    if segment.is_empty() {
        return false;
    }

    // The helpers pattern-match on slices, so materialise the characters once.
    let chars: Vec<char> = segment.chars().collect();

    // A variation selector verdict is final; the remaining sequence checks
    // are only consulted when no selector was found.
    handle_variation_selector_sequences(&chars).unwrap_or_else(|| {
        is_keycap_emoji_sequence(&chars)
            || is_regional_indicator_pair(&chars)
            || has_default_emoji_presentation_any(&chars)
    })
}

/// Resolves sequences whose second character is a variation selector.
///
/// Only the character directly after the base is inspected. The text
/// presentation selector (VS15) forces a non-emoji verdict; the emoji
/// presentation selector (VS16) defers to
/// `handle_emoji_presentation_sequence`.
///
/// # Arguments
///
/// * `chars` - The character sequence to analyze
///
/// # Returns
///
/// `Some(verdict)` when the second character is VS15 or VS16, `None` when
/// the sequence is shorter than two characters or carries no selector there
fn handle_variation_selector_sequences(chars: &[char]) -> Option<bool> {
    // Fewer than two characters means there is no selector slot at all.
    let selector = Codepoint::from(*chars.get(1)?).value();

    if selector == variation_selectors::TEXT_PRESENTATION {
        Some(false)
    } else if selector == variation_selectors::EMOJI_PRESENTATION {
        Some(handle_emoji_presentation_sequence(chars))
    } else {
        None
    }
}

/// Handle emoji presentation sequences (VS16)
///
/// Processes sequences that contain the emoji presentation selector (VS16)
/// to determine if they should be treated as emoji.
///
/// # Arguments
///
/// * `chars` - The character sequence containing VS16
///
/// # Returns
///
/// `true` if the sequence should be treated as emoji, `false` otherwise
fn handle_emoji_presentation_sequence(chars: &[char]) -> bool {
    // Check for keycap sequences with VS16
    if is_keycap_sequence_with_vs16(chars) {
        return true;
    }

    // For keycap base characters (0-9, *, #), VS16 alone doesn't make them emoji
    // They need the full keycap sequence (base + VS16 + U+20E3)
    if is_keycap_base_character(chars[0]) {
        return false;
    }

    // If we have a sequence that looks like "base + VS16 + keycap" but the base
    // is not a valid keycap base, treat the entire sequence as non-emoji
    if chars.len() >= 3 && Codepoint::from(chars[2]).value() == keycap::COMBINING_ENCLOSING {
        return false;
    }

    // Check if base character becomes emoji with VS16
    is_emoji_with_vs16(chars[0])
}

/// Reports whether the sequence starts with a keycap emoji.
///
/// Both spellings are accepted: keycap base + VS16 + U+20E3, and the
/// shorter keycap base + U+20E3 without the selector.
///
/// # Arguments
///
/// * `chars` - The character sequence to check
///
/// # Returns
///
/// `true` if either keycap form matches, `false` otherwise
fn is_keycap_emoji_sequence(chars: &[char]) -> bool {
    // Try the fully qualified form first, then the selector-less one.
    if is_keycap_sequence_with_vs16(chars) {
        return true;
    }
    is_keycap_sequence_without_vs16(chars)
}

/// Matches the selector-less keycap form: keycap base + U+20E3.
///
/// Only the first two characters are examined; anything after them is
/// ignored.
///
/// # Arguments
///
/// * `chars` - The character sequence to check
///
/// # Returns
///
/// `true` if the second character is the combining enclosing keycap and
/// the first is a valid keycap base, `false` otherwise
fn is_keycap_sequence_without_vs16(chars: &[char]) -> bool {
    match chars {
        [base, enclosing, ..] => {
            Codepoint::from(*enclosing).value() == keycap::COMBINING_ENCLOSING
                && is_keycap_base_character(*base)
        }
        // Fewer than two characters cannot form the sequence.
        _ => false,
    }
}

/// Matches the fully qualified keycap form: keycap base + VS16 + U+20E3.
///
/// Only the first three characters are examined; anything after them is
/// ignored.
///
/// # Arguments
///
/// * `chars` - The character sequence to check
///
/// # Returns
///
/// `true` if the sequence starts with keycap base, emoji presentation
/// selector and combining enclosing keycap, in that order; `false` otherwise
fn is_keycap_sequence_with_vs16(chars: &[char]) -> bool {
    if let [base, selector, enclosing, ..] = chars {
        Codepoint::from(*selector).value() == variation_selectors::EMOJI_PRESENTATION
            && Codepoint::from(*enclosing).value() == keycap::COMBINING_ENCLOSING
            && is_keycap_base_character(*base)
    } else {
        // Fewer than three characters cannot form the sequence.
        false
    }
}

/// Reports whether `ch` may start a keycap sequence.
///
/// The accepted bases are the digits between `keycap::DIGIT_ZERO` and
/// `keycap::DIGIT_NINE` (inclusive), `keycap::ASTERISK` and
/// `keycap::NUMBER_SIGN`.
///
/// # Arguments
///
/// * `ch` - The character to check
///
/// # Returns
///
/// `true` if the character can be used as a keycap base, `false` otherwise
fn is_keycap_base_character(ch: char) -> bool {
    let code = u32::from(ch);
    let is_digit = (keycap::DIGIT_ZERO..=keycap::DIGIT_NINE).contains(&code);

    is_digit || code == keycap::ASTERISK || code == keycap::NUMBER_SIGN
}

/// Reports whether the sequence starts with two regional indicators (a flag).
///
/// Only the first two characters are examined, and both must fall inside
/// `regional_indicators::START..=regional_indicators::END`. Characters
/// after the pair are ignored.
///
/// # Arguments
///
/// * `chars` - The character sequence to check
///
/// # Returns
///
/// `true` if the first two characters are both regional indicators,
/// `false` otherwise (including when fewer than two characters are given)
fn is_regional_indicator_pair(chars: &[char]) -> bool {
    let indicators = regional_indicators::START..=regional_indicators::END;

    chars.get(..2).map_or(false, |pair| {
        pair.iter()
            .all(|&ch| Codepoint::from(ch).is_in_range(&indicators))
    })
}

/// Check if any character in the sequence has default emoji presentation
///
/// Some characters have default emoji presentation, meaning they are
/// displayed as emoji without needing a variation selector.
///
/// # Arguments
///
/// * `chars` - The character sequence to check
///
/// # Returns
///
/// `true` if any character has default emoji presentation, `false` otherwise
fn has_default_emoji_presentation_any(chars: &[char]) -> bool {
    chars.iter().any(|&ch| has_default_emoji_presentation(ch))
}

/// Reports whether `ch` is displayed as emoji (width 2) without a selector.
///
/// The character qualifies when its code point falls inside any of the
/// ranges listed below from `emoji_ranges`. Characters outside those
/// ranges (like U+2600-U+26FF) need VS16 to be treated as emoji.
///
/// # Arguments
///
/// * `ch` - The character to check
///
/// # Returns
///
/// `true` if the character has default emoji presentation, `false` otherwise
pub fn has_default_emoji_presentation(ch: char) -> bool {
    let code = u32::from(ch);

    // Checked in order; the scan stops at the first range containing `code`.
    [
        &emoji_ranges::EMOTICONS,
        &emoji_ranges::MISC_SYMBOLS_PICTOGRAPHS,
        &emoji_ranges::TRANSPORT_MAP,
        &emoji_ranges::ALCHEMICAL,
        &emoji_ranges::GEOMETRIC_EXTENDED,
        &emoji_ranges::SUPPLEMENTAL_ARROWS_C,
        &emoji_ranges::SUPPLEMENTAL_SYMBOLS,
        &emoji_ranges::CHESS,
        &emoji_ranges::SYMBOLS_EXTENDED_A,
    ]
    .iter()
    .any(|block| block.contains(&code))
}

/// Reports whether `ch` is promoted to emoji when followed by VS16.
///
/// The character qualifies when its code point falls inside one of the
/// `emoji_with_vs16_ranges` ranges listed below, or appears in
/// `emoji_with_vs16_ranges::INDIVIDUAL_CHARS`.
///
/// # Arguments
///
/// * `ch` - The character to check
///
/// # Returns
///
/// `true` if the character becomes emoji with VS16, `false` otherwise
pub fn is_emoji_with_vs16(ch: char) -> bool {
    let code = Codepoint::from(ch);

    // Ranges are tried in order; the scan stops at the first hit.
    let in_listed_range = [
        &emoji_with_vs16_ranges::COPYRIGHT_REGISTERED,
        &emoji_with_vs16_ranges::EXCLAMATION_MARKS,
        &emoji_with_vs16_ranges::ARROWS,
        &emoji_with_vs16_ranges::RETURN_ARROWS,
        &emoji_with_vs16_ranges::WATCH_HOURGLASS,
        &emoji_with_vs16_ranges::MEDIA_CONTROLS,
        &emoji_with_vs16_ranges::SMALL_SQUARES,
        &emoji_with_vs16_ranges::WEATHER_BASIC,
        &emoji_with_vs16_ranges::UMBRELLA_COFFEE,
        &emoji_with_vs16_ranges::HAZARD_SYMBOLS,
        &emoji_with_vs16_ranges::PEACE_YIN_YANG,
        &emoji_with_vs16_ranges::DHARMA_SMILE,
        &emoji_with_vs16_ranges::ZODIAC,
        &emoji_with_vs16_ranges::CARD_SUITS_1,
        &emoji_with_vs16_ranges::TOOLS_SCIENCE,
        &emoji_with_vs16_ranges::ATOM_FLEUR,
        &emoji_with_vs16_ranges::WARNING_ZAP,
        &emoji_with_vs16_ranges::CIRCLES,
        &emoji_with_vs16_ranges::FUNERAL,
        &emoji_with_vs16_ranges::SPORTS_BALLS,
        &emoji_with_vs16_ranges::WEATHER_EXTENDED,
        &emoji_with_vs16_ranges::OPHIUCHUS_PICK,
        &emoji_with_vs16_ranges::CHAINS_NO_ENTRY,
        &emoji_with_vs16_ranges::RELIGIOUS_BUILDINGS,
        &emoji_with_vs16_ranges::MOUNTAIN_SAILBOAT,
        &emoji_with_vs16_ranges::SKIER_TENT,
        &emoji_with_vs16_ranges::AIRPLANE_PENCIL,
        &emoji_with_vs16_ranges::ASTERISK_VARIANTS,
        &emoji_with_vs16_ranges::QUESTION_MARKS,
        &emoji_with_vs16_ranges::HEARTS,
        &emoji_with_vs16_ranges::PLUS_MINUS,
        &emoji_with_vs16_ranges::CURVED_ARROWS,
        &emoji_with_vs16_ranges::BASIC_ARROWS,
        &emoji_with_vs16_ranges::LARGE_SQUARES,
    ]
    .iter()
    .any(|range| code.is_in_range(range));

    // Code points that are not covered by any range are listed one by one.
    in_listed_range || emoji_with_vs16_ranges::INDIVIDUAL_CHARS.contains(&code.value())
}