1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#![doc = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/README.md"))]

mod code_point_ranges;

use code_point_ranges::{CodePointRange, ASCII_TABLE, DOUBLEWIDE_TABLE};
use unicode_segmentation::UnicodeSegmentation;

/// Check if char `c` is in array of code point ranges using binary search.
///
/// This operation is theoretically faster with the range-set data structure.  See <https://github.com/CarlKCarlK/range-set-blaze>.  In the interest of fewer dependencies and simpler code, this optimization has not been implemented.
fn in_table(arr: &[CodePointRange], c: char) -> bool {
    let c = c as u32;
    arr.binary_search_by(|range| {
        if range.contains(&c) {
            std::cmp::Ordering::Equal
        } else {
            range.start().cmp(&c)
        }
    })
    .is_ok()
}

/// Check if the char `c` has double width.
///
/// ## Examples
///
/// ```rust
/// use unicode_display_width::is_double_width;
///
/// // assert_eq!(is_double_width('🛡'), false);
/// assert_eq!(is_double_width('✅'), true);
/// ```
pub fn is_double_width(c: char) -> bool {
    // Since ASCII characters are so much more common in English text, check these first
    if in_table(ASCII_TABLE, c) {
        return false;
    }

    if in_table(DOUBLEWIDE_TABLE, c) {
        return true;
    }

    false
}

/// Get the number of columns required to display the grapheme cluster in a monospace font
///
/// Returns either `1` or `2`
fn get_grapheme_width(grapheme_cluster: &str) -> u64 {
    for scalar_value in grapheme_cluster.chars() {
        // emoji style variation selector
        if scalar_value == '\u{FE0F}' {
            return 2;
        }

        if is_double_width(scalar_value) {
            return 2;
        }
    }

    1
}

/// Return the number of columns required to display the `text` string in a monospace font as a sequence of extended grapheme clusters.  
///
/// Overflow is not realistically possible in this function with `u64` since each operation takes ~20 nanoseconds to complete (~500 years of continuous operation to overflow).  In terms of memory, an 18 exabyte string would need to be parsed to overflow.
///
/// ## Examples
///
/// ```rust
/// use unicode_display_width::width;
///
/// assert_eq!(width("👨‍👩‍👧‍👧"), 2);
/// assert_eq!(width("слава україні"), 13); // Glory to Ukraine in Ukrainian
/// assert_eq!(width("ݓ΅ɓԶѥƘҕ࠹ɇঐԢظٰ"), 12); // randomly generated Unicode
/// ```
pub fn width(text: &str) -> u64 {
    text.graphemes(true).fold(0, |acc, grapheme_cluster| {
        acc + (get_grapheme_width(grapheme_cluster))
    })
}

#[cfg(test)]
mod test {
    use super::*;
    use test_case::test_case;

    #[test_case('🛡', false)]
    #[test_case('✅', true)]
    fn test_width(text: char, wide: bool) {
        assert_eq!(is_double_width(text), wide);
    }

    #[test_case("🛡", 1; "length 1 grapheme")]
    #[test_case("\u{2764}", 1; "Heavy Black Heart emoji")]
    #[test_case("\u{2764}\u{FE0F}", 2; "Heavy Black Heart emoji with emoji style variation selector in Hex representation")]
    #[test_case("❤️", 2; "Heavy Black Heart emoji with emoji style variation selector")] // VS Code doesn't seem to support variation selectors
    #[test_case("✅", 2; "length 2 grapheme")]
    #[test_case("👨‍👩‍👧‍👧", 2; "grapheme composed of multiple emojis, at least one of which is length 2")]
    #[test_case("test test", 9; "ASCII text")]
    #[test_case("🗡", 1; "single width because it may be paired with the shield which is also a length 1 code point")]
    #[test_case("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻", 15; "U+1F608")] // 😈
    #[test_case("слава україні", 13; "Glory to Ukraine in Ukrainian")]
    #[test_case("슬라바 우크라이나", 17; "Glory to Ukraine in Korean")]
    #[test_case("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ", 9; "corrupted text")]
    fn test_string_width(text: &str, length: u64) {
        assert_eq!(width(text), length);
    }

    // The results of Indic script text may not be useful
    #[test_case("ണ്‍", 1; "Indic text with zero width joiner")]
    #[test_case("ന്‍", 1; "Indic text with zero width joiner 2")]
    #[test_case("ര്‍", 1; "Indic text with zero width joiner 3")]
    #[test_case(
        "\u{0924}\u{094D}\u{200D}\u{0928}",
        2;
        "Half letter form" // https://www.unicode.org/faq/indic.html
    )]
    #[test_case(
        "\u{0924}\u{094D}\u{200C}\u{0928}",
        2;
        "Single glyph form" // https://www.unicode.org/faq/indic.html
    )]
    fn indic_script(text: &str, length: u64) {
        assert_eq!(width(text), length);
    }
}