unicode_display_width/
lib.rs

1#![doc = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/README.md"))]
2
3mod code_point_ranges;
4
5use code_point_ranges::{CodePointRange, ASCII_TABLE, DOUBLEWIDE_TABLE};
6use unicode_segmentation::UnicodeSegmentation;
7
8/// Check if char `c` is in array of code point ranges using binary search.
9///
10/// This operation is theoretically faster with the range-set data structure.  See <https://github.com/CarlKCarlK/range-set-blaze>.  In the interest of fewer dependencies and simpler code, this optimization has not been implemented.
11fn in_table(arr: &[CodePointRange], c: char) -> bool {
12    let c = c as u32;
13    arr.binary_search_by(|range| {
14        if range.contains(&c) {
15            std::cmp::Ordering::Equal
16        } else {
17            range.start().cmp(&c)
18        }
19    })
20    .is_ok()
21}
22
23/// Check if the char `c` has double width.
24///
25/// ## Examples
26///
27/// ```rust
28/// use unicode_display_width::is_double_width;
29///
30/// // assert_eq!(is_double_width('🛡'), false);
31/// assert_eq!(is_double_width('✅'), true);
32/// ```
33pub fn is_double_width(c: char) -> bool {
34    // Since ASCII characters are so much more common in English text, check these first
35    if in_table(ASCII_TABLE, c) {
36        return false;
37    }
38
39    if in_table(DOUBLEWIDE_TABLE, c) {
40        return true;
41    }
42
43    false
44}
45
46/// Get the number of columns required to display the grapheme cluster in a monospace font
47///
48/// Returns either `1` or `2`
49fn get_grapheme_width(grapheme_cluster: &str) -> u64 {
50    for scalar_value in grapheme_cluster.chars() {
51        // emoji style variation selector
52        if scalar_value == '\u{FE0F}' {
53            return 2;
54        }
55
56        if is_double_width(scalar_value) {
57            return 2;
58        }
59    }
60
61    1
62}
63
64/// Return the number of columns required to display the `text` string in a monospace font as a sequence of extended grapheme clusters.  
65///
66/// Overflow is not realistically possible in this function with `u64` since each operation takes ~20 nanoseconds to complete (~500 years of continuous operation to overflow).  In terms of memory, an 18 exabyte string would need to be parsed to overflow.
67///
68/// ## Examples
69///
70/// ```rust
71/// use unicode_display_width::width;
72///
73/// assert_eq!(width("👨‍👩‍👧‍👧"), 2);
74/// assert_eq!(width("слава україні"), 13); // Glory to Ukraine in Ukrainian
75/// assert_eq!(width("ݓ΅ɓԶѥƘҕ࠹ɇঐԢظٰ"), 12); // randomly generated Unicode
76/// ```
77pub fn width(text: &str) -> u64 {
78    text.graphemes(true).fold(0, |acc, grapheme_cluster| {
79        acc + (get_grapheme_width(grapheme_cluster))
80    })
81}
82
// Unit tests for the width calculations.  Every test is parameterised with the
// `test_case` attribute macro: each attribute supplies the arguments for one run
// and, after the `;`, an optional name for that case.
#[cfg(test)]
mod test {
    use super::*;
    use test_case::test_case;

    // Checks `is_double_width` on individual chars: `wide` is the expected answer for `text`.
    #[test_case('🛡', false)]
    #[test_case('✅', true)]
    fn test_width(text: char, wide: bool) {
        assert_eq!(is_double_width(text), wide);
    }

    // Checks `width` on whole strings: `length` is the expected number of columns for `text`.
    #[test_case("🛡", 1; "length 1 grapheme")]
    #[test_case("\u{2764}", 1; "Heavy Black Heart emoji")]
    #[test_case("\u{2764}\u{FE0F}", 2; "Heavy Black Heart emoji with emoji style variation selector in Hex representation")]
    #[test_case("❤️", 2; "Heavy Black Heart emoji with emoji style variation selector")] // VS Code doesn't seem to support variation selectors
    #[test_case("✅", 2; "length 2 grapheme")]
    #[test_case("👨‍👩‍👧‍👧", 2; "grapheme composed of multiple emojis, at least one of which is length 2")]
    #[test_case("test test", 9; "ASCII text")]
    #[test_case("🗡", 1; "single width because it may be paired with the shield which is also a length 1 code point")]
    #[test_case("🔥🗡🍩👩🏻‍🚀⏰💃🏼🔦👍🏻", 15; "U+1F608")] // 😈
    #[test_case("слава україні", 13; "Glory to Ukraine in Ukrainian")]
    #[test_case("슬라바 우크라이나", 17; "Glory to Ukraine in Korean")]
    #[test_case("Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ", 9; "corrupted text")]
    fn test_string_width(text: &str, length: u64) {
        assert_eq!(width(text), length);
    }

    // The results of Indic script text may not be useful
    //
    // These cases pin what `width` currently returns for Indic sequences that contain a
    // zero width joiner (U+200D) or zero width non-joiner (U+200C).
    #[test_case("ണ്‍", 1; "Indic text with zero width joiner")]
    #[test_case("ന്‍", 1; "Indic text with zero width joiner 2")]
    #[test_case("ര്‍", 1; "Indic text with zero width joiner 3")]
    #[test_case(
        "\u{0924}\u{094D}\u{200D}\u{0928}",
        2;
        "Half letter form" // https://www.unicode.org/faq/indic.html
    )]
    #[test_case(
        "\u{0924}\u{094D}\u{200C}\u{0928}",
        2;
        "Single glyph form" // https://www.unicode.org/faq/indic.html
    )]
    fn indic_script(text: &str, length: u64) {
        assert_eq!(width(text), length);
    }
}