reifydb_type/util/unicode.rs
1// Copyright (c) reifydb.com 2025
2// This file is licensed under the AGPL-3.0-or-later, see license.md file
3
4//! Simple unicode width calculation implementation
5
/// Display-width measurement for string-like types.
pub trait UnicodeWidthStr {
    /// Reports how wide the value is when displayed, as a `usize`.
    fn width(&self) -> usize;
}
11
12impl UnicodeWidthStr for str {
13 fn width(&self) -> usize {
14 self.chars().map(char_width).sum()
15 }
16}
17
18impl UnicodeWidthStr for &str {
19 fn width(&self) -> usize {
20 self.chars().map(char_width).sum()
21 }
22}
23
/// Calculate the display width of a single character.
///
/// This is a range-table approximation, not a full implementation of the
/// Unicode East Asian Width data. It returns:
///
/// * `0` for control characters, zero-width / directional formatting
///   characters, combining marks and variation selectors;
/// * `2` for the CJK, Hangul, fullwidth, emoji and symbol ranges listed in
///   the body;
/// * `1` for every other character.
///
/// Regional Indicator symbols (U+1F1E6..=U+1F1FF) are not covered by any
/// range below and therefore count as `1` each.
fn char_width(ch: char) -> usize {
    match ch {
        // Control characters have no width (includes \n, \r, \t)
        '\x00'..='\x1F' | '\x7F'..='\u{9F}' => 0,

        // Most CJK characters are double-width
        '\u{1100}'..='\u{115F}'   // Hangul Jamo (leading consonants)
        | '\u{2E80}'..='\u{2EFF}' // CJK Radicals Supplement
        | '\u{2F00}'..='\u{2FDF}' // Kangxi Radicals
        | '\u{3000}'..='\u{303F}' // CJK Symbols and Punctuation
        | '\u{3040}'..='\u{309F}' // Hiragana
        | '\u{30A0}'..='\u{30FF}' // Katakana
        | '\u{3100}'..='\u{312F}' // Bopomofo
        | '\u{3130}'..='\u{318F}' // Hangul Compatibility Jamo
        | '\u{31A0}'..='\u{31BF}' // Bopomofo Extended
        | '\u{31F0}'..='\u{31FF}' // Katakana Phonetic Extensions
        | '\u{3200}'..='\u{32FF}' // Enclosed CJK Letters and Months
        | '\u{3300}'..='\u{33FF}' // CJK Compatibility
        | '\u{3400}'..='\u{4DBF}' // CJK Unified Ideographs Extension A
        | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
        | '\u{A000}'..='\u{A48F}' // Yi Syllables
        | '\u{A490}'..='\u{A4CF}' // Yi Radicals
        | '\u{AC00}'..='\u{D7AF}' // Hangul Syllables
        | '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
        | '\u{FE30}'..='\u{FE4F}' // CJK Compatibility Forms
        | '\u{FF00}'..='\u{FF60}' // Fullwidth Forms (part)
        | '\u{FFE0}'..='\u{FFE6}' // Fullwidth Forms (part)
        | '\u{20000}'..='\u{2FFFD}' // Supplementary Ideographic Plane
        | '\u{30000}'..='\u{3FFFD}' => 2, // Tertiary Ideographic Plane (wide by default, like Plane 2)

        // Emoji and symbols (generally double-width)
        '\u{1F300}'..='\u{1F6FF}' // Pictographs, Emoticons, Transport and Map Symbols
        | '\u{1F700}'..='\u{1F77F}' // Alchemical Symbols
        | '\u{1F780}'..='\u{1F7FF}' // Geometric Shapes Extended
        | '\u{1F800}'..='\u{1F8FF}' // Supplemental Arrows-C
        | '\u{1F900}'..='\u{1F9FF}' // Supplemental Symbols and Pictographs
        | '\u{1FA00}'..='\u{1FA6F}' // Chess Symbols
        | '\u{1FA70}'..='\u{1FAFF}' => 2, // Symbols and Pictographs Extended-A

        // Zero-width characters
        '\u{200B}'..='\u{200F}'   // Zero width space, joiners, etc.
        | '\u{2028}'..='\u{202E}' // Line/paragraph separators, directional formatting
        | '\u{2060}'..='\u{206F}' // Word joiner, invisible operators, etc.
        | '\u{FEFF}' => 0, // Zero width no-break space (byte order mark)

        // Combining marks (zero width)
        '\u{0300}'..='\u{036F}'   // Combining Diacritical Marks
        | '\u{1AB0}'..='\u{1AFF}' // Combining Diacritical Marks Extended
        | '\u{1DC0}'..='\u{1DFF}' // Combining Diacritical Marks Supplement
        | '\u{20D0}'..='\u{20FF}' // Combining Diacritical Marks for Symbols
        | '\u{FE20}'..='\u{FE2F}' => 0, // Combining Half Marks

        // Variation selectors (zero width)
        '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}' => 0,

        // Some specific double-width symbols
        '\u{2600}'..='\u{27BF}' => 2, // Miscellaneous Symbols, Dingbats

        // Default: single width
        _ => 1,
    }
}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks every `(input, expected width)` pair in `cases`.
    fn check(cases: &[(&str, usize)]) {
        for &(text, expected) in cases {
            assert_eq!(text.width(), expected);
        }
    }

    #[test]
    fn test_ascii() {
        check(&[("Hello", 5), ("Hello, World!", 13), ("", 0)]);
    }

    #[test]
    fn test_cjk() {
        check(&[
            ("你好", 4),        // two Chinese characters
            ("こんにちは", 10), // five Japanese characters
            ("안녕하세요", 10), // five Korean characters
        ]);
    }

    #[test]
    fn test_mixed() {
        // "Hello " contributes 6, the two CJK characters contribute 4.
        check(&[("Hello 世界", 10)]);
    }

    #[test]
    fn test_control_chars() {
        check(&[
            ("\x00\x01\x02", 0),
            ("Hello\nWorld", 10), // the newline adds nothing
            ("Hello\tWorld", 10), // the tab adds nothing
        ]);
    }

    #[test]
    fn test_combining_marks() {
        check(&[
            ("e\u{0301}", 1), // e + combining acute accent
            ("a\u{0303}", 1), // a + combining tilde
        ]);
    }

    #[test]
    fn test_emoji() {
        check(&[
            ("🚀", 2), // rocket
            ("😀", 2), // smiley face
            ("🎉", 2), // party popper
            // "Unicode: " (9) + 🚀 (2) + " ñ é ü" (6) = 17
            ("Unicode: 🚀 ñ é ü", 17),
        ]);
    }
}