Skip to main content

reifydb_type/util/
unicode.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2025 ReifyDB
3
/// Measures how many display columns a piece of text occupies.
///
/// Width is counted in columns rather than bytes or `char`s, so
/// implementors may report zero for invisible characters and two for
/// wide ones.
pub trait UnicodeWidthStr {
	/// Returns the total display width of `self`, in columns.
	fn width(&self) -> usize;
}
9
10impl UnicodeWidthStr for str {
11	fn width(&self) -> usize {
12		self.chars().map(char_width).sum()
13	}
14}
15
16impl UnicodeWidthStr for &str {
17	fn width(&self) -> usize {
18		self.chars().map(char_width).sum()
19	}
20}
21
/// Calculate the display width (in columns) of a single character.
///
/// This is a hand-maintained approximation based on Unicode block ranges,
/// not a complete East Asian Width table:
///
/// - `0` for C0/C1 control characters (so `\n`, `\r` and `\t` contribute
///   nothing), zero-width/format characters, combining marks and variation
///   selectors;
/// - `2` for CJK, Hangul, fullwidth forms, emoji and pictographic symbol
///   blocks;
/// - `1` for everything else.
fn char_width(ch: char) -> usize {
	match ch {
		// Control characters have no width (includes \n, \r, \t)
		'\x00'..='\x1F' | '\x7F'..='\u{9F}' => 0,

		// Zero-width and invisible format characters
		'\u{200B}'..='\u{200F}' | // Zero width space, joiners, directional marks
		'\u{2028}'..='\u{202E}' | // Line/paragraph separators, directional formatting
		'\u{2060}'..='\u{206F}' | // Word joiner, invisible operators, bidi isolates
		'\u{FEFF}' => 0, // Zero width no-break space (byte order mark)

		// Combining marks (zero width: they stack onto the preceding character)
		'\u{0300}'..='\u{036F}' | // Combining Diacritical Marks
		'\u{1AB0}'..='\u{1AFF}' | // Combining Diacritical Marks Extended
		'\u{1DC0}'..='\u{1DFF}' | // Combining Diacritical Marks Supplement
		'\u{20D0}'..='\u{20FF}' | // Combining Diacritical Marks for Symbols (e.g. keycap U+20E3)
		'\u{FE20}'..='\u{FE2F}' => 0, // Combining Half Marks

		// Variation selectors (zero width: they only pick a glyph variant)
		'\u{FE00}'..='\u{FE0F}' | // Variation Selectors
		'\u{E0100}'..='\u{E01EF}' => 0, // Variation Selectors Supplement

		// Most CJK characters are double-width
		'\u{1100}'..='\u{115F}' | // Hangul Jamo
		'\u{2E80}'..='\u{2EFF}' | // CJK Radicals Supplement
		'\u{2F00}'..='\u{2FDF}' | // Kangxi Radicals
		'\u{3000}'..='\u{303F}' | // CJK Symbols and Punctuation
		'\u{3040}'..='\u{309F}' | // Hiragana
		'\u{30A0}'..='\u{30FF}' | // Katakana
		'\u{3100}'..='\u{312F}' | // Bopomofo
		'\u{3130}'..='\u{318F}' | // Hangul Compatibility Jamo
		'\u{31A0}'..='\u{31BF}' | // Bopomofo Extended
		'\u{31F0}'..='\u{31FF}' | // Katakana Phonetic Extensions
		'\u{3200}'..='\u{32FF}' | // Enclosed CJK Letters and Months
		'\u{3300}'..='\u{33FF}' | // CJK Compatibility
		'\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A
		'\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs
		'\u{A000}'..='\u{A48F}' | // Yi Syllables
		'\u{A490}'..='\u{A4CF}' | // Yi Radicals
		'\u{A960}'..='\u{A97F}' | // Hangul Jamo Extended-A
		'\u{AC00}'..='\u{D7AF}' | // Hangul Syllables
		'\u{F900}'..='\u{FAFF}' | // CJK Compatibility Ideographs
		'\u{FE30}'..='\u{FE4F}' | // CJK Compatibility Forms
		'\u{FF00}'..='\u{FF60}' | // Fullwidth Forms (part)
		'\u{FFE0}'..='\u{FFE6}' | // Fullwidth Forms (part)
		'\u{20000}'..='\u{2FFFD}' | // Supplementary Ideographic Plane
		'\u{30000}'..='\u{3FFFD}' => 2, // Tertiary Ideographic Plane

		// Emoji and symbols (generally double-width).
		// Regional Indicators (U+1F1E6-1F1FF) are NOT in these ranges: each one
		// falls through to the default width of 1, so a two-letter flag
		// sequence measures 2 columns in total.
		'\u{1F300}'..='\u{1F6FF}' | // Pictographs, Emoticons, Transport and Map Symbols
		'\u{1F700}'..='\u{1F77F}' | // Alchemical Symbols
		'\u{1F780}'..='\u{1F7FF}' | // Geometric Shapes Extended
		'\u{1F800}'..='\u{1F8FF}' | // Supplemental Arrows-C
		'\u{1F900}'..='\u{1F9FF}' | // Supplemental Symbols and Pictographs
		'\u{1FA00}'..='\u{1FA6F}' | // Chess Symbols
		'\u{1FA70}'..='\u{1FAFF}' => 2, // Symbols and Pictographs Extended-A

		// Some specific double-width symbols
		'\u{2600}'..='\u{27BF}' => 2, // Miscellaneous Symbols, Dingbats

		// Default: single width
		_ => 1,
	}
}
81
#[cfg(test)]
pub mod tests {
	use super::*;

	/// Asserts the display width of every `(text, expected_width)` pair.
	fn check_widths(cases: &[(&str, usize)]) {
		for &(text, expected) in cases {
			assert_eq!(text.width(), expected, "unexpected width for {:?}", text);
		}
	}

	#[test]
	fn test_ascii() {
		check_widths(&[("Hello", 5), ("Hello, World!", 13), ("", 0)]);
	}

	#[test]
	fn test_cjk() {
		check_widths(&[
			("你好", 4),        // two Chinese characters
			("こんにちは", 10), // five Japanese characters
			("안녕하세요", 10), // five Korean characters
		]);
	}

	#[test]
	fn test_mixed() {
		// "Hello " contributes 6 columns, the two CJK characters 4 more.
		check_widths(&[("Hello 世界", 10)]);
	}

	#[test]
	fn test_control_chars() {
		check_widths(&[
			("\x00\x01\x02", 0),
			("Hello\nWorld", 10), // newline contributes no width
			("Hello\tWorld", 10), // tab contributes no width
		]);
	}

	#[test]
	fn test_combining_marks() {
		check_widths(&[
			("e\u{0301}", 1), // e + combining acute accent
			("a\u{0303}", 1), // a + combining tilde
		]);
	}

	#[test]
	fn test_emoji() {
		check_widths(&[
			("🚀", 2), // rocket
			("😀", 2), // smiley face
			("🎉", 2), // party popper
			// "Unicode: " (9) + 🚀 (2) + " ñ é ü" (6) = 17
			("Unicode: 🚀 ñ é ü", 17),
		]);
	}
}