beamterm_unicode/lib.rs
1//! Unicode character classification utilities for beamterm.
2//!
3//! Provides emoji detection and double-width character classification
4//! shared across the beamterm workspace crates.
5
6use unicode_width::UnicodeWidthStr;
7
8/// Checks if a grapheme is an emoji that should use color font rendering.
9///
10/// Uses UTF-8 byte-level checks and a codepoint table to avoid calling
11/// `unicode-width` for single-codepoint strings (the common case). Only
12/// multi-codepoint sequences (ZWJ, flags, keycaps, text + FE0F) fall
13/// through to a `width()` check.
14#[must_use]
15pub fn is_emoji(s: &str) -> bool {
16 let bytes = s.as_bytes();
17 let Some(&first_byte) = bytes.first() else {
18 return false;
19 };
20
21 // ASCII (1 byte, U+0000–U+007F): single ASCII is never emoji, but
22 // multi-codepoint sequences starting with ASCII can be (e.g. keycap "1️⃣").
23 if first_byte < 0x80 {
24 return s.len() > 1 && s.width() >= 2;
25 }
26
27 // 2-byte UTF-8 (U+0080–U+07FF): no emoji exist in this range.
28 if first_byte < 0xE0 {
29 return s.len() > 2 && s.width() >= 2;
30 }
31
32 // 3+ byte UTF-8: decode the first codepoint.
33 // SAFETY: we verified the string is non-empty and starts with a 3+ byte sequence.
34 let first = unsafe { s.chars().next().unwrap_unchecked() };
35 let first_len = first.len_utf8();
36
37 // Single codepoint
38 if s.len() == first_len {
39 // 3-byte (BMP, U+0800–U+FFFF): emoji table is exact — skip width().
40 // 4-byte (SMP, U+10000+): range check is broad, verify with width().
41 return if first_len == 3 {
42 is_emoji_presentation(first)
43 } else {
44 s.width() >= 2 && is_emoji_presentation(first)
45 };
46 }
47
48 // Multi-codepoint: emoji if wide (ZWJ, flags, skin tones, text + FE0F).
49 s.width() >= 2
50}
51
52/// Checks if a grapheme is double-width (emoji or fullwidth character).
53#[must_use]
54pub fn is_double_width(grapheme: &str) -> bool {
55 grapheme.width() >= 2
56}
57
/// Table lookup for characters that have emoji presentation by default and
/// that `unicode-width` reports as width 2.
///
/// Two regions are recognised:
///
/// - BMP: an exact list of 60 code points between U+231A and U+2B55.
/// - SMP: everything in U+1F000–U+1FFFF except the CJK Enclosed Ideographic
///   Supplement characters, which are wide but are not emoji.
///
/// Any other character yields `false`.
///
/// Derived from cross-referencing every entry in the `emojis` 0.8 crate
/// against `unicode-width` 0.2 — see `tests/enumerate_emojis_crate.rs`.
fn is_emoji_presentation(c: char) -> bool {
    // SMP: nearly every character in this block is an emoji, so it is
    // cheaper to list the exceptions (EAW=W text symbols) than the members.
    if ('\u{1F000}'..='\u{1FFFF}').contains(&c) {
        let wide_text_symbol = matches!(
            c,
            '\u{1F200}'
                | '\u{1F202}'..='\u{1F219}'
                | '\u{1F21B}'..='\u{1F22E}'
                | '\u{1F230}'..='\u{1F231}'
                | '\u{1F237}'
                | '\u{1F23B}'..='\u{1F24F}'
                | '\u{1F260}'..='\u{1F265}'
        );
        return !wide_text_symbol;
    }

    // BMP: exact membership list (60 code points, U+231A–U+2B55).
    matches!(
        c,
        '\u{231A}'..='\u{231B}' // ⌚⌛
            | '\u{23E9}'..='\u{23EC}' // ⏩⏪⏫⏬
            | '\u{23F0}' // ⏰
            | '\u{23F3}' // ⏳
            | '\u{25FD}'..='\u{25FE}' // ◽◾
            | '\u{2614}'..='\u{2615}' // ☔☕
            | '\u{2648}'..='\u{2653}' // ♈..♓
            | '\u{267F}' // ♿
            | '\u{2693}' // ⚓
            | '\u{26A1}' // ⚡
            | '\u{26AA}'..='\u{26AB}' // ⚪⚫
            | '\u{26BD}'..='\u{26BE}' // ⚽⚾
            | '\u{26C4}'..='\u{26C5}' // ⛄⛅
            | '\u{26CE}' // ⛎
            | '\u{26D4}' // ⛔
            | '\u{26EA}' // ⛪
            | '\u{26F2}'..='\u{26F3}' // ⛲⛳
            | '\u{26F5}' // ⛵
            | '\u{26FA}' // ⛺
            | '\u{26FD}' // ⛽
            | '\u{2705}' // ✅
            | '\u{270A}'..='\u{270B}' // ✊✋
            | '\u{2728}' // ✨
            | '\u{274C}' // ❌
            | '\u{274E}' // ❎
            | '\u{2753}'..='\u{2755}' // ❓❔❕
            | '\u{2757}' // ❗
            | '\u{2795}'..='\u{2797}' // ➕➖➗
            | '\u{27B0}' // ➰
            | '\u{27BF}' // ➿
            | '\u{2B1B}'..='\u{2B1C}' // ⬛⬜
            | '\u{2B50}' // ⭐
            | '\u{2B55}' // ⭕
    )
}
121
#[cfg(test)]
mod tests {
    use super::{is_double_width, is_emoji};

    /// Exercises `is_emoji` on default-emoji, FE0F-promoted, text-default
    /// and plain-text graphemes.
    #[test]
    fn test_is_emoji() {
        // Default emoji presentation: classified as emoji on their own.
        for grapheme in ["\u{1F680}", "\u{1F600}", "\u{23E9}", "\u{23EA}"] {
            assert!(is_emoji(grapheme));
        }

        // Default text presentation becomes emoji once FE0F is appended.
        assert!(is_emoji("\u{25B6}\u{FE0F}"));

        // Default text presentation with no FE0F stays text.
        for grapheme in [
            "\u{25B6}", "\u{25C0}", "\u{23ED}", "\u{23F9}", "\u{23EE}", "\u{25AA}", "\u{25AB}",
            "\u{25FC}",
        ] {
            assert!(!is_emoji(grapheme));
        }

        // Plain text: an ASCII letter and a block-drawing character.
        for grapheme in ["A", "\u{2588}"] {
            assert!(!is_emoji(grapheme));
        }
    }

    /// Exercises `is_double_width` on emoji, FE0F sequences, CJK and
    /// single-width graphemes.
    #[test]
    fn test_is_double_width() {
        // Default emoji presentation, including a ZWJ family sequence.
        assert!(is_double_width("\u{1F600}"));
        assert!(is_double_width(
            "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}"
        ));

        let wide_symbols = [
            "\u{231A}", "\u{231B}", "\u{23E9}", "\u{23F3}", "\u{2614}", "\u{2615}", "\u{2648}",
            "\u{2653}", "\u{267F}", "\u{2693}", "\u{26A1}", "\u{26AA}", "\u{26AB}", "\u{26BD}",
            "\u{26BE}", "\u{26C4}", "\u{26C5}", "\u{26CE}", "\u{26D4}", "\u{26EA}", "\u{26F2}",
            "\u{26F3}", "\u{26F5}", "\u{26FA}", "\u{26FD}", "\u{25FE}", "\u{2B1B}", "\u{2B1C}",
            "\u{2B50}", "\u{2B55}", "\u{3030}", "\u{303D}", "\u{3297}", "\u{3299}",
        ];
        for s in wide_symbols {
            assert!(is_double_width(s), "Failed for emoji: {s}");
        }

        // Default text presentation + FE0F: promoted to double-width.
        for grapheme in ["\u{25B6}\u{FE0F}", "\u{25C0}\u{FE0F}"] {
            assert!(is_double_width(grapheme));
        }

        // Default text presentation without FE0F: stays single-width.
        for grapheme in [
            "\u{23F8}", "\u{23FA}", "\u{25AA}", "\u{25AB}", "\u{25B6}", "\u{25C0}", "\u{25FB}",
            "\u{2934}", "\u{2935}", "\u{2B05}", "\u{2B07}", "\u{26C8}",
        ] {
            assert!(!is_double_width(grapheme));
        }

        // CJK ideographs are double-width.
        for grapheme in ["\u{4E2D}", "\u{65E5}"] {
            assert!(is_double_width(grapheme));
        }

        // ASCII and a plain arrow are single-width.
        for grapheme in ["A", "\u{2192}"] {
            assert!(!is_double_width(grapheme));
        }
    }
}