Skip to main content

beamterm_unicode/
lib.rs

1//! Unicode character classification utilities for beamterm.
2//!
3//! Provides emoji detection and double-width character classification
4//! shared across the beamterm workspace crates.
5
6use unicode_width::UnicodeWidthStr;
7
8/// Checks if a grapheme is an emoji that should use color font rendering.
9///
10/// Uses UTF-8 byte-level checks and a codepoint table to avoid calling
11/// `unicode-width` for single-codepoint strings (the common case). Only
12/// multi-codepoint sequences (ZWJ, flags, keycaps, text + FE0F) fall
13/// through to a `width()` check.
14#[must_use]
15pub fn is_emoji(s: &str) -> bool {
16    let bytes = s.as_bytes();
17    let Some(&first_byte) = bytes.first() else {
18        return false;
19    };
20
21    // ASCII (1 byte, U+0000–U+007F): single ASCII is never emoji, but
22    // multi-codepoint sequences starting with ASCII can be (e.g. keycap "1️⃣").
23    if first_byte < 0x80 {
24        return s.len() > 1 && s.width() >= 2;
25    }
26
27    // 2-byte UTF-8 (U+0080–U+07FF): no emoji exist in this range.
28    if first_byte < 0xE0 {
29        return s.len() > 2 && s.width() >= 2;
30    }
31
32    // 3+ byte UTF-8: decode the first codepoint.
33    // SAFETY: we verified the string is non-empty and starts with a 3+ byte sequence.
34    let first = unsafe { s.chars().next().unwrap_unchecked() };
35    let first_len = first.len_utf8();
36
37    // Single codepoint
38    if s.len() == first_len {
39        // 3-byte (BMP, U+0800–U+FFFF): emoji table is exact — skip width().
40        // 4-byte (SMP, U+10000+): range check is broad, verify with width().
41        return if first_len == 3 {
42            is_emoji_presentation(first)
43        } else {
44            s.width() >= 2 && is_emoji_presentation(first)
45        };
46    }
47
48    // Multi-codepoint: emoji if wide (ZWJ, flags, skin tones, text + FE0F).
49    s.width() >= 2
50}
51
52/// Checks if a grapheme is double-width (emoji or fullwidth character).
53#[must_use]
54pub fn is_double_width(grapheme: &str) -> bool {
55    grapheme.width() >= 2
56}
57
/// Tells whether `c` is emoji-presentation-by-default and reported as
/// width 2 by `unicode-width`.
///
/// Two groups of code points qualify:
///
/// * 60 BMP code points between U+231A and U+2B55, listed explicitly.
/// * Everything in U+1F000–U+1FFFF except the CJK Enclosed Ideographic
///   Supplement characters that are wide but not emoji.
///
/// Every other code point yields `false`.
///
/// Derived from cross-referencing every entry in the `emojis` 0.8 crate
/// against `unicode-width` 0.2 — see `tests/enumerate_emojis_crate.rs`.
fn is_emoji_presentation(c: char) -> bool {
    // SMP block: nearly everything here is emoji, so list the exceptions
    // (EAW=W text symbols) rather than the members.
    if ('\u{1F000}'..='\u{1FFFF}').contains(&c) {
        let wide_text_symbol = matches!(
            c,
            '\u{1F200}'
                | '\u{1F202}'..='\u{1F219}'
                | '\u{1F21B}'..='\u{1F22E}'
                | '\u{1F230}'..='\u{1F231}'
                | '\u{1F237}'
                | '\u{1F23B}'..='\u{1F24F}'
                | '\u{1F260}'..='\u{1F265}'
        );
        return !wide_text_symbol;
    }

    // BMP emoji with default emoji presentation (60 code points).
    matches!(
        c,
        '\u{231A}'..='\u{231B}'       // ⌚⌛
            | '\u{23E9}'..='\u{23EC}' // ⏩⏪⏫⏬
            | '\u{23F0}'              // ⏰
            | '\u{23F3}'              // ⏳
            | '\u{25FD}'..='\u{25FE}' // ◽◾
            | '\u{2614}'..='\u{2615}' // ☔☕
            | '\u{2648}'..='\u{2653}' // ♈..♓
            | '\u{267F}'              // ♿
            | '\u{2693}'              // ⚓
            | '\u{26A1}'              // ⚡
            | '\u{26AA}'..='\u{26AB}' // ⚪⚫
            | '\u{26BD}'..='\u{26BE}' // ⚽⚾
            | '\u{26C4}'..='\u{26C5}' // ⛄⛅
            | '\u{26CE}'              // ⛎
            | '\u{26D4}'              // ⛔
            | '\u{26EA}'              // ⛪
            | '\u{26F2}'..='\u{26F3}' // ⛲⛳
            | '\u{26F5}'              // ⛵
            | '\u{26FA}'              // ⛺
            | '\u{26FD}'              // ⛽
            | '\u{2705}'              // ✅
            | '\u{270A}'..='\u{270B}' // ✊✋
            | '\u{2728}'              // ✨
            | '\u{274C}'              // ❌
            | '\u{274E}'              // ❎
            | '\u{2753}'..='\u{2755}' // ❓❔❕
            | '\u{2757}'              // ❗
            | '\u{2795}'..='\u{2797}' // ➕➖➗
            | '\u{27B0}'              // ➰
            | '\u{27BF}'              // ➿
            | '\u{2B1B}'..='\u{2B1C}' // ⬛⬜
            | '\u{2B50}'              // ⭐
            | '\u{2B55}'              // ⭕
    )
}
121
#[cfg(test)]
mod tests {
    use super::*;

    /// `is_emoji` accepts emoji-presentation code points and text symbols
    /// promoted by U+FE0F, and rejects bare text-presentation symbols and
    /// ordinary characters.
    #[test]
    fn test_is_emoji() {
        let emoji = [
            // Emoji-presentation-by-default: always emoji
            "\u{1F680}",
            "\u{1F600}",
            "\u{23E9}",
            "\u{23EA}",
            // Text-presentation-by-default with FE0F: emoji
            "\u{25B6}\u{FE0F}",
        ];
        for s in emoji {
            assert!(is_emoji(s), "expected emoji: {}", s.escape_unicode());
        }

        let not_emoji = [
            // Text-presentation-by-default without FE0F: NOT emoji
            "\u{25B6}",
            "\u{25C0}",
            "\u{23ED}",
            "\u{23F9}",
            "\u{23EE}",
            "\u{25AA}",
            "\u{25AB}",
            "\u{25FC}",
            // Not emoji
            "A",
            "\u{2588}",
        ];
        for s in not_emoji {
            assert!(!is_emoji(s), "expected non-emoji: {}", s.escape_unicode());
        }
    }

    /// `is_double_width` accepts wide emoji, FE0F-promoted symbols and CJK,
    /// and rejects text-presentation symbols and narrow characters.
    #[test]
    fn test_is_double_width() {
        // emoji-presentation-by-default, including a ZWJ sequence
        let wide_sequences = [
            "\u{1F600}",
            "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}",
        ];
        for s in wide_sequences {
            assert!(is_double_width(s), "expected double-width: {}", s.escape_unicode());
        }

        let wide_emoji = [
            "\u{231A}", "\u{231B}", "\u{23E9}", "\u{23F3}", "\u{2614}", "\u{2615}", "\u{2648}",
            "\u{2653}", "\u{267F}", "\u{2693}", "\u{26A1}", "\u{26AA}", "\u{26AB}", "\u{26BD}",
            "\u{26BE}", "\u{26C4}", "\u{26C5}", "\u{26CE}", "\u{26D4}", "\u{26EA}", "\u{26F2}",
            "\u{26F3}", "\u{26F5}", "\u{26FA}", "\u{26FD}", "\u{25FE}", "\u{2B1B}", "\u{2B1C}",
            "\u{2B50}", "\u{2B55}", "\u{3030}", "\u{303D}", "\u{3297}", "\u{3299}",
        ];
        for s in wide_emoji {
            assert!(is_double_width(s), "Failed for emoji: {s}");
        }

        let wide_other = [
            // text-presentation-by-default with FE0F: double-width
            "\u{25B6}\u{FE0F}",
            "\u{25C0}\u{FE0F}",
            // CJK
            "\u{4E2D}",
            "\u{65E5}",
        ];
        for s in wide_other {
            assert!(is_double_width(s), "expected double-width: {}", s.escape_unicode());
        }

        let narrow = [
            // text-presentation-by-default without FE0F: single-width
            "\u{23F8}",
            "\u{23FA}",
            "\u{25AA}",
            "\u{25AB}",
            "\u{25B6}",
            "\u{25C0}",
            "\u{25FB}",
            "\u{2934}",
            "\u{2935}",
            "\u{2B05}",
            "\u{2B07}",
            "\u{26C8}",
            // single-width
            "A",
            "\u{2192}",
        ];
        for s in narrow {
            assert!(!is_double_width(s), "expected single-width: {}", s.escape_unicode());
        }
    }
}