1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
// devela::text::char::namespace::u32
use crate::{Char, TextLut};
/// # Methods over `u32`.
///
/// The wrapped `u32` is interpreted as a Unicode code point. Most methods do
/// not validate it; use [`is_valid_code`][Self::is_valid_code] or
/// [`is_valid_scalar`][Self::is_valid_scalar] first when the value is untrusted.
#[rustfmt::skip]
impl Char<u32> {
    /* private helpers */

    /// Bitmask for extracting the 6-bit payload from a UTF-8 continuation byte (`10xxxxxx`).
    pub(crate) const CONT_MASK: u32 = 0b0011_1111;

    /* constants */

    /// The maximum value of a Unicode code point.
    const MAX_UNICODE: u32 = 0x10_FFFF;

    /// The value of the first Unicode surrogate code point.
    const SURROGATE_START: u32 = 0xD800;

    /// The value of the last Unicode surrogate code point.
    const SURROGATE_END: u32 = 0xDFFF;

    /* public methods */

    /// Returns the bytes required to store the given Unicode code point in a non-UTF encoding.
    ///
    /// This function does **not** determine the UTF-8 byte length.
    /// It assumes a simple encoding where values up to `0xFF` use 1 byte,
    /// `0x100..=0xFFFF` use 2 bytes, and anything larger uses 3 bytes.
    #[must_use]
    pub const fn len_bytes(self) -> usize {
        match self.0 {
            0x0000..=0x00FF => 1,
            0x0100..=0xFFFF => 2,
            _ => 3,
        }
    }

    /// Returns the number of bytes required to encode the given Unicode scalar as UTF-8.
    ///
    /// Returns `None` if it's not a valid Unicode scalar.
    #[must_use]
    pub const fn len_utf8(self) -> Option<usize> {
        if self.is_valid_scalar() { Some(self.len_utf8_unchecked()) } else { None }
    }

    /// Returns the UTF-8 byte length of the current Unicode scalar **without validation**.
    ///
    /// Assumes the code is a valid Unicode scalar; any value above `0xFFFF`
    /// (including values beyond `U+10FFFF`) is reported as 4 bytes.
    /// Use [`len_utf8`][Self::len_utf8] for a checked version.
    #[must_use]
    pub const fn len_utf8_unchecked(self) -> usize {
        match self.0 {
            0x00_0000..=0x00_007F => 1,
            0x00_0080..=0x00_07FF => 2,
            0x00_0800..=0x00_FFFF => 3,
            _ => 4,
        }
    }

    /// Returns the monospace display width.
    ///
    /// - 0: Non-printing characters, as reported by [`is_control`][Self::is_control]
    ///   or [`is_combining`][Self::is_combining]
    /// - 2: Wide characters, as reported by [`is_fullwidth`][Self::is_fullwidth]
    ///   (CJK, Hangul, kana, fullwidth forms)
    /// - 1: Everything else (Latin, Greek, Cyrillic, etc.)
    ///
    /// The zero-width check takes precedence over the wide check.
    #[must_use]
    pub const fn width(self) -> usize {
        if self.is_control() || self.is_combining() {
            0
        } else if self.is_fullwidth() {
            2
        } else {
            1
        }
    }

    /// Returns the monospace display width using faster calculation.
    ///
    /// Uses the reduced `*_common` range tables, so characters that only the
    /// comprehensive tables know about are reported as width 1 instead of
    /// their 0 or 2 width.
    #[must_use]
    pub const fn width_common(self) -> usize {
        if self.is_control_common() || self.is_combining_common() {
            0
        } else if self.is_fullwidth_common() {
            2
        } else {
            1
        }
    }

    /// Checks if the value is a valid Unicode code point.
    ///
    /// A valid Unicode code point is any integer in the range:
    /// - `U+0000` to `U+10FFFF` (inclusive)
    ///
    /// This includes surrogate code points (`U+D800` to `U+DFFF`), which are
    /// valid code points but cannot be represented as Unicode scalars.
    ///
    /// # Example
    /// ```
    /// # use devela::Char;
    /// assert!(Char('A' as u32).is_valid_code()); // regular character
    /// assert!(Char(0x00).is_valid_code()); // NULL is valid
    /// assert!(Char(0x10FFFF).is_valid_code()); // maximum Unicode code point
    /// // surrogates are valid code points:
    /// assert!(Char(0xD800).is_valid_code()); // high surrogate
    /// assert!(Char(0xDFFF).is_valid_code()); // low surrogate
    /// // invalid:
    /// assert!(!Char(0x110000).is_valid_code()); // above max Unicode
    /// ```
    #[must_use] #[inline(always)]
    pub const fn is_valid_code(self) -> bool {
        self.0 <= Self::MAX_UNICODE
    }

    /// Checks if the value is a valid Unicode scalar (a Rust's [`char`]).
    ///
    /// A valid Unicode scalar value is any integer in the ranges:
    /// - `U+0000` to `U+D7FF` (inclusive), or
    /// - `U+E000` to `U+10FFFF` (inclusive)
    ///
    /// This excludes surrogate code points (`U+D800` to `U+DFFF`), which are
    /// invalid in UTF-8 and cannot be represented as Unicode scalars.
    ///
    /// # Example
    /// ```
    /// # use devela::Char;
    /// assert!(Char('A' as u32).is_valid_scalar()); // regular character
    /// assert!(Char(0x00).is_valid_scalar()); // NULL is valid
    /// assert!(Char(0x10FFFF).is_valid_scalar()); // maximum Unicode scalar
    /// // invalid:
    /// assert!(!Char(0xD800).is_valid_scalar()); // high surrogate
    /// assert!(!Char(0xDFFF).is_valid_scalar()); // low surrogate
    /// assert!(!Char(0x110000).is_valid_scalar()); // above max Unicode
    /// ```
    #[must_use]
    #[inline(always)]
    pub const fn is_valid_scalar(self) -> bool {
        // Everything below the surrogate block is valid; above it, only the
        // upper bound of the code space still needs checking.
        (self.0 < Self::SURROGATE_START)
            || (self.0 > Self::SURROGATE_END && self.is_valid_code())
    }

    /// Checks if the given value is a 7-bit ASCII character (U+0000..=U+007F).
    #[must_use] #[inline(always)]
    pub const fn is_ascii(self) -> bool { self.0 <= 0x7F }

    /// Returns `true` if the given Unicode scalar code is a [noncharacter][0].
    ///
    /// The noncharacters are `U+FDD0..=U+FDEF` and the last two code points of
    /// every plane (`U+nFFFE` and `U+nFFFF`).
    ///
    /// Note that this also checks against reserved, potential non-characters
    /// (`U+2FE0..=U+2FEF`).
    ///
    /// The value is not range-checked: a value above `U+10FFFF` whose low
    /// 16 bits are `FFFE` or `FFFF` also returns `true`.
    ///
    /// # Example
    /// ```
    /// # use devela::Char;
    /// assert!(Char(0xFFFE).is_noncharacter());
    /// assert!(Char(0x10FFFF).is_noncharacter());
    /// assert!(Char(0xFDD0).is_noncharacter());
    /// // assigned characters that merely end in the byte `FF` are not noncharacters:
    /// assert!(!Char(0x1F5FF).is_noncharacter());
    /// ```
    ///
    /// [0]: https://www.unicode.org/glossary/#noncharacter
    #[must_use]
    pub const fn is_noncharacter(self) -> bool {
        // sub-block of 32 non-characters:
        (self.0 >= 0xFDD0 && self.0 <= 0xFDEF)
            // 2× non-characters at the end of each plane: the low 16 bits must
            // be FFFE or FFFF (masking only the low byte would also match
            // ordinary code points such as U+100FE or U+1F5FF).
            || (self.0 & 0xFFFE) == 0xFFFE
            // unallocated range (16 potential non-characters):
            || (self.0 >= 0x2FE0 && self.0 <= 0x2FEF)
    }

    /// Returns `true` for all Unicode combining characters.
    ///
    /// Includes musical notation, historic scripts, and obscure diacritics.
    /// Comprehensive but slightly slower than `is_combining_common`.
    #[must_use]
    pub const fn is_combining(self) -> bool {
        // NOTE(review): the two Siyaq ranges below look like numeric symbols
        // rather than combining marks — confirm that zero width is intended.
        matches![
            self.0,
            0x0300..=0x036F |   // Combining Diacritical Marks
            0x1AB0..=0x1AFF |   // Combining Diacritical Marks Extended
            0x1DC0..=0x1DFF |   // Combining Diacritical Marks Supplement
            0x20D0..=0x20FF |   // Combining Diacritical Marks for Symbols
            0xFE20..=0xFE2F |   // Combining Half Marks
            0xFE00..=0xFE0F |   // Variation Selectors
            0xE0100..=0xE01EF | // Variation Selectors Supplement
            0x1D1A0..=0x1D1CD | // Musical Symbols
            0x1D200..=0x1D245 | // Ancient Greek Musical Notation
            0x1E000..=0x1E006 | // Glagolitic Combining Letters
            0x1E130..=0x1E136 | // Nyiakeng Puachue Hmong
            0x1E2AE..=0x1E2BF | // Toto
            0x1E2EC..=0x1E2EF | // Wancho
            0x1EC71..=0x1ECAB | // Indic Siyaq Numbers
            0x1ED01..=0x1ED3D   // Ottoman Siyaq Numbers
        ]
    }

    /// Returns `true` for common combining marks used in modern text.
    ///
    /// Covers Latin, Greek, and most European language diacritics.
    /// Fast and suitable for 95% of use cases.
    #[must_use]
    pub const fn is_combining_common(self) -> bool {
        matches![
            self.0,
            0x0300..=0x036F | // Combining Diacritical Marks
            0x1DC0..=0x1DFF | // Combining Diacritical Marks Supplement
            0x20D0..=0x20FF | // Combining Diacritical Marks for Symbols
            0xFE20..=0xFE2F | // Combining Half Marks
            0xFE00..=0xFE0F   // Variation Selectors
        ]
    }

    /// Returns `true` for all Unicode control characters.
    ///
    /// Besides the ASCII (C0, DEL) and C1 controls this also covers the
    /// invisible formatting characters listed in the body.
    #[must_use]
    pub const fn is_control(self) -> bool {
        matches![self.0,
            // ASCII and C1 controls
            0x00..=0x1F | 0x7F | 0x80..=0x9F |
            // Unicode control blocks
            0x070F |            // Syriac Abbreviation Mark
            0x180B..=0x180E |   // Mongolian controls
            0x200B..=0x200F |   // Zero-width spaces, bidirectional
            0x202A..=0x202E |   // Bidirectional formatting
            0x2060..=0x206F |   // Word joiners, invisible operators
            0xFEFF |            // Zero Width No-Break Space (BOM)
            0xFFF9..=0xFFFB |   // Interlinear annotation controls
            // NOTE(review): the Kaithi format character appears to be U+110BD
            // (KAITHI NUMBER SIGN) — confirm that U+110B9 is the intended value.
            0x110B9 |           // Kaithi punctuation
            0x1D173..=0x1D17A | // Musical symbols controls
            0xE0000..=0xE007F   // Tags
        ]
    }

    /// Returns `true` for common Unicode control characters.
    ///
    /// Just ASCII and C1 controls, zero-width spaces, bidi formatting,
    /// word joiners and invisible operators.
    #[must_use]
    pub const fn is_control_common(self) -> bool {
        matches![self.0,
            // ASCII and C1 controls
            0x00..=0x1F | 0x7F | 0x80..=0x9F |
            // most common Unicode control blocks
            0x200B..=0x200F | // Zero-width spaces, bidirectional
            0x202A..=0x202E | // Bidirectional formatting
            0x2060..=0x206F   // Word joiners, invisible operators
        ]
    }

    /// Returns `true` for all Unicode fullwidth characters.
    #[must_use]
    pub const fn is_fullwidth(self) -> bool {
        matches![self.0,
            // fullwidth Forms block (FF00-FFEF)
            0xFF01..=0xFF5E |   // Fullwidth ASCII
            0xFF5F..=0xFF60 |   // Fullwidth brackets
            0xFF61..=0xFF9F |   // Halfwidth Katakana (considered fullwidth in context)
            0xFFE0..=0xFFE6 |   // Fullwidth symbols
            0xFFE8..=0xFFEE |   // Fullwidth halfwidth forms
            // CJK Unified Ideographs and extensions
            0x4E00..=0x9FFF |   // CJK Unified Ideographs
            0x3400..=0x4DBF |   // CJK Extension A
            0x20000..=0x2A6DF | // CJK Extension B
            0x2A700..=0x2B73F | // CJK Extension C
            0x2B740..=0x2B81F | // CJK Extension D
            0x2B820..=0x2CEAF | // CJK Extension E
            0x2CEB0..=0x2EBEF | // CJK Extension F
            // Hangul Syllables
            0xAC00..=0xD7AF |
            // Other East Asian wide characters
            0x3000..=0x303F |   // CJK Symbols and Punctuation
            0x3040..=0x309F |   // Hiragana
            0x30A0..=0x30FF |   // Katakana
            0x3100..=0x312F |   // Bopomofo
            0x3130..=0x318F |   // Hangul Compatibility Jamo
            0x3190..=0x319F |   // Kanbun
            0x31A0..=0x31BF |   // Bopomofo Extended
            0x31C0..=0x31EF |   // CJK Strokes
            0x31F0..=0x31FF |   // Katakana Phonetic Extensions
            0x3200..=0x32FF |   // Enclosed CJK Letters and Months
            0x3300..=0x33FF |   // CJK Compatibility
            0xFE10..=0xFE1F |   // Vertical Forms
            0xFE30..=0xFE4F |   // CJK Compatibility Forms
            0xFE50..=0xFE6F |   // Small Form Variants
            0x1F200..=0x1F2FF   // Enclosed Ideographic Supplement
        ]
    }

    /// Returns `true` for common fullwidth characters (ASCII variants, basic CJK)
    #[must_use]
    pub const fn is_fullwidth_common(self) -> bool {
        matches![
            self.0,
            // Fullwidth ASCII variants and basic East Asian punctuation
            0xFF01..=0xFF5E |
            0xFF5F..=0xFF60 |
            0xFF61..=0xFF9F |
            0xFFE0..=0xFFE6 |
            0xFFE8..=0xFFEE |
            // Basic CJK ranges
            0x4E00..=0x9FFF |
            0x3000..=0x303F
        ]
    }

    /// Returns `true` if the given value is a Unicode [surrogate][0] code point.
    ///
    /// [0]: https://www.unicode.org/glossary/#surrogate_code_point
    #[must_use] #[inline(always)]
    pub const fn is_surrogate(self) -> bool { matches!(self.0, 0xD800..=0xDFFF) }

    /// Returns `true` if the given value is a Unicode [leading surrogate][0] code point.
    ///
    /// [0]: https://www.unicode.org/glossary/#high_surrogate_code_point
    #[must_use] #[inline(always)]
    pub const fn is_surrogate_high(self) -> bool { matches!(self.0, 0xD800..=0xDBFF) }

    /// Returns `true` if the given value is a Unicode [trailing surrogate][0] code point.
    ///
    /// [0]: https://www.unicode.org/glossary/#low_surrogate_code_point
    #[must_use] #[inline(always)]
    pub const fn is_surrogate_low(self) -> bool { matches!(self.0, 0xDC00..=0xDFFF) }

    //

    /// Returns the ASCII `&'static str` representation of the value, or `""` if non-ASCII.
    #[must_use]
    pub const fn as_ascii(self) -> &'static str {
        if self.is_ascii() { TextLut::ASCII_CHARS[self.0 as usize] } else { "" }
    }

    /// Returns the ASCII `&'static str` representation of the value, or panics if non-ASCII.
    ///
    /// Unlike [`as_ascii`][Self::as_ascii] the value is not range-checked
    /// before it is used as an index into the lookup table.
    ///
    /// # Panics
    /// Panics if the character is not ASCII.
    #[must_use]
    pub const fn as_ascii_unchecked(self) -> &'static str { TextLut::ASCII_CHARS[self.0 as usize] }

    /// Converts the Unicode scalar value to a UTF-8 encoded byte sequence array.
    ///
    /// Returns `None` if the value is not a valid Unicode scalar.
    /// The result is always a `[u8; 4]` array, with unused bytes set to `0`.
    ///
    /// See also [`char::encode_utf8`].
    #[must_use] #[inline(always)]
    pub const fn to_utf8_bytes(self) -> Option<[u8; 4]> {
        if self.is_valid_scalar() { Some(self.to_utf8_bytes_unchecked()) } else { None }
    }

    /// Converts the Unicode scalar value to a UTF-8 encoded byte sequence **without validation**.
    ///
    /// Assumes the value is a valid Unicode scalar.
    /// Always returns a `[u8; 4]` array, with unused bytes set to `0`.
    ///
    /// See also [`Char::to_utf8_bytes`] for a checked version.
    #[must_use]
    #[allow(clippy::unusual_byte_groupings)]
    pub const fn to_utf8_bytes_unchecked(self) -> [u8; 4] {
        let value = self.0;
        match value {
            // From 0x0000 to 0x007F:
            // the UTF-8 encoding is the same as the scalar value.
            0x0000..=0x007F => [value as u8, 0, 0, 0],

            // From 0x0080 to 0x07FF:
            // the UTF-8 encoding is 110xxxxx 10xxxxxx,
            // where xxxxx and xxxxxx are the bits of the scalar value.
            0x0080..=0x07FF => {
                let y = 0b10_000000 | (Char::<u8>::CONT_MASK & (value as u8));
                let x = 0b110_00000 | ((value >> 6) as u8);
                [x, y, 0, 0]
            }

            // From 0x0800 to 0xFFFF:
            // the UTF-8 encoding is 1110xxxx 10xxxxxx 10xxxxxx.
            0x0800..=0xFFFF => {
                let z = 0b10_000000 | (Char::<u8>::CONT_MASK & (value as u8));
                let y = 0b10_000000 | ((value >> 6) & Char::<u32>::CONT_MASK) as u8;
                let x = 0b1110_0000 | ((value >> 12) as u8);
                [x, y, z, 0]
            }

            // From 0x10000 to 0x10FFFF:
            // the UTF-8 encoding is 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
            // (Larger, invalid values also land here; the lead byte is then meaningless.)
            _ => {
                let w = 0b10_000000 | (Char::<u8>::CONT_MASK & (value as u8));
                let z = 0b10_000000 | ((value >> 6) & Char::<u32>::CONT_MASK) as u8;
                let y = 0b10_000000 | ((value >> 12) & Char::<u32>::CONT_MASK) as u8;
                let x = 0b11110_000 | ((value >> 18) as u8);
                [x, y, z, w]
            }
        }
    }
}