devela/text/ascii/char.rs
1// devela::text::ascii::char
2//
3//! Defines [`CharAscii`].
4//
5// Ported from:
6// - https://doc.rust-lang.org/stable/core/ascii/enum.Char.html
7// - WAIT: [ascii::Char](https://github.com/rust-lang/rust/issues/110998)
8
9#[cfg(feature = "unsafe_str")]
10use crate::transmute;
11use ::core::fmt;
12
13#[doc = crate::_tags!(text)]
/// One of 128 Unicode characters (`U+0000` to `U+007F`), the [ASCII] subset.
15#[doc = crate::_doc_meta!{location("text")}]
16///
17/// Officially, this is the first [block] in Unicode, _Basic Latin_.
18/// For details, see the [*C0 Controls and Basic Latin*][chart] code chart.
19///
20/// This block was based on older 7-bit character code standards such as
21/// ANSI X3.4-1977, ISO 646-1973, and [NIST FIPS 1-2].
22///
23/// # When to use this
24/// The main advantage of this subset is that it's always valid UTF-8. As such,
25/// the `&[CharAscii]` -> `&str` conversion function (as well as other related
26/// ones) are O(1): *no* runtime checks are needed.
27///
28/// If you're consuming strings, you should usually handle Unicode and thus
29/// accept `str`s, not limit yourself to `CharAscii`s.
30///
31/// However, certain formats are intentionally designed to produce ASCII-only
32/// output in order to be 8-bit-clean. In those cases, it can be simpler and
33/// faster to generate `CharAscii`s instead of dealing with the variable width
34/// properties of general UTF-8 encoded strings, while still allowing the result
35/// to be used freely with other Rust things that deal in general `str`s.
36///
37/// For example, a UUID library might offer a way to produce the string
38/// representation of a UUID as an `[CharAscii; 36]` to avoid memory
39/// allocation yet still allow it to be used as UTF-8 via `as_str` without
40/// paying for validation (or needing `unsafe` code) the way it would if it
41/// were provided as a `[u8; 36]`.
42///
43/// # Layout
44/// This type is guaranteed to have a size and alignment of 1 byte.
45///
46/// # Names
47/// The variants on this type are [Unicode names][NamesList] of the characters
48/// in upper camel case, with a few tweaks:
49/// - For `<control>` characters, the primary alias name is used.
50/// - `LATIN` is dropped, as this block has no non-latin letters.
51/// - `LETTER` is dropped, as `CAPITAL`/`SMALL` suffices in this block.
52/// - `DIGIT`s use a single digit rather than writing out `ZERO`, `ONE`, etc.
53///
54/// [ASCII]: https://www.unicode.org/glossary/index.html#ASCII
55/// [block]: https://www.unicode.org/glossary/index.html#block
56/// [chart]: https://www.unicode.org/charts/PDF/U0000.pdf
57/// [NIST FIPS 1-2]: https://nvlpubs.nist.gov/nistpubs/Legacy/FIPS/fipspub1-2-1977.pdf
58/// [NamesList]: https://www.unicode.org/Public/15.0.0/ucd/NamesList.txt
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum CharAscii {
    // --- C0 control characters: U+0000 ..= U+001F ---
    /// U+0000: `NUL` (the default variant).
    #[default]
    Null = 0x00,
    /// U+0001: `SOH`.
    StartOfHeading = 0x01,
    /// U+0002: `STX`.
    StartOfText = 0x02,
    /// U+0003: `ETX`.
    EndOfText = 0x03,
    /// U+0004: `EOT`.
    EndOfTransmission = 0x04,
    /// U+0005: `ENQ`.
    Enquiry = 0x05,
    /// U+0006: `ACK`.
    Acknowledge = 0x06,
    /// U+0007: `BEL`.
    Bell = 0x07,
    /// U+0008: `BS`.
    Backspace = 0x08,
    /// U+0009: `HT`, the horizontal tab.
    ///
    /// NOTE(review): the Unicode alias is CHARACTER TABULATION; the `Ascii`
    /// prefix on this variant looks unintentional, but it is kept because
    /// renaming it would break callers. Confirm before changing.
    AsciiCharacterTabulation = 0x09,
    /// U+000A: `LF`.
    LineFeed = 0x0A,
    /// U+000B: `VT`, the vertical tab.
    LineTabulation = 0x0B,
    /// U+000C: `FF`.
    FormFeed = 0x0C,
    /// U+000D: `CR`.
    CarriageReturn = 0x0D,
    /// U+000E: `SO`.
    ShiftOut = 0x0E,
    /// U+000F: `SI`.
    ShiftIn = 0x0F,
    /// U+0010: `DLE`.
    DataLinkEscape = 0x10,
    /// U+0011: `DC1`.
    DeviceControlOne = 0x11,
    /// U+0012: `DC2`.
    DeviceControlTwo = 0x12,
    /// U+0013: `DC3`.
    DeviceControlThree = 0x13,
    /// U+0014: `DC4`.
    DeviceControlFour = 0x14,
    /// U+0015: `NAK`.
    NegativeAcknowledge = 0x15,
    /// U+0016: `SYN`.
    SynchronousIdle = 0x16,
    /// U+0017: `ETB`.
    EndOfTransmissionBlock = 0x17,
    /// U+0018: `CAN`.
    Cancel = 0x18,
    /// U+0019: `EM`.
    EndOfMedium = 0x19,
    /// U+001A: `SUB`.
    Substitute = 0x1A,
    /// U+001B: `ESC`.
    Escape = 0x1B,
    /// U+001C: `FS`, the file separator.
    InformationSeparatorFour = 0x1C,
    /// U+001D: `GS`, the group separator.
    InformationSeparatorThree = 0x1D,
    /// U+001E: `RS`, the record separator.
    InformationSeparatorTwo = 0x1E,
    /// U+001F: `US`, the unit separator.
    InformationSeparatorOne = 0x1F,

    // --- Space and punctuation: U+0020 ..= U+002F ---
    /// U+0020: the space character.
    Space = 0x20,
    /// U+0021: `!`.
    ExclamationMark = 0x21,
    /// U+0022: `"`.
    QuotationMark = 0x22,
    /// U+0023: `#`.
    NumberSign = 0x23,
    /// U+0024: `$`.
    DollarSign = 0x24,
    /// U+0025: `%`.
    PercentSign = 0x25,
    /// U+0026: `&`.
    Ampersand = 0x26,
    /// U+0027: `'`.
    Apostrophe = 0x27,
    /// U+0028: `(`.
    LeftParenthesis = 0x28,
    /// U+0029: `)`.
    RightParenthesis = 0x29,
    /// U+002A: `*`.
    Asterisk = 0x2A,
    /// U+002B: `+`.
    PlusSign = 0x2B,
    /// U+002C: `,`.
    Comma = 0x2C,
    /// U+002D: `-`.
    HyphenMinus = 0x2D,
    /// U+002E: `.`.
    FullStop = 0x2E,
    /// U+002F: `/`.
    Solidus = 0x2F,

    // --- Decimal digits: U+0030 ..= U+0039 ---
    /// U+0030: `0`.
    Digit0 = 0x30,
    /// U+0031: `1`.
    Digit1 = 0x31,
    /// U+0032: `2`.
    Digit2 = 0x32,
    /// U+0033: `3`.
    Digit3 = 0x33,
    /// U+0034: `4`.
    Digit4 = 0x34,
    /// U+0035: `5`.
    Digit5 = 0x35,
    /// U+0036: `6`.
    Digit6 = 0x36,
    /// U+0037: `7`.
    Digit7 = 0x37,
    /// U+0038: `8`.
    Digit8 = 0x38,
    /// U+0039: `9`.
    Digit9 = 0x39,

    // --- Punctuation: U+003A ..= U+0040 ---
    /// U+003A: `:`.
    Colon = 0x3A,
    /// U+003B: `;`.
    Semicolon = 0x3B,
    /// U+003C: `<`.
    LessThanSign = 0x3C,
    /// U+003D: `=`.
    EqualsSign = 0x3D,
    /// U+003E: `>`.
    GreaterThanSign = 0x3E,
    /// U+003F: `?`.
    QuestionMark = 0x3F,
    /// U+0040: `@`.
    CommercialAt = 0x40,

    // --- Capital letters: U+0041 ..= U+005A ---
    /// U+0041: `A`.
    CapitalA = 0x41,
    /// U+0042: `B`.
    CapitalB = 0x42,
    /// U+0043: `C`.
    CapitalC = 0x43,
    /// U+0044: `D`.
    CapitalD = 0x44,
    /// U+0045: `E`.
    CapitalE = 0x45,
    /// U+0046: `F`.
    CapitalF = 0x46,
    /// U+0047: `G`.
    CapitalG = 0x47,
    /// U+0048: `H`.
    CapitalH = 0x48,
    /// U+0049: `I`.
    CapitalI = 0x49,
    /// U+004A: `J`.
    CapitalJ = 0x4A,
    /// U+004B: `K`.
    CapitalK = 0x4B,
    /// U+004C: `L`.
    CapitalL = 0x4C,
    /// U+004D: `M`.
    CapitalM = 0x4D,
    /// U+004E: `N`.
    CapitalN = 0x4E,
    /// U+004F: `O`.
    CapitalO = 0x4F,
    /// U+0050: `P`.
    CapitalP = 0x50,
    /// U+0051: `Q`.
    CapitalQ = 0x51,
    /// U+0052: `R`.
    CapitalR = 0x52,
    /// U+0053: `S`.
    CapitalS = 0x53,
    /// U+0054: `T`.
    CapitalT = 0x54,
    /// U+0055: `U`.
    CapitalU = 0x55,
    /// U+0056: `V`.
    CapitalV = 0x56,
    /// U+0057: `W`.
    CapitalW = 0x57,
    /// U+0058: `X`.
    CapitalX = 0x58,
    /// U+0059: `Y`.
    CapitalY = 0x59,
    /// U+005A: `Z`.
    CapitalZ = 0x5A,

    // --- Punctuation: U+005B ..= U+0060 ---
    /// U+005B: `[`.
    LeftSquareBracket = 0x5B,
    /// U+005C: `\`, the backslash.
    ReverseSolidus = 0x5C,
    /// U+005D: `]`.
    RightSquareBracket = 0x5D,
    /// U+005E: `^`.
    CircumflexAccent = 0x5E,
    /// U+005F: `_`.
    LowLine = 0x5F,
    /// U+0060: the backtick character.
    GraveAccent = 0x60,

    // --- Small letters: U+0061 ..= U+007A ---
    /// U+0061: `a`.
    SmallA = 0x61,
    /// U+0062: `b`.
    SmallB = 0x62,
    /// U+0063: `c`.
    SmallC = 0x63,
    /// U+0064: `d`.
    SmallD = 0x64,
    /// U+0065: `e`.
    SmallE = 0x65,
    /// U+0066: `f`.
    SmallF = 0x66,
    /// U+0067: `g`.
    SmallG = 0x67,
    /// U+0068: `h`.
    SmallH = 0x68,
    /// U+0069: `i`.
    SmallI = 0x69,
    /// U+006A: `j`.
    SmallJ = 0x6A,
    /// U+006B: `k`.
    SmallK = 0x6B,
    /// U+006C: `l`.
    SmallL = 0x6C,
    /// U+006D: `m`.
    SmallM = 0x6D,
    /// U+006E: `n`.
    SmallN = 0x6E,
    /// U+006F: `o`.
    SmallO = 0x6F,
    /// U+0070: `p`.
    SmallP = 0x70,
    /// U+0071: `q`.
    SmallQ = 0x71,
    /// U+0072: `r`.
    SmallR = 0x72,
    /// U+0073: `s`.
    SmallS = 0x73,
    /// U+0074: `t`.
    SmallT = 0x74,
    /// U+0075: `u`.
    SmallU = 0x75,
    /// U+0076: `v`.
    SmallV = 0x76,
    /// U+0077: `w`.
    SmallW = 0x77,
    /// U+0078: `x`.
    SmallX = 0x78,
    /// U+0079: `y`.
    SmallY = 0x79,
    /// U+007A: `z`.
    SmallZ = 0x7A,

    // --- Punctuation and DEL: U+007B ..= U+007F ---
    /// U+007B: `{`.
    LeftCurlyBracket = 0x7B,
    /// U+007C: `|`.
    VerticalLine = 0x7C,
    /// U+007D: `}`.
    RightCurlyBracket = 0x7D,
    /// U+007E: `~`.
    Tilde = 0x7E,
    /// U+007F: `DEL`.
    Delete = 0x7F,
}
320
// NOTE(review): presumably registers `CharAscii::Null` as the crate-level
// initial value for this type — confirm against the `_impl_init!` definition.
crate::_impl_init![Self::Null => CharAscii];
322
323impl CharAscii {
324 /// Creates an ascii character from the byte `b`,
325 /// or returns `None` if it's too large.
326 #[must_use]
327 pub const fn from_u8(b: u8) -> Option<Self> {
328 match b {
329 0 => Some(Self::Null),
330 1 => Some(Self::StartOfHeading),
331 2 => Some(Self::StartOfText),
332 3 => Some(Self::EndOfText),
333 4 => Some(Self::EndOfTransmission),
334 5 => Some(Self::Enquiry),
335 6 => Some(Self::Acknowledge),
336 7 => Some(Self::Bell),
337 8 => Some(Self::Backspace),
338 9 => Some(Self::AsciiCharacterTabulation),
339 10 => Some(Self::LineFeed),
340 11 => Some(Self::LineTabulation),
341 12 => Some(Self::FormFeed),
342 13 => Some(Self::CarriageReturn),
343 14 => Some(Self::ShiftOut),
344 15 => Some(Self::ShiftIn),
345 16 => Some(Self::DataLinkEscape),
346 17 => Some(Self::DeviceControlOne),
347 18 => Some(Self::DeviceControlTwo),
348 19 => Some(Self::DeviceControlThree),
349 20 => Some(Self::DeviceControlFour),
350 21 => Some(Self::NegativeAcknowledge),
351 22 => Some(Self::SynchronousIdle),
352 23 => Some(Self::EndOfTransmissionBlock),
353 24 => Some(Self::Cancel),
354 25 => Some(Self::EndOfMedium),
355 26 => Some(Self::Substitute),
356 27 => Some(Self::Escape),
357 28 => Some(Self::InformationSeparatorFour),
358 29 => Some(Self::InformationSeparatorThree),
359 30 => Some(Self::InformationSeparatorTwo),
360 31 => Some(Self::InformationSeparatorOne),
361 32 => Some(Self::Space),
362 33 => Some(Self::ExclamationMark),
363 34 => Some(Self::QuotationMark),
364 35 => Some(Self::NumberSign),
365 36 => Some(Self::DollarSign),
366 37 => Some(Self::PercentSign),
367 38 => Some(Self::Ampersand),
368 39 => Some(Self::Apostrophe),
369 40 => Some(Self::LeftParenthesis),
370 41 => Some(Self::RightParenthesis),
371 42 => Some(Self::Asterisk),
372 43 => Some(Self::PlusSign),
373 44 => Some(Self::Comma),
374 45 => Some(Self::HyphenMinus),
375 46 => Some(Self::FullStop),
376 47 => Some(Self::Solidus),
377 48 => Some(Self::Digit0),
378 49 => Some(Self::Digit1),
379 50 => Some(Self::Digit2),
380 51 => Some(Self::Digit3),
381 52 => Some(Self::Digit4),
382 53 => Some(Self::Digit5),
383 54 => Some(Self::Digit6),
384 55 => Some(Self::Digit7),
385 56 => Some(Self::Digit8),
386 57 => Some(Self::Digit9),
387 58 => Some(Self::Colon),
388 59 => Some(Self::Semicolon),
389 60 => Some(Self::LessThanSign),
390 61 => Some(Self::EqualsSign),
391 62 => Some(Self::GreaterThanSign),
392 63 => Some(Self::QuestionMark),
393 64 => Some(Self::CommercialAt),
394 65 => Some(Self::CapitalA),
395 66 => Some(Self::CapitalB),
396 67 => Some(Self::CapitalC),
397 68 => Some(Self::CapitalD),
398 69 => Some(Self::CapitalE),
399 70 => Some(Self::CapitalF),
400 71 => Some(Self::CapitalG),
401 72 => Some(Self::CapitalH),
402 73 => Some(Self::CapitalI),
403 74 => Some(Self::CapitalJ),
404 75 => Some(Self::CapitalK),
405 76 => Some(Self::CapitalL),
406 77 => Some(Self::CapitalM),
407 78 => Some(Self::CapitalN),
408 79 => Some(Self::CapitalO),
409 80 => Some(Self::CapitalP),
410 81 => Some(Self::CapitalQ),
411 82 => Some(Self::CapitalR),
412 83 => Some(Self::CapitalS),
413 84 => Some(Self::CapitalT),
414 85 => Some(Self::CapitalU),
415 86 => Some(Self::CapitalV),
416 87 => Some(Self::CapitalW),
417 88 => Some(Self::CapitalX),
418 89 => Some(Self::CapitalY),
419 90 => Some(Self::CapitalZ),
420 91 => Some(Self::LeftSquareBracket),
421 92 => Some(Self::ReverseSolidus),
422 93 => Some(Self::RightSquareBracket),
423 94 => Some(Self::CircumflexAccent),
424 95 => Some(Self::LowLine),
425 96 => Some(Self::GraveAccent),
426 97 => Some(Self::SmallA),
427 98 => Some(Self::SmallB),
428 99 => Some(Self::SmallC),
429 100 => Some(Self::SmallD),
430 101 => Some(Self::SmallE),
431 102 => Some(Self::SmallF),
432 103 => Some(Self::SmallG),
433 104 => Some(Self::SmallH),
434 105 => Some(Self::SmallI),
435 106 => Some(Self::SmallJ),
436 107 => Some(Self::SmallK),
437 108 => Some(Self::SmallL),
438 109 => Some(Self::SmallM),
439 110 => Some(Self::SmallN),
440 111 => Some(Self::SmallO),
441 112 => Some(Self::SmallP),
442 113 => Some(Self::SmallQ),
443 114 => Some(Self::SmallR),
444 115 => Some(Self::SmallS),
445 116 => Some(Self::SmallT),
446 117 => Some(Self::SmallU),
447 118 => Some(Self::SmallV),
448 119 => Some(Self::SmallW),
449 120 => Some(Self::SmallX),
450 121 => Some(Self::SmallY),
451 122 => Some(Self::SmallZ),
452 123 => Some(Self::LeftCurlyBracket),
453 124 => Some(Self::VerticalLine),
454 125 => Some(Self::RightCurlyBracket),
455 126 => Some(Self::Tilde),
456 127 => Some(Self::Delete),
457 _ => None,
458 }
459 }
460
461 /// Creates an ASCII character from the byte `b`,
462 /// without checking whether it's valid.
463 /// # Safety
464 /// `b` must be in `0..=127`, or else this is UB.
465 #[must_use]
466 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
467 #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
468 pub const unsafe fn from_u8_unchecked(b: u8) -> Self {
469 // SAFETY: Our safety precondition is that `b` is in-range.
470 unsafe { transmute(b) }
471 }
472
473 /// When passed the *number* `0`, `1`, …, `9`, returns the *character*
474 /// `'0'`, `'1'`, …, `'9'` respectively.
475 ///
476 /// If `d >= 10`, returns `None`.
477 ///
478 /// # Features
479 /// Uses `unsafe_hint` for performance optimizations.
480 #[must_use]
481 pub const fn digit(d: u8) -> Option<Self> {
482 if d < 10 {
483 let sum = {
484 cfg_select! { all(feature = "unsafe_hint", not(feature = "safe_text")) => {
485 unsafe { b'0'.unchecked_add(d) } // SAFETY: we've checked d < 10
486 } _ => {
487 b'0' + d
488 }}
489 };
490 Self::from_u8(sum)
491 } else {
492 None
493 }
494 }
495
496 /// When passed the *number* `0`, `1`, …, `9`, returns the *character*
497 /// `'0'`, `'1'`, …, `'9'` respectively, without checking that it's in-range.
498 ///
499 /// # Safety
500 /// This is immediate UB if called with `d > 64`.
501 ///
502 /// If `d >= 10` and `d <= 64`, this is allowed to return any value or panic.
503 /// Notably, it should not be expected to return hex digits, or any other
504 /// reasonable extension of the decimal digits.
505 ///
506 /// (This lose safety condition is intended to simplify soundness proofs
507 /// when writing code using this method, since the implementation doesn't
508 /// need something really specific, not to make those other arguments do
509 /// something useful. It might be tightened before stabilization.)
510 #[must_use]
511 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
512 #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
513 pub const unsafe fn digit_unchecked(d: u8) -> Self {
514 debug_assert!(d < 10);
515
516 // SAFETY: `'0'` through `'9'` are U+00030 through U+0039,
517 // so because `d` must be 64 or less the addition can return at most
518 // 112 (0x70), which doesn't overflow and is within the ASCII range.
519 unsafe {
520 let byte = b'0'.unchecked_add(d);
521 Self::from_u8_unchecked(byte)
522 }
523 }
524
525 /// Gets this ASCII character as a byte.
526 #[must_use]
527 pub const fn as_u8(self) -> u8 {
528 self as u8
529 }
530
531 /// Gets this ASCII character as a `char` Unicode Scalar Value.
532 #[must_use]
533 pub const fn as_char(self) -> char {
534 self as u8 as char
535 }
536
537 /// Views this ASCII character as a one-code-unit UTF-8 `str`.
538 #[must_use]
539 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
540 #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
541 pub const fn as_str(&self) -> &str {
542 Self::slice_as_str(core::slice::from_ref(self))
543 }
544}
545
546impl CharAscii {
547 /// Views a slice of ASCII characters as a UTF-8 `str`.
548 #[must_use]
549 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
550 #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
551 pub const fn slice_as_str(slice: &[CharAscii]) -> &str {
552 let ascii_ptr: *const [CharAscii] = slice;
553 let str_ptr = ascii_ptr as *const str;
554 // SAFETY: Each ASCII codepoint in UTF-8 is encoded as one single-byte
555 // code unit having the same value as the ASCII byte.
556 unsafe { &*str_ptr }
557 }
558
559 /// Views a slice of ASCII characters as a slice of `u8` bytes.
560 #[must_use]
561 #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
562 #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
563 pub const fn slice_as_bytes(slice: &[CharAscii]) -> &[u8] {
564 CharAscii::slice_as_str(slice).as_bytes()
565 }
566}
567// impl [CharAscii] {
568// /// Views this slice of ASCII characters as a UTF-8 `str`.
569// #[must_use]
570// pub const fn as_str(&self) -> &str {
571// let ascii_ptr: *const Self = self;
572// let str_ptr = ascii_ptr as *const str;
573// // SAFETY: Each ASCII codepoint in UTF-8 is encoded as one single-byte
574// // code unit having the same value as the ASCII byte.
575// unsafe { &*str_ptr }
576// }
577//
578// /// Views this slice of ASCII characters as a slice of `u8` bytes.
579// #[must_use]
580// pub const fn as_bytes(&self) -> &[u8] {
581// self.as_str().as_bytes()
582// }
583// }
584
585impl fmt::Display for CharAscii {
586 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
587 fmt::Display::fmt(&self.as_char(), f)
588 }
589}