Skip to main content

devela/text/ascii/
char.rs

// devela::text::ascii::char
//
//! Defines [`CharAscii`].
//
// Ported from:
// - https://doc.rust-lang.org/stable/core/ascii/enum.Char.html
// - WAIT: [ascii::Char](https://github.com/rust-lang/rust/issues/110998)
8
9#[cfg(feature = "unsafe_str")]
10use crate::transmute;
11use ::core::fmt;
12
13#[doc = crate::_tags!(text)]
14/// One of 128 Unicode characters (`U+0000` to `U+007F`), the ASCII subset.
15#[doc = crate::_doc_meta!{location("text")}]
16///
17/// Officially, this is the first [block] in Unicode, _Basic Latin_.
18/// For details, see the [*C0 Controls and Basic Latin*][chart] code chart.
19///
20/// This block was based on older 7-bit character code standards such as
21/// ANSI X3.4-1977, ISO 646-1973, and [NIST FIPS 1-2].
22///
23/// # When to use this
24/// The main advantage of this subset is that it's always valid UTF-8.  As such,
25/// the `&[CharAscii]` -> `&str` conversion function (as well as other related
26/// ones) are O(1): *no* runtime checks are needed.
27///
28/// If you're consuming strings, you should usually handle Unicode and thus
29/// accept `str`s, not limit yourself to `CharAscii`s.
30///
31/// However, certain formats are intentionally designed to produce ASCII-only
32/// output in order to be 8-bit-clean.  In those cases, it can be simpler and
33/// faster to generate `CharAscii`s instead of dealing with the variable width
34/// properties of general UTF-8 encoded strings, while still allowing the result
35/// to be used freely with other Rust things that deal in general `str`s.
36///
37/// For example, a UUID library might offer a way to produce the string
38/// representation of a UUID as an `[CharAscii; 36]` to avoid memory
39/// allocation yet still allow it to be used as UTF-8 via `as_str` without
40/// paying for validation (or needing `unsafe` code) the way it would if it
41/// were provided as a `[u8; 36]`.
42///
43/// # Layout
44/// This type is guaranteed to have a size and alignment of 1 byte.
45///
46/// # Names
47/// The variants on this type are [Unicode names][NamesList] of the characters
48/// in upper camel case, with a few tweaks:
49/// - For `<control>` characters, the primary alias name is used.
50/// - `LATIN` is dropped, as this block has no non-latin letters.
51/// - `LETTER` is dropped, as `CAPITAL`/`SMALL` suffices in this block.
52/// - `DIGIT`s use a single digit rather than writing out `ZERO`, `ONE`, etc.
53///
54/// [ASCII]: https://www.unicode.org/glossary/index.html#ASCII
55/// [block]: https://www.unicode.org/glossary/index.html#block
56/// [chart]: https://www.unicode.org/charts/PDF/U0000.pdf
57/// [NIST FIPS 1-2]: https://nvlpubs.nist.gov/nistpubs/Legacy/FIPS/fipspub1-2-1977.pdf
58/// [NamesList]: https://www.unicode.org/Public/15.0.0/ucd/NamesList.txt
59#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
60#[repr(u8)]
61pub enum CharAscii {
62    /// U+0000 (Default variant)
63    #[default]
64    Null = 0,
65    /// U+0001
66    StartOfHeading = 1,
67    /// U+0002
68    StartOfText = 2,
69    /// U+0003
70    EndOfText = 3,
71    /// U+0004
72    EndOfTransmission = 4,
73    /// U+0005
74    Enquiry = 5,
75    /// U+0006
76    Acknowledge = 6,
77    /// U+0007
78    Bell = 7,
79    /// U+0008
80    Backspace = 8,
81    /// U+0009
82    AsciiCharacterTabulation = 9,
83    /// U+000A
84    LineFeed = 10,
85    /// U+000B
86    LineTabulation = 11,
87    /// U+000C
88    FormFeed = 12,
89    /// U+000D
90    CarriageReturn = 13,
91    /// U+000E
92    ShiftOut = 14,
93    /// U+000F
94    ShiftIn = 15,
95    /// U+0010
96    DataLinkEscape = 16,
97    /// U+0011
98    DeviceControlOne = 17,
99    /// U+0012
100    DeviceControlTwo = 18,
101    /// U+0013
102    DeviceControlThree = 19,
103    /// U+0014
104    DeviceControlFour = 20,
105    /// U+0015
106    NegativeAcknowledge = 21,
107    /// U+0016
108    SynchronousIdle = 22,
109    /// U+0017
110    EndOfTransmissionBlock = 23,
111    /// U+0018
112    Cancel = 24,
113    /// U+0019
114    EndOfMedium = 25,
115    /// U+001A
116    Substitute = 26,
117    /// U+001B
118    Escape = 27,
119    /// U+001C
120    InformationSeparatorFour = 28,
121    /// U+001D
122    InformationSeparatorThree = 29,
123    /// U+001E
124    InformationSeparatorTwo = 30,
125    /// U+001F
126    InformationSeparatorOne = 31,
127    /// U+0020
128    Space = 32,
129    /// U+0021
130    ExclamationMark = 33,
131    /// U+0022
132    QuotationMark = 34,
133    /// U+0023
134    NumberSign = 35,
135    /// U+0024
136    DollarSign = 36,
137    /// U+0025
138    PercentSign = 37,
139    /// U+0026
140    Ampersand = 38,
141    /// U+0027
142    Apostrophe = 39,
143    /// U+0028
144    LeftParenthesis = 40,
145    /// U+0029
146    RightParenthesis = 41,
147    /// U+002A
148    Asterisk = 42,
149    /// U+002B
150    PlusSign = 43,
151    /// U+002C
152    Comma = 44,
153    /// U+002D
154    HyphenMinus = 45,
155    /// U+002E
156    FullStop = 46,
157    /// U+002F
158    Solidus = 47,
159    /// U+0030
160    Digit0 = 48,
161    /// U+0031
162    Digit1 = 49,
163    /// U+0032
164    Digit2 = 50,
165    /// U+0033
166    Digit3 = 51,
167    /// U+0034
168    Digit4 = 52,
169    /// U+0035
170    Digit5 = 53,
171    /// U+0036
172    Digit6 = 54,
173    /// U+0037
174    Digit7 = 55,
175    /// U+0038
176    Digit8 = 56,
177    /// U+0039
178    Digit9 = 57,
179    /// U+003A
180    Colon = 58,
181    /// U+003B
182    Semicolon = 59,
183    /// U+003C
184    LessThanSign = 60,
185    /// U+003D
186    EqualsSign = 61,
187    /// U+003E
188    GreaterThanSign = 62,
189    /// U+003F
190    QuestionMark = 63,
191    /// U+0040
192    CommercialAt = 64,
193    /// U+0041
194    CapitalA = 65,
195    /// U+0042
196    CapitalB = 66,
197    /// U+0043
198    CapitalC = 67,
199    /// U+0044
200    CapitalD = 68,
201    /// U+0045
202    CapitalE = 69,
203    /// U+0046
204    CapitalF = 70,
205    /// U+0047
206    CapitalG = 71,
207    /// U+0048
208    CapitalH = 72,
209    /// U+0049
210    CapitalI = 73,
211    /// U+004A
212    CapitalJ = 74,
213    /// U+004B
214    CapitalK = 75,
215    /// U+004C
216    CapitalL = 76,
217    /// U+004D
218    CapitalM = 77,
219    /// U+004E
220    CapitalN = 78,
221    /// U+004F
222    CapitalO = 79,
223    /// U+0050
224    CapitalP = 80,
225    /// U+0051
226    CapitalQ = 81,
227    /// U+0052
228    CapitalR = 82,
229    /// U+0053
230    CapitalS = 83,
231    /// U+0054
232    CapitalT = 84,
233    /// U+0055
234    CapitalU = 85,
235    /// U+0056
236    CapitalV = 86,
237    /// U+0057
238    CapitalW = 87,
239    /// U+0058
240    CapitalX = 88,
241    /// U+0059
242    CapitalY = 89,
243    /// U+005A
244    CapitalZ = 90,
245    /// U+005B
246    LeftSquareBracket = 91,
247    /// U+005C
248    ReverseSolidus = 92,
249    /// U+005D
250    RightSquareBracket = 93,
251    /// U+005E
252    CircumflexAccent = 94,
253    /// U+005F
254    LowLine = 95,
255    /// U+0060
256    GraveAccent = 96,
257    /// U+0061
258    SmallA = 97,
259    /// U+0062
260    SmallB = 98,
261    /// U+0063
262    SmallC = 99,
263    /// U+0064
264    SmallD = 100,
265    /// U+0065
266    SmallE = 101,
267    /// U+0066
268    SmallF = 102,
269    /// U+0067
270    SmallG = 103,
271    /// U+0068
272    SmallH = 104,
273    /// U+0069
274    SmallI = 105,
275    /// U+006A
276    SmallJ = 106,
277    /// U+006B
278    SmallK = 107,
279    /// U+006C
280    SmallL = 108,
281    /// U+006D
282    SmallM = 109,
283    /// U+006E
284    SmallN = 110,
285    /// U+006F
286    SmallO = 111,
287    /// U+0070
288    SmallP = 112,
289    /// U+0071
290    SmallQ = 113,
291    /// U+0072
292    SmallR = 114,
293    /// U+0073
294    SmallS = 115,
295    /// U+0074
296    SmallT = 116,
297    /// U+0075
298    SmallU = 117,
299    /// U+0076
300    SmallV = 118,
301    /// U+0077
302    SmallW = 119,
303    /// U+0078
304    SmallX = 120,
305    /// U+0079
306    SmallY = 121,
307    /// U+007A
308    SmallZ = 122,
309    /// U+007B
310    LeftCurlyBracket = 123,
311    /// U+007C
312    VerticalLine = 124,
313    /// U+007D
314    RightCurlyBracket = 125,
315    /// U+007E
316    Tilde = 126,
317    /// U+007F
318    Delete = 127,
319}
320
321crate::_impl_init![Self::Null => CharAscii];
322
323impl CharAscii {
324    /// Creates an ascii character from the byte `b`,
325    /// or returns `None` if it's too large.
326    #[must_use]
327    pub const fn from_u8(b: u8) -> Option<Self> {
328        match b {
329            0 => Some(Self::Null),
330            1 => Some(Self::StartOfHeading),
331            2 => Some(Self::StartOfText),
332            3 => Some(Self::EndOfText),
333            4 => Some(Self::EndOfTransmission),
334            5 => Some(Self::Enquiry),
335            6 => Some(Self::Acknowledge),
336            7 => Some(Self::Bell),
337            8 => Some(Self::Backspace),
338            9 => Some(Self::AsciiCharacterTabulation),
339            10 => Some(Self::LineFeed),
340            11 => Some(Self::LineTabulation),
341            12 => Some(Self::FormFeed),
342            13 => Some(Self::CarriageReturn),
343            14 => Some(Self::ShiftOut),
344            15 => Some(Self::ShiftIn),
345            16 => Some(Self::DataLinkEscape),
346            17 => Some(Self::DeviceControlOne),
347            18 => Some(Self::DeviceControlTwo),
348            19 => Some(Self::DeviceControlThree),
349            20 => Some(Self::DeviceControlFour),
350            21 => Some(Self::NegativeAcknowledge),
351            22 => Some(Self::SynchronousIdle),
352            23 => Some(Self::EndOfTransmissionBlock),
353            24 => Some(Self::Cancel),
354            25 => Some(Self::EndOfMedium),
355            26 => Some(Self::Substitute),
356            27 => Some(Self::Escape),
357            28 => Some(Self::InformationSeparatorFour),
358            29 => Some(Self::InformationSeparatorThree),
359            30 => Some(Self::InformationSeparatorTwo),
360            31 => Some(Self::InformationSeparatorOne),
361            32 => Some(Self::Space),
362            33 => Some(Self::ExclamationMark),
363            34 => Some(Self::QuotationMark),
364            35 => Some(Self::NumberSign),
365            36 => Some(Self::DollarSign),
366            37 => Some(Self::PercentSign),
367            38 => Some(Self::Ampersand),
368            39 => Some(Self::Apostrophe),
369            40 => Some(Self::LeftParenthesis),
370            41 => Some(Self::RightParenthesis),
371            42 => Some(Self::Asterisk),
372            43 => Some(Self::PlusSign),
373            44 => Some(Self::Comma),
374            45 => Some(Self::HyphenMinus),
375            46 => Some(Self::FullStop),
376            47 => Some(Self::Solidus),
377            48 => Some(Self::Digit0),
378            49 => Some(Self::Digit1),
379            50 => Some(Self::Digit2),
380            51 => Some(Self::Digit3),
381            52 => Some(Self::Digit4),
382            53 => Some(Self::Digit5),
383            54 => Some(Self::Digit6),
384            55 => Some(Self::Digit7),
385            56 => Some(Self::Digit8),
386            57 => Some(Self::Digit9),
387            58 => Some(Self::Colon),
388            59 => Some(Self::Semicolon),
389            60 => Some(Self::LessThanSign),
390            61 => Some(Self::EqualsSign),
391            62 => Some(Self::GreaterThanSign),
392            63 => Some(Self::QuestionMark),
393            64 => Some(Self::CommercialAt),
394            65 => Some(Self::CapitalA),
395            66 => Some(Self::CapitalB),
396            67 => Some(Self::CapitalC),
397            68 => Some(Self::CapitalD),
398            69 => Some(Self::CapitalE),
399            70 => Some(Self::CapitalF),
400            71 => Some(Self::CapitalG),
401            72 => Some(Self::CapitalH),
402            73 => Some(Self::CapitalI),
403            74 => Some(Self::CapitalJ),
404            75 => Some(Self::CapitalK),
405            76 => Some(Self::CapitalL),
406            77 => Some(Self::CapitalM),
407            78 => Some(Self::CapitalN),
408            79 => Some(Self::CapitalO),
409            80 => Some(Self::CapitalP),
410            81 => Some(Self::CapitalQ),
411            82 => Some(Self::CapitalR),
412            83 => Some(Self::CapitalS),
413            84 => Some(Self::CapitalT),
414            85 => Some(Self::CapitalU),
415            86 => Some(Self::CapitalV),
416            87 => Some(Self::CapitalW),
417            88 => Some(Self::CapitalX),
418            89 => Some(Self::CapitalY),
419            90 => Some(Self::CapitalZ),
420            91 => Some(Self::LeftSquareBracket),
421            92 => Some(Self::ReverseSolidus),
422            93 => Some(Self::RightSquareBracket),
423            94 => Some(Self::CircumflexAccent),
424            95 => Some(Self::LowLine),
425            96 => Some(Self::GraveAccent),
426            97 => Some(Self::SmallA),
427            98 => Some(Self::SmallB),
428            99 => Some(Self::SmallC),
429            100 => Some(Self::SmallD),
430            101 => Some(Self::SmallE),
431            102 => Some(Self::SmallF),
432            103 => Some(Self::SmallG),
433            104 => Some(Self::SmallH),
434            105 => Some(Self::SmallI),
435            106 => Some(Self::SmallJ),
436            107 => Some(Self::SmallK),
437            108 => Some(Self::SmallL),
438            109 => Some(Self::SmallM),
439            110 => Some(Self::SmallN),
440            111 => Some(Self::SmallO),
441            112 => Some(Self::SmallP),
442            113 => Some(Self::SmallQ),
443            114 => Some(Self::SmallR),
444            115 => Some(Self::SmallS),
445            116 => Some(Self::SmallT),
446            117 => Some(Self::SmallU),
447            118 => Some(Self::SmallV),
448            119 => Some(Self::SmallW),
449            120 => Some(Self::SmallX),
450            121 => Some(Self::SmallY),
451            122 => Some(Self::SmallZ),
452            123 => Some(Self::LeftCurlyBracket),
453            124 => Some(Self::VerticalLine),
454            125 => Some(Self::RightCurlyBracket),
455            126 => Some(Self::Tilde),
456            127 => Some(Self::Delete),
457            _ => None,
458        }
459    }
460
461    /// Creates an ASCII character from the byte `b`,
462    /// without checking whether it's valid.
463    /// # Safety
464    /// `b` must be in `0..=127`, or else this is UB.
465    #[must_use]
466    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
467    #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
468    pub const unsafe fn from_u8_unchecked(b: u8) -> Self {
469        // SAFETY: Our safety precondition is that `b` is in-range.
470        unsafe { transmute(b) }
471    }
472
473    /// When passed the *number* `0`, `1`, …, `9`, returns the *character*
474    /// `'0'`, `'1'`, …, `'9'` respectively.
475    ///
476    /// If `d >= 10`, returns `None`.
477    ///
478    /// # Features
479    /// Uses `unsafe_hint` for performance optimizations.
480    #[must_use]
481    pub const fn digit(d: u8) -> Option<Self> {
482        if d < 10 {
483            let sum = {
484                cfg_select! { all(feature = "unsafe_hint", not(feature = "safe_text")) => {
485                    unsafe { b'0'.unchecked_add(d) } // SAFETY: we've checked d < 10
486                } _ => {
487                    b'0' + d
488                }}
489            };
490            Self::from_u8(sum)
491        } else {
492            None
493        }
494    }
495
496    /// When passed the *number* `0`, `1`, …, `9`, returns the *character*
497    /// `'0'`, `'1'`, …, `'9'` respectively, without checking that it's in-range.
498    ///
499    /// # Safety
500    /// This is immediate UB if called with `d > 64`.
501    ///
502    /// If `d >= 10` and `d <= 64`, this is allowed to return any value or panic.
503    /// Notably, it should not be expected to return hex digits, or any other
504    /// reasonable extension of the decimal digits.
505    ///
506    /// (This lose safety condition is intended to simplify soundness proofs
507    /// when writing code using this method, since the implementation doesn't
508    /// need something really specific, not to make those other arguments do
509    /// something useful. It might be tightened before stabilization.)
510    #[must_use]
511    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
512    #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
513    pub const unsafe fn digit_unchecked(d: u8) -> Self {
514        debug_assert!(d < 10);
515
516        // SAFETY: `'0'` through `'9'` are U+00030 through U+0039,
517        // so because `d` must be 64 or less the addition can return at most
518        // 112 (0x70), which doesn't overflow and is within the ASCII range.
519        unsafe {
520            let byte = b'0'.unchecked_add(d);
521            Self::from_u8_unchecked(byte)
522        }
523    }
524
525    /// Gets this ASCII character as a byte.
526    #[must_use]
527    pub const fn as_u8(self) -> u8 {
528        self as u8
529    }
530
531    /// Gets this ASCII character as a `char` Unicode Scalar Value.
532    #[must_use]
533    pub const fn as_char(self) -> char {
534        self as u8 as char
535    }
536
537    /// Views this ASCII character as a one-code-unit UTF-8 `str`.
538    #[must_use]
539    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
540    #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
541    pub const fn as_str(&self) -> &str {
542        Self::slice_as_str(core::slice::from_ref(self))
543    }
544}
545
546impl CharAscii {
547    /// Views a slice of ASCII characters as a UTF-8 `str`.
548    #[must_use]
549    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
550    #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
551    pub const fn slice_as_str(slice: &[CharAscii]) -> &str {
552        let ascii_ptr: *const [CharAscii] = slice;
553        let str_ptr = ascii_ptr as *const str;
554        // SAFETY: Each ASCII codepoint in UTF-8 is encoded as one single-byte
555        // code unit having the same value as the ASCII byte.
556        unsafe { &*str_ptr }
557    }
558
559    /// Views a slice of ASCII characters as a slice of `u8` bytes.
560    #[must_use]
561    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
562    #[cfg_attr(nightly_doc, doc(cfg(feature = "unsafe_str")))]
563    pub const fn slice_as_bytes(slice: &[CharAscii]) -> &[u8] {
564        CharAscii::slice_as_str(slice).as_bytes()
565    }
566}
// NOTE: kept for reference only. Inherent impls on the primitive slice type
// are rejected outside of `core`, which is why the same functionality is
// exposed above as `CharAscii::slice_as_str` and `CharAscii::slice_as_bytes`.
//
// impl [CharAscii] {
//     /// Views this slice of ASCII characters as a UTF-8 `str`.
//     #[must_use]
//     pub const fn as_str(&self) -> &str {
//         let ascii_ptr: *const Self = self;
//         let str_ptr = ascii_ptr as *const str;
//         // SAFETY: Each ASCII codepoint in UTF-8 is encoded as one single-byte
//         // code unit having the same value as the ASCII byte.
//         unsafe { &*str_ptr }
//     }
//
//     /// Views this slice of ASCII characters as a slice of `u8` bytes.
//     #[must_use]
//     pub const fn as_bytes(&self) -> &[u8] {
//         self.as_str().as_bytes()
//     }
// }
584
585impl fmt::Display for CharAscii {
586    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
587        fmt::Display::fmt(&self.as_char(), f)
588    }
589}