utf8_rune/
byte_type.rs

1use std::fmt::{Debug, Formatter};
2
3/// Represents UTF-8 byte type based on the most significant bits
4/// the given byte
5///
6/// Examples
7///
8/// ```
9/// use utf8_rune::ByteType;
10/// let f0 = ByteType::from(0xf0u8);
11/// assert_eq!(f0, ByteType::FourOrMore(0xF0));
12/// assert_eq!(f0.len(), 4);
13/// assert_eq!(f0.is_ascii(), false);
14/// assert_eq!(f0.is_continuation(), false);
15/// ```
16///
17/// ```
18/// use utf8_rune::ByteType;
19/// let e4 = ByteType::from(0xE4u8);
20/// assert_eq!(e4, ByteType::Three(0xE4));
21/// assert_eq!(e4.len(), 3);
22/// assert_eq!(e4.is_ascii(), false);
23/// assert_eq!(e4.is_continuation(), false);
24/// ```
25///
26/// ```
27/// use utf8_rune::ByteType;
28/// let c3 = ByteType::from(0xC3u8);
29/// assert_eq!(c3, ByteType::Two(0xC3));
30/// assert_eq!(c3.len(), 2);
31/// assert_eq!(c3.is_ascii(), false);
32/// assert_eq!(c3.is_continuation(), false);
33/// ```
34///
35/// ```
36/// use utf8_rune::ByteType;
37/// let g = ByteType::from(b'g');
38/// assert_eq!(g, ByteType::Ascii(0x67));
39/// assert_eq!(g.len(), 1);
40/// assert_eq!(g.is_ascii(), true);
41/// assert_eq!(g.is_continuation(), false);
42/// ```
43///
44/// ```
45/// use utf8_rune::ByteType;
46/// let g = ByteType::from(0x80u8);
47/// assert_eq!(g, ByteType::Continuation(0x80));
48/// assert_eq!(g.len(), 1);
49/// assert_eq!(g.is_ascii(), false);
50/// assert_eq!(g.is_continuation(), true);
51/// ```
52#[derive(Clone, Copy, PartialEq, Eq)]
53pub enum ByteType {
54    None,
55    Ascii(u8),
56    One(u8),
57    Two(u8),
58    Three(u8),
59    FourOrMore(u8),
60    Continuation(u8),
61}
62
63impl ByteType {
64    pub fn new(byte: u8) -> ByteType {
65        if byte < 127 {
66            ByteType::Ascii(byte)
67        } else {
68            match byte.leading_ones() {
69                0 => ByteType::One(byte),
70                1 => ByteType::Continuation(byte),
71                2 => ByteType::Two(byte),
72                3 => ByteType::Three(byte),
73                _ => ByteType::FourOrMore(byte),
74            }
75        }
76    }
77
78    pub fn name(&self) -> &'static str {
79        match self {
80            ByteType::None => "None",
81            ByteType::Ascii(_) => "Ascii",
82            ByteType::One(_) => "One",
83            ByteType::Two(_) => "Two",
84            ByteType::Three(_) => "Three",
85            ByteType::FourOrMore(_) => "FourOrMore",
86            ByteType::Continuation(_) => "Continuation",
87        }
88    }
89
90    pub fn byte(&self) -> u8 {
91        match self {
92            ByteType::None => u8::default(),
93            ByteType::Ascii(byte) => *byte,
94            ByteType::One(byte) => *byte,
95            ByteType::Two(byte) => *byte,
96            ByteType::Three(byte) => *byte,
97            ByteType::FourOrMore(byte) => *byte,
98            ByteType::Continuation(byte) => *byte,
99        }
100    }
101
102    pub fn len(&self) -> usize {
103        match self {
104            ByteType::Continuation(_) | ByteType::Ascii(_) => 1,
105            _ => (self.byte().leading_ones()) as usize,
106        }
107    }
108
109    pub fn is_ascii(&self) -> bool {
110        match self {
111            ByteType::Ascii(_) => true,
112            _ => false,
113        }
114    }
115
116    pub fn is_continuation(&self) -> bool {
117        match self {
118            ByteType::Continuation(_) => true,
119            _ => false,
120        }
121    }
122
123    pub fn has_rune_delta(&self) -> bool {
124        match self {
125            ByteType::None => false,
126            ByteType::Ascii(_) => false,
127            ByteType::Continuation(_) => false,
128            _ => true,
129        }
130    }
131
132    fn as_debug(&self, indent: Option<usize>) -> String {
133        let indent = crate::unwrap_indent(indent);
134        format!(
135            "{}::{}{{\n{}\n}})",
136            "ByteType",
137            self.name(),
138            [
139                format!(
140                    "byte: 0x{:02x},{}",
141                    self.byte(),
142                    if let Ok(c) = std::str::from_utf8(&[self.byte()]) {
143                        format!(" // \"{c}\"")
144                    } else {
145                        String::new()
146                    }
147                ),
148                format!("len: {},", self.len()),
149            ]
150            .iter()
151            .map(|c| {
152                let padding = " ".repeat(indent);
153                format!("{padding}{c}")
154            })
155            .collect::<Vec<String>>()
156            .join("\n")
157        )
158    }
159}
160impl From<u8> for ByteType {
161    fn from(byte: u8) -> ByteType {
162        ByteType::new(byte)
163    }
164}
165impl From<&u8> for ByteType {
166    fn from(byte: &u8) -> ByteType {
167        ByteType::new(*byte)
168    }
169}
170impl From<u16> for ByteType {
171    fn from(bytes: u16) -> ByteType {
172        ByteType::from(bytes.to_le_bytes()[0])
173    }
174}
175impl From<u32> for ByteType {
176    fn from(bytes: u32) -> ByteType {
177        ByteType::from(bytes.to_le_bytes()[0])
178    }
179}
180impl From<u64> for ByteType {
181    fn from(bytes: u64) -> ByteType {
182        ByteType::from(bytes.to_le_bytes()[0])
183    }
184}
185impl From<usize> for ByteType {
186    fn from(bytes: usize) -> ByteType {
187        ByteType::from(bytes.to_le_bytes()[0])
188    }
189}
190
191impl Debug for ByteType {
192    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
193        write!(f, "{}", self.as_debug(None))
194    }
195}
196
#[cfg(test)]
mod test_byte_type {
    use crate::ByteType;

    /// Checks `ByteType::from(byte).len()` against the expected length for
    /// every `(byte, expected_len)` pair, in order.
    fn assert_lens(cases: &[(u8, usize)]) {
        for &(byte, expected_len) in cases {
            assert_eq!(ByteType::from(byte).len(), expected_len);
        }
    }

    #[test]
    fn test_byte_type() {
        // "😀" => [0xf0, 0x9f, 0x98, 0x80]
        assert_lens(&[
            (0b11110000, 4),
            (0b10011111, 1),
            (0b10011000, 1),
            (0b10000000, 1),
        ]);

        // "☠️" => [0xe2, 0x98, 0xa0, 0xef, 0xb8, 0x8f]
        assert_lens(&[
            (0b11100010, 3),
            (0b10011000, 1),
            (0b10100000, 1),
            (0b11101111, 3),
            (0b10111000, 1),
            (0b10001111, 1),
        ]);
    }
}