utf8_reader/
lib.rs

//! Read UTF-8 characters from any object that implements the `Read` trait.
2//!
3//! # Examples:
4//! ```rust
5//! use utf8_reader::Utf8Reader;
6//! use std::io::Cursor;
7//! use std::io::Write;
8//!
9//! let mut buf = Cursor::new(Vec::new());
10//! buf.write("复/d❤".as_bytes()).unwrap();
11//! buf.set_position(0);
12//!
13//! let mut reader = Utf8Reader::new(buf);
14//!
15//! assert_eq!(Some('复'.into()), reader.next());
16//! assert_eq!(Some('/'.into()), reader.next());
17//! assert_eq!(Some('d'.into()), reader.next());
18//! assert_eq!(Some('❤'.into()), reader.next());
19//! assert_eq!(None, reader.next());
20//! ```
21
22use std::convert::AsRef;
23use std::convert::From;
24use std::fmt;
25use std::io::Read;
26use std::iter::Iterator;
27use std::str::FromStr;
28
/// Generates a symmetric pair of `PartialEq` implementations between a
/// wrapper type exposing `as_str()` and a string-like type.
///
/// `impl_eq!(Wrapper, Text)` produces both `Wrapper == Text` and
/// `Text == Wrapper`; each comparison is delegated to the wrapper's string
/// view.
macro_rules! impl_eq {
    ($wrapper:ty, $text:ty) => {
        impl PartialEq<$text> for $wrapper {
            fn eq(&self, rhs: &$text) -> bool {
                self.as_str() == rhs
            }
        }

        impl PartialEq<$wrapper> for $text {
            fn eq(&self, rhs: &$wrapper) -> bool {
                rhs.as_str() == self
            }
        }
    };
}
44
/// A single UTF-8 encoded character held in a fixed four-byte buffer.
///
/// The encoded bytes are right-aligned inside the array and every unused
/// leading slot is zero, so `'a'` is stored as `[0, 0, 0, 0x61]`.
///
/// Note: only four bytes are available, so anything whose encoding is longer
/// cannot be represented by one `Utf8Char`; e.g. ❤️ is 6 bytes long.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Char([u8; 4]);
51
52impl Utf8Char {
53    /// Extracts a byte slice containing the UTF-8 bytes
54    pub fn as_slice(&self) -> &[u8] {
55        match self.0 {
56            [0, 0, 0, 0] | [0, 0, 0, _] => &self.0[3..],
57            [0, 0, _, _] => &self.0[2..],
58            [0, _, _, _] => &self.0[1..],
59            _ => &self.0[..],
60        }
61    }
62
63    /// Extracts a byte slice containing the UTF-8 bytes
64    pub fn as_str(&self) -> &str {
65        self.as_ref()
66    }
67
68    /// Return `true` if this char is `white_space`
69    ///
70    /// `white_space` includes ` `, `\t`, `\r`, `\n`
71    pub fn is_whitespace(&self) -> bool {
72        match self.0 {
73            [0, 0, 0, 9] | [0, 0, 0, 10] | [0, 0, 0, 13] | [0, 0, 0, 32] => true,
74            _ => false,
75        }
76    }
77
78    /// Check if the value is an ASCII decimal digit
79    ///
80    /// 0 to 9
81    pub fn is_ascii_digit(&self) -> bool {
82        match self.0 {
83            [0, 0, 0, v] if v >= b'0' && v <= b'9' => true,
84            _ => false,
85        }
86    }
87
88    /// Return `true` if this char is an `Alphabetic`
89    pub fn is_alphabetic(&self) -> bool {
90        match self.0 {
91            [0, 0, 0, v] if v >= b'A' && v <= b'Z' || v >= b'a' && v <= b'z' => true,
92            _ => false,
93        }
94    }
95
96    /// convert a UTF8Char to a digit
97    ///
98    /// diget as defined to be only the 0-9
99    ///
100    /// # Error
101    /// Returns [Nane] if the UTF8Char does not refer to a digit
102    pub fn to_digit(&self) -> Option<u32> {
103        match self.0 {
104            [0, 0, 0, v] if v >= b'0' && v <= b'9' => Some((v - b'0').into()),
105            _ => None,
106        }
107    }
108}
109
110impl From<u8> for Utf8Char {
111    fn from(value: u8) -> Self {
112        Self([0, 0, 0, value])
113    }
114}
115
116impl From<u32> for Utf8Char {
117    fn from(value: u32) -> Self {
118        Self(value.to_be_bytes())
119    }
120}
121
122impl From<char> for Utf8Char {
123    fn from(value: char) -> Self {
124        let mut b = [0; 4];
125        let st = value.encode_utf8(&mut b);
126        let st = st.as_bytes();
127        let mut b = [0; 4];
128        for (i, v) in ((4 - st.len())..4).enumerate() {
129            b[v] = st[i];
130        }
131
132        Self(b)
133    }
134}
135
136impl fmt::Display for Utf8Char {
137    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
138        write!(
139            f,
140            "{}",
141            String::from_str(self.as_str()).expect("cannot convert to a String")
142        )
143    }
144}
145
146impl AsRef<str> for Utf8Char {
147    fn as_ref(&self) -> &str {
148        use std::str;
149        str::from_utf8(self.as_slice())
150            .expect("cannot convert to a str, maybe is not a valid UTF-8 character")
151    }
152}
153
154impl PartialEq<&str> for Utf8Char {
155    fn eq(&self, other: &&str) -> bool {
156        self.as_str() == *other
157    }
158}
159
160impl PartialEq<Utf8Char> for &str {
161    fn eq(&self, other: &Utf8Char) -> bool {
162        other.as_str() == *self
163    }
164}
165
166impl_eq!(Utf8Char, str);
167impl_eq!(&Utf8Char, str);
168
/// Reads UTF-8 characters from any object that implements [`Read`].
///
/// The content is not validated: the reader does not check whether the bytes
/// form well-formed UTF-8.
///
/// Characters are produced through the [`Iterator`] implementation.
///
/// # Example:
///
/// ```rust
/// # use utf8_reader::Utf8Reader;
/// # use std::io::Cursor;
/// let mut reader = Utf8Reader::new(Cursor::new("复/d❤".as_bytes()));
///
/// assert_eq!(Some('复'.into()), reader.next());
/// assert_eq!(Some('/'.into()), reader.next());
/// assert_eq!(Some('d'.into()), reader.next());
/// assert_eq!(Some('❤'.into()), reader.next());
/// assert_eq!(None, reader.next());
/// ```
pub struct Utf8Reader<T>(T)
where
    T: Read;
194
195impl<T: Read> Utf8Reader<T> {
196    /// Create a new Utf8Reader
197    ///
198    /// # Argement:
199    /// inner: object that implemented Read
200    ///
201    /// # Example:
202    ///
203    pub fn new(inner: T) -> Self {
204        Self(inner)
205    }
206}
207
208impl<T: Read> Iterator for Utf8Reader<T> {
209    type Item = Utf8Char;
210
211    fn next(&mut self) -> Option<Self::Item> {
212        let mut b = [0u8; 1];
213        let size = self.0.read(&mut b).expect("read a byte faied");
214        if size == 0 {
215            return None;
216        }
217
218        let first_byte = b[0];
219        if first_byte < 128 {
220            return Some(first_byte.into());
221        }
222
223        let utf8_32 = match first_byte & 0b11100000 {
224            0b11110000 => exact_next(&mut self.0, 3, first_byte),
225            0b11100000 => exact_next(&mut self.0, 2, first_byte),
226            0b11000000 => exact_next(&mut self.0, 1, first_byte),
227            _ => first_byte as u32,
228        };
229
230        Some(utf8_32.into())
231    }
232}
233
/// Packs `first_byte` and up to `count` following bytes into one `u32`.
///
/// Bytes are appended in reading order, each new byte shifting the previous
/// ones up by eight bits. A read that returns no data is skipped, so input
/// that ends early yields a value built from the bytes that were available.
///
/// # Panics
/// Panics if the reader returns an I/O error.
fn exact_next(read: &mut impl Read, count: usize, first_byte: u8) -> u32 {
    (0..count).fold(u32::from(first_byte), |packed, _| {
        let mut byte = [0u8; 1];
        match read.read(&mut byte).expect("read a byte faied") {
            0 => packed,
            _ => (packed << 8) | u32::from(byte[0]),
        }
    })
}
247
#[cfg(test)]
mod test {
    use super::*;
    use std::io::Cursor;

    /// Builds a reader positioned at the start of an in-memory copy of `text`.
    ///
    /// The cursor is created directly from the bytes instead of being filled
    /// through `Write::write`, which is allowed to store only part of the
    /// buffer it is given.
    fn reader_over(text: &str) -> Utf8Reader<Cursor<Vec<u8>>> {
        Utf8Reader::new(Cursor::new(text.as_bytes().to_vec()))
    }

    #[test]
    fn test_whitespace() {
        let mut r = reader_over(" d\t\r\n");
        for &expected in &[true, false, true, true, true] {
            assert_eq!(expected, r.next().unwrap().is_whitespace());
        }
        assert!(r.next().is_none());
    }

    #[test]
    fn test_digit() {
        let mut r = reader_over("0123456789abi");
        // Ten digits followed by three letters.
        for _ in 0..10 {
            assert!(r.next().unwrap().is_ascii_digit());
        }
        for _ in 0..3 {
            assert!(!r.next().unwrap().is_ascii_digit());
        }
        assert!(r.next().is_none());
    }

    #[test]
    fn test_to_digit() {
        let mut r = reader_over("0123456789abi");
        for expected in 0..10u32 {
            assert_eq!(Some(expected), r.next().unwrap().to_digit());
        }
        for _ in 0..3 {
            assert_eq!(None, r.next().unwrap().to_digit());
        }
        assert_eq!(None, r.next());
    }

    #[test]
    fn is_alphabetic() {
        let mut r = reader_over("abcdABCDEZz0000");
        // Eleven letters followed by four digits.
        for _ in 0..11 {
            assert!(r.next().unwrap().is_alphabetic());
        }
        for _ in 0..4 {
            assert!(!r.next().unwrap().is_alphabetic());
        }
        assert!(r.next().is_none());
    }

    #[test]
    fn test_display() {
        let mut r = reader_over("复// d");
        assert_eq!("复".to_string(), r.next().unwrap().to_string());
        assert_eq!("/".to_string(), r.next().unwrap().to_string());
    }

    #[test]
    fn test_as_str() {
        let mut r = reader_over("复// d");
        for &expected in &["复", "/", "/", " ", "d"] {
            let utf8char = r.next().unwrap();
            let actual: &str = utf8char.as_ref();
            assert_eq!(expected, actual);
        }
        assert_eq!(None, r.next());
    }

    #[test]
    fn test_iterator() {
        let text = "复// d❤\n1+1=2 // é异";
        let mut r = reader_over(text);

        // Every character of the text must come back, in order, exactly as
        // `From<char>` would encode it.
        for ch in text.chars() {
            assert_eq!(Some(ch.into()), r.next());
        }
        assert_eq!(None, r.next());
    }

    #[test]
    fn wrong_character() {
        let mut r = reader_over("\u{D7FF}复");
        assert_eq!(Some('\u{D7FF}'.into()), r.next());
        assert_eq!(Some('复'.into()), r.next());
        assert_eq!(None, r.next());
    }

    #[test]
    fn equal_str() {
        let mut r = reader_over("0a/*-比");
        let v = r.next().unwrap();
        // Exercise the comparison impls in both directions and by reference.
        assert_eq!("0", v);
        assert_eq!("0", &v);
        assert_eq!("a", r.next().unwrap());
        assert_eq!("/", r.next().unwrap());
        assert_eq!("*", r.next().unwrap());
        assert_eq!("-", r.next().unwrap());
        assert_eq!(r.next().unwrap(), "比");
        assert_eq!(None, r.next());
    }
}