utf8reader/
lib.rs

1// UTF8Reader
2// - by John Hodge (thePowersGang)
3//
4// 
5// Reads a stream of UTF-8 encoded codepoints from a "Reader"
6//#![feature(associated_types)]
7use std::io;
8use std::io::Read;
9
/// Unicode replacement character (U+FFFD), returned in place of input that cannot be decoded.
const BADCHAR: char = '\u{FFFD}';
12
/// Unwraps a `Result<Option<V>, E>` expression down to `V`.
///
/// - `Err(e)` returns early from the enclosing function (via `?`, so the error
///   goes through the usual `From` conversion).
/// - `Ok(None)` evaluates `$fail` instead, which is typically a diverging
///   expression such as `return ...`.
/// - `Ok(Some(v))` evaluates to `v`.
macro_rules! try_some_or {
	($e:expr, $fail:expr) => {
		match $e? {
			Some(v) => v,
			None => $fail,
		}
	};
}
16
/// Decoder that pulls bytes from an underlying [`Read`] source and hands them
/// back as `char`s.
#[derive(Debug)]
pub struct UTF8Reader<T: Read>
{
	/// Byte source the decoder reads from.
	stream: T,
}
22
23fn tochar(codepoint: u32) -> char
24{
25	match ::std::char::from_u32(codepoint)
26	{
27	Some(c) => c,
28	None => BADCHAR,
29	}
30}
31
32impl<T: Read> UTF8Reader<T>
33{
34	pub fn new(reader: T) -> UTF8Reader<T>
35	{
36		UTF8Reader {
37			stream: reader,
38		}
39	}
40	
41	fn getb(&mut self) -> io::Result<Option<u8>> {
42		let mut b = [0];
43		if try!(self.stream.read(&mut b)) == 0 {
44			Ok(None)
45		}
46		else {
47			Ok(Some(b[0]))
48		}
49	}
50	
51	/// Read a single codepoint from the stream.
52	/// On an encoding error, it returns '\uFFFD' (the unicode replacement character)
53	pub fn getc(&mut self) -> io::Result<Option<char>>
54	{
55		let ch1 = try_some_or!(self.getb(), return Ok(None)) as u32;
56		if ch1 & 0xC0 == 0x80 {
57			return Ok( Some(BADCHAR) )
58		}
59		if ch1 & 0x80 == 0x00
60		{
61			// Single-byte
62			Ok( Some(tochar(ch1)) )
63		}
64		else if ch1 & 0xE0 == 0xC0
65		{
66			// Two-byte sequence
67			let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
68			if ch2 & 0xC0 != 0x80 {
69				return Ok( Some(BADCHAR) );
70			}
71			
72			let ret = (ch1 & 0x1F << 6) | (ch2 & 0x3F << 0);
73			Ok( Some(tochar(ret)) )
74		}
75		else if ch1 & 0xF0 == 0xE0
76		{
77			// Three-byte sequence
78			let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
79			if ch2 & 0xC0 != 0x80 {
80				return Ok( Some(BADCHAR) );
81			}
82			let ch3 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
83			if ch3 & 0xC0 != 0x80 {
84				return Ok( Some(BADCHAR) );
85			}
86			
87			let ret = (ch1 & 0x0F << 12) | (ch2 & 0x3F << 6) | (ch3 & 0x3F << 0);
88			Ok( Some(tochar(ret)) )
89		}
90		else if ch1 & 0xF8 == 0xF0
91		{
92			// Four-byte sequence
93			let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
94			if ch2 & 0xC0 != 0x80 {
95				return Ok( Some(BADCHAR) );
96			}
97			let ch3 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
98			if ch3 & 0xC0 != 0x80 {
99				return Ok( Some(BADCHAR) );
100			}
101			let ch4 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
102			if ch4 & 0xC0 != 0x80 {
103				return Ok( Some(BADCHAR) );
104			}
105			
106			let ret = (ch1 & 0x07 << 18) | (ch2 & 0x3F << 12) | (ch3 & 0x3F << 6) | (ch4 & 0x3F << 0);
107			Ok( Some(tochar(ret)) )
108		}
109		else
110		{
111			// More than four bytes is invalid
112			Ok( Some(BADCHAR) )
113		}
114	}
115}
116
117/// Implmentation of the same interface as 'Chars' provides, returns None at the end of the stream
118impl<T: Read> Iterator for UTF8Reader<T>
119{
120	type Item = io::Result<char>;
121	fn next(&mut self) -> Option<io::Result<char>>
122	{
123		// Get result from decoder
124		match self.getc()
125		{
126		// - All good, return a character
127		Ok(None) => None,
128		Ok(Some(c)) => Some( Ok(c) ),
129		Err(e) => Some( Err( e ) ),
130		}
131	}
132}
133
/// Smoke test: ASCII passes through unchanged, the end of the stream is
/// reported as `None`, and bytes that cannot start a character are replaced
/// with `BADCHAR`.
#[test]
fn it_works() {
	let mut rdr = UTF8Reader::new(&b"ok"[..]);
	assert_eq!(rdr.getc().unwrap(), Some('o'));
	assert_eq!(rdr.getc().unwrap(), Some('k'));
	assert_eq!(rdr.getc().unwrap(), None);

	// 0x80 is a continuation byte with no lead byte; 0xFF never starts a sequence.
	let decoded: Vec<char> = UTF8Reader::new(&[0x80u8, 0xFF, b'!'][..])
		.map(|c| c.unwrap())
		.collect();
	assert_eq!(decoded, vec![BADCHAR, BADCHAR, '!']);
}
137
138// vim: ft=rust