// UTF8Reader
// - by John Hodge (thePowersGang)
//
// Reads a stream of UTF-8 encoded bytes from a "Reader" (any `std::io::Read`)
// and decodes it into Unicode codepoints, one at a time.
//#![feature(associated_types)]
use std::io;
use std::io::Read;

/// U+FFFD, the Unicode replacement character, used as the stand-in value for
/// anything that cannot be turned into a valid `char`.
static BADCHAR: char = ::std::char::REPLACEMENT_CHARACTER;

/// Unwraps a `Result<Option<T>, E>` expression in one step.
///
/// * `Err(e)`      - returned from the enclosing function (via `?`).
/// * `Ok(Some(v))` - the macro evaluates to `v`.
/// * `Ok(None)`    - the macro evaluates `$fail` (typically a `return ...`).
///
/// Uses the `?` operator rather than the deprecated `try!` macro, which is
/// not usable from the 2018 edition onwards (`try` is a reserved keyword there).
macro_rules! try_some_or {
	($e:expr, $fail:expr) => ( match $e? { Some(v) => v, None => $fail } );
}

/// UTF-8 reader: wraps a byte source implementing `Read`.
pub struct UTF8Reader<T>
where
	T: Read,
{
	/// The wrapped byte source.
	stream: T,
}

/// Converts a raw codepoint number into a `char`.
///
/// Values that `std::char::from_u32` rejects are replaced by `BADCHAR`.
fn tochar(codepoint: u32) -> char
{
	::std::char::from_u32(codepoint).unwrap_or(BADCHAR)
}

impl<T: Read> UTF8Reader<T>
{
	pub fn new(reader: T) -> UTF8Reader<T>
	{
		UTF8Reader {
			stream: reader,
		}
	}
	
	fn getb(&mut self) -> io::Result<Option<u8>> {
		let mut b = [0];
		if try!(self.stream.read(&mut b)) == 0 {
			Ok(None)
		}
		else {
			Ok(Some(b[0]))
		}
	}
	
	/// Read a single codepoint from the stream.
	/// On an encoding error, it returns '\uFFFD' (the unicode replacement character)
	pub fn getc(&mut self) -> io::Result<Option<char>>
	{
		let ch1 = try_some_or!(self.getb(), return Ok(None)) as u32;
		if ch1 & 0xC0 == 0x80 {
			return Ok( Some(BADCHAR) )
		}
		if ch1 & 0x80 == 0x00
		{
			// Single-byte
			Ok( Some(tochar(ch1)) )
		}
		else if ch1 & 0xE0 == 0xC0
		{
			// Two-byte sequence
			let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
			if ch2 & 0xC0 != 0x80 {
				return Ok( Some(BADCHAR) );
			}
			
			let ret = (ch1 & 0x1F << 6) | (ch2 & 0x3F << 0);
			Ok( Some(tochar(ret)) )
		}
		else if ch1 & 0xF0 == 0xE0
		{
			// Three-byte sequence
			let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
			if ch2 & 0xC0 != 0x80 {
				return Ok( Some(BADCHAR) );
			}
			let ch3 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
			if ch3 & 0xC0 != 0x80 {
				return Ok( Some(BADCHAR) );
			}
			
			let ret = (ch1 & 0x0F << 12) | (ch2 & 0x3F << 6) | (ch3 & 0x3F << 0);
			Ok( Some(tochar(ret)) )
		}
		else if ch1 & 0xF8 == 0xF0
		{
			// Four-byte sequence
			let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
			if ch2 & 0xC0 != 0x80 {
				return Ok( Some(BADCHAR) );
			}
			let ch3 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
			if ch3 & 0xC0 != 0x80 {
				return Ok( Some(BADCHAR) );
			}
			let ch4 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
			if ch4 & 0xC0 != 0x80 {
				return Ok( Some(BADCHAR) );
			}
			
			let ret = (ch1 & 0x07 << 18) | (ch2 & 0x3F << 12) | (ch3 & 0x3F << 6) | (ch4 & 0x3F << 0);
			Ok( Some(tochar(ret)) )
		}
		else
		{
			// More than four bytes is invalid
			Ok( Some(BADCHAR) )
		}
	}
}

/// Implementation of the same interface as 'Chars' provides: each step yields
/// the next decoded character (or the I/O error that `getc` reported), and
/// `None` is returned at the end of the stream.
impl<T: Read> Iterator for UTF8Reader<T>
{
	type Item = io::Result<char>;
	fn next(&mut self) -> Option<io::Result<char>>
	{
		match self.getc()
		{
		// - Decoder failed: hand the error to the caller as an item
		Err(err) => Some( Err(err) ),
		// - Decoder succeeded: a character becomes `Some(Ok(c))`, end of stream stays `None`
		Ok(decoded) => decoded.map(Ok),
		}
	}
}

// Placeholder test with an empty body: it asserts nothing and passes
// whenever it is compiled and run by the test harness.
#[test]
fn it_works() {}

// vim: ft=rust