1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
use std::io;
use std::io::Read;
/// Character substituted for any input that cannot be decoded to a valid
/// Unicode scalar value (U+FFFD, the Unicode replacement character).
static BADCHAR: char = ::std::char::REPLACEMENT_CHARACTER;
/// Unwraps a `Result<Option<T>, E>` expression inside a function that returns
/// a `Result`.
///
/// * `Err(e)` is propagated to the caller with `?` (the error is converted
///   through `From`, exactly as the deprecated `try!` did).
/// * `Ok(Some(v))` evaluates to `v`.
/// * `Ok(None)` evaluates `$fail`, which is typically a diverging expression
///   such as `return Ok(None)`.
///
/// `?` is used instead of `try!` because `try` is a reserved keyword from
/// edition 2018 onwards, so `try!(..)` no longer compiles there.
macro_rules! try_some_or {
    ($e:expr, $fail:expr) => {
        match $e? {
            Some(value) => value,
            None => $fail,
        }
    };
}
/// Adapter that owns a byte source implementing [`Read`] so that the bytes
/// can be decoded as UTF-8 characters.
pub struct UTF8Reader<T>
where
    T: Read,
{
    /// The wrapped byte source that raw bytes are read from.
    stream: T,
}
/// Converts a raw code point into a `char`.
///
/// Values that are not Unicode scalar values (surrogates, or anything above
/// U+10FFFF) are mapped to `BADCHAR`.
fn tochar(codepoint: u32) -> char
{
    ::std::char::from_u32(codepoint).unwrap_or(BADCHAR)
}
impl<T: Read> UTF8Reader<T>
{
pub fn new(reader: T) -> UTF8Reader<T>
{
UTF8Reader {
stream: reader,
}
}
fn getb(&mut self) -> io::Result<Option<u8>> {
let mut b = [0];
if try!(self.stream.read(&mut b)) == 0 {
Ok(None)
}
else {
Ok(Some(b[0]))
}
}
pub fn getc(&mut self) -> io::Result<Option<char>>
{
let ch1 = try_some_or!(self.getb(), return Ok(None)) as u32;
if ch1 & 0xC0 == 0x80 {
return Ok( Some(BADCHAR) )
}
if ch1 & 0x80 == 0x00
{
Ok( Some(tochar(ch1)) )
}
else if ch1 & 0xE0 == 0xC0
{
let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
if ch2 & 0xC0 != 0x80 {
return Ok( Some(BADCHAR) );
}
let ret = (ch1 & 0x1F << 6) | (ch2 & 0x3F << 0);
Ok( Some(tochar(ret)) )
}
else if ch1 & 0xF0 == 0xE0
{
let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
if ch2 & 0xC0 != 0x80 {
return Ok( Some(BADCHAR) );
}
let ch3 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
if ch3 & 0xC0 != 0x80 {
return Ok( Some(BADCHAR) );
}
let ret = (ch1 & 0x0F << 12) | (ch2 & 0x3F << 6) | (ch3 & 0x3F << 0);
Ok( Some(tochar(ret)) )
}
else if ch1 & 0xF8 == 0xF0
{
let ch2 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
if ch2 & 0xC0 != 0x80 {
return Ok( Some(BADCHAR) );
}
let ch3 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
if ch3 & 0xC0 != 0x80 {
return Ok( Some(BADCHAR) );
}
let ch4 = try_some_or!(self.getb(), return Ok(Some(BADCHAR))) as u32;
if ch4 & 0xC0 != 0x80 {
return Ok( Some(BADCHAR) );
}
let ret = (ch1 & 0x07 << 18) | (ch2 & 0x3F << 12) | (ch3 & 0x3F << 6) | (ch4 & 0x3F << 0);
Ok( Some(tochar(ret)) )
}
else
{
Ok( Some(BADCHAR) )
}
}
}
/// Iterates over the characters produced by `getc`.
///
/// Each item is `Ok(char)` for a decoded character or `Err` for an I/O error
/// reported by `getc`; iteration ends when `getc` returns `Ok(None)`.
impl<T: Read> Iterator for UTF8Reader<T>
{
    type Item = io::Result<char>;

    fn next(&mut self) -> Option<io::Result<char>>
    {
        // Result<Option<char>> -> Option<Result<char>>:
        // Ok(None) ends the iteration, everything else becomes an item.
        self.getc().transpose()
    }
}
// Placeholder test: the body is empty, so it passes without exercising any
// of the decoder code in this file.
#[test]
fn it_works() {
}