1use std::io::{BufReader, Read};
4
/// Smallest Unicode code point.
pub const UTF8_MIN: u32 = 0x0;
/// Last code point below the surrogate range.
pub const UTF8_LOW_MAX: u32 = 0xD7FF;
/// First code point of the surrogate range (surrogates are not valid `char` values).
pub const UTF8_GAP_MIN: u32 = 0xD800;
/// Last code point of the surrogate range.
pub const UTF8_GAP_MAX: u32 = 0xDFFF;
/// First code point above the surrogate range.
pub const UTF8_HIGH_MIN: u32 = 0xE000;
/// Largest Unicode code point.
pub const UTF8_MAX: u32 = 0x10_FFFF;
36
/// Lookup table indexed by the first byte of a UTF-8 sequence.
///
/// Each entry is the total length in bytes (1 to 4) of the sequence that starts with
/// that byte, or 0 when the byte cannot start a sequence (continuation bytes
/// `0x80..=0xbf`, the lead bytes `0xc0`/`0xc1`, and `0xf5..=0xff`).
const UTF8_LENGTH: [u8; 256] = {
    let mut table = [0_u8; 256];
    let mut lead = 0_usize;
    // `for` loops are not available in constant evaluation, hence the manual counter.
    while lead < 256 {
        table[lead] = match lead {
            0x00..=0x7f => 1,
            0xc2..=0xdf => 2,
            0xe0..=0xef => 3,
            0xf0..=0xf4 => 4,
            _ => 0,
        };
        lead += 1;
    }
    table
};

/// Returns the length in bytes of the UTF-8 sequence introduced by `byte`, using the
/// `UTF8_LENGTH` lookup table.
///
/// The result is 0 when `byte` is not a valid first byte.
#[inline]
pub fn utf8_len(byte: u8) -> usize {
    usize::from(UTF8_LENGTH[usize::from(byte)])
}
64
/// Table-free variant of the lead-byte length lookup.
///
/// Returns the length in bytes (1 to 4) of the UTF-8 sequence introduced by `byte`,
/// or 0 when `byte` cannot start a sequence.
#[inline]
pub fn utf8_len_notable(byte: u8) -> usize {
    if byte <= 0x7f {
        return 1;
    }
    if (0xc2..=0xdf).contains(&byte) {
        return 2;
    }
    if (0xe0..=0xef).contains(&byte) {
        return 3;
    }
    if (0xf0..=0xf4).contains(&byte) {
        return 4;
    }
    0
}
76
/// State of a character stream.
#[derive(Clone, Debug, Default, PartialEq)]
pub enum CharReaderStatus {
    /// Input may still be available; this is the default state.
    #[default]
    Reading,
    /// Reading stopped because of a failure; the payload is a human-readable message.
    Error(String),
    /// No more characters will be produced.
    Closed,
}
84
/// Errors returned by `CharReader` operations.
#[derive(Debug)]
pub enum CharReaderError {
    /// A character cannot be pushed back because the single look-ahead slot is taken.
    NoRoomToRewind,
}
89
90pub struct CharReader<R> {
91 reader: BufReader<R>,
92 offset: u64,
94 status: CharReaderStatus,
95 peek: Option<(Option<char>, u64, CharReaderStatus)>,
96}
97
98impl<R: Read> CharReader<R> {
99 pub fn new(source: R) -> Self {
100 CharReader {
101 reader: BufReader::new(source),
102 offset: 0,
103 status: CharReaderStatus::Reading,
104 peek: None,
105 }
106 }
107
108 pub fn is_reading(&self) -> bool {
109 matches!(self.status, CharReaderStatus::Reading)
110 }
111
112 pub fn get_offset(&self) -> u64 {
113 self.offset
114 }
115
116 pub fn get_status(&self) -> &CharReaderStatus {
117 &self.status
118 }
119
120 pub fn chars(&mut self) -> CharReaderIter<'_, R> {
121 CharReaderIter { creader: self }
122 }
123
124 pub fn get_char(&mut self) -> Option<char> {
125 if let Some(peek) = std::mem::take(&mut self.peek) {
126 self.offset = peek.1;
127 self.status = peek.2;
128 peek.0
129 } else {
130 let (c, len, status) = self.read_char();
131 self.offset += len as u64;
132 self.status = status;
133 c
134 }
135 }
136
137 pub fn rewind(&mut self, chr: char) -> Result<(), CharReaderError> {
138 if self.peek.is_none() {
139 let new_offset = self.offset - chr.len_utf8() as u64;
140 self.peek = Some((Some(chr), self.offset, std::mem::take(&mut self.status)));
141 self.offset = new_offset;
142 self.status = CharReaderStatus::Reading;
143 Ok(())
144 } else {
145 Err(CharReaderError::NoRoomToRewind)
146 }
147 }
148
149 pub fn peek(&mut self) -> Option<char> {
150 if let Some(peek) = &self.peek {
151 peek.0
152 } else {
153 let (c, len, status) = self.read_char();
154 self.peek = Some((c, self.offset + len as u64, status));
155 c
156 }
157 }
158
159 fn read_char(&mut self) -> (Option<char>, usize, CharReaderStatus) {
160 if let CharReaderStatus::Reading = self.status {
161 let mut buffer = [0; 4];
162 let s = self.reader.read(&mut buffer[0..=0]);
163 match s {
164 Ok(0) => (None, 0, CharReaderStatus::Closed), Ok(1) => {
166 let len = utf8_len(buffer[0]);
167 match len {
168 0 => {
169 return (None, 0, CharReaderStatus::Error(format!("UTF-8 encoding error at offset {}", self.offset)));
170 }
171 1 => {}
172 2..=4 => {
173 match self.reader.read(&mut buffer[1..len]) {
174 Ok(n) => assert_eq!(n, len - 1),
175 Err(e) => return (None, 0, CharReaderStatus::Error(e.to_string())),
176 }
177 }
178 _ => panic!("Unexpected UTF-8 length {} at offset {}", len, self.offset),
179 }
180 let c = std::str::from_utf8(&buffer[..len]).unwrap()
181 .chars()
182 .next().unwrap();
183 (Some(c), len, CharReaderStatus::Reading)
184 }
185 Ok(n) => panic!("Unexpected Read::read() result: Ok({}) at offset {}", n, self.offset),
186 Err(e) => {
187 (None, 0, CharReaderStatus::Error(e.to_string()))
188 }
189 }
190 } else {
191 (None, 0, CharReaderStatus::Closed)
192 }
193 }
194}
195
196pub struct CharReaderIter<'a, R> {
197 creader: &'a mut CharReader<R>
198}
199
/// Item produced by `CharReaderIter`: a character and where it starts in the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IterChar {
    /// The decoded character.
    pub char: char,
    /// Reader offset, in bytes, recorded just before the character was consumed.
    pub offset: u64,
}
206
207impl<'a, R: Read> Iterator for CharReaderIter<'a, R> {
208 type Item = IterChar;
209
210 fn next(&mut self) -> Option<Self::Item> {
211 let offset = self.creader.offset;
212 let c = self.creader.get_char();
213 c.map(|c| IterChar { char: c, offset })
214 }
215}
216
pub mod macros {
    /// Expands to a `u32` code point.
    ///
    /// The keywords `MIN`, `LOW_MAX`, `GAP_MIN`, `GAP_MAX`, `HIGH_MIN` and `MAX` expand to
    /// the corresponding boundary value; any literal is converted with `as u32`.
    #[macro_export]
    macro_rules! utf8 {
        (MIN) => {
            0_u32
        };
        (LOW_MAX) => {
            0xD7FF_u32
        };
        (GAP_MIN) => {
            0xD800_u32
        };
        (GAP_MAX) => {
            0xDFFF_u32
        };
        (HIGH_MIN) => {
            0xE000_u32
        };
        (MAX) => {
            0x10FFFF_u32
        };
        ($a:literal) => {
            $a as u32
        };
    }
}
233
#[cfg(test)]
mod char_reader {
    use std::io::Cursor;
    use crate::CollectJoin;
    use crate::char_reader::escape_char;
    use super::*;

    /// Sample inputs paired with the expected byte offset of each character.
    fn get_tests() -> Vec<(&'static str, Vec<u64>)> {
        vec![
            ("012顠abc©345𠃐ab", vec![0, 1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 18, 19]),
            ("1234567890123456789顠abc", vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24]),
            ("", vec![]),
            ("1", vec![0]),
            ("12", vec![0, 1]),
            ("©", vec![0]),
            ("𠃐𠃐", vec![0, 4])
        ]
    }

    /// The table-based and the match-based length functions must agree on every byte,
    /// including continuation bytes, multi-byte lead bytes and invalid lead bytes.
    #[test]
    fn utf8_length() {
        for i in 0_u8..=255 {
            assert_eq!(utf8_len(i), utf8_len_notable(i), "length of {i} (0x{i:x}) differs");
        }
    }

    /// Reads every character, pushes it back with `rewind`, and checks that reading it
    /// again restores the exact offset and status.
    #[test]
    fn read_rewind() {
        let text = "aαbβgΔs∑z";
        let mut reader = CharReader::new(Cursor::new(text));
        assert!(reader.is_reading());
        let mut counter = 0;
        while reader.is_reading() {
            counter += 1;
            // '!' stands in for the end of the input so that it can be rewound too.
            let c = reader.get_char().unwrap_or('!');
            if c == '!' {
                assert_eq!(reader.status, CharReaderStatus::Closed);
            }
            let reader_offset = reader.offset;
            let reader_status = reader.status.clone();
            assert!(reader.peek.is_none());
            reader.rewind(c).expect("rewind should be fine");
            assert!(reader.peek.is_some());
            if let Some((pc, po, ps)) = &reader.peek {
                assert_eq!(pc, &Some(c), "failed rewinding '{}'", escape_char(c));
                assert_eq!(po, &reader_offset, "failed rewinding '{}'", escape_char(c));
                assert_eq!(ps, &reader_status, "failed rewinding '{}'", escape_char(c));
            }
            let c_again = reader.get_char();
            assert!(reader.peek.is_none(), "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(c_again, Some(c), "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(&reader.offset, &reader_offset, "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(&reader.status, &reader_status, "failed reading after rewind for '{}'", escape_char(c));
        }
        // One extra iteration for the end-of-input marker.
        assert_eq!(counter, text.chars().count() + 1);
        assert_eq!(reader.status, CharReaderStatus::Closed);
        assert_eq!(reader.get_char(), None);
    }

    /// The iterator must yield every character with its starting offset.
    #[test]
    fn char_iterator() {
        let tests = get_tests();
        for (index, (text, expected_pos)) in tests.iter().enumerate() {
            let mut result = String::new();
            let mut result_pos = Vec::new();
            let mut reader = CharReader::new(Cursor::new(text));
            for c in reader.chars() {
                result.push(c.char);
                result_pos.push(c.offset);
            }
            assert_eq!(result, *text, "test #{index}");
            assert_eq!(result_pos, *expected_pos, "test #{index}");
            assert_eq!(reader.get_status(), &CharReaderStatus::Closed);
        }
    }

    /// Interleaves `peek` with `get_char` (peeking after every other character, and
    /// optionally once before the first read) and checks that peeking never disturbs
    /// the characters, offsets or final status.
    #[test]
    fn char_iterator_peek() {
        for early_peek in [false, true] {
            let tests = get_tests();
            for (index, (text, expected_pos)) in tests.iter().enumerate() {
                let mut result = String::new();
                let mut result_pos = Vec::new();
                let mut reader = CharReader::new(Cursor::new(text));
                let mut result_peek = Vec::new();
                let mut i = 0;
                if early_peek {
                    result_peek.push(reader.peek());
                }
                while let (offset, Some(c)) = (reader.get_offset(), reader.get_char()) {
                    if i & 1 == 1 {
                        result_peek.push(reader.peek());
                    }
                    result.push(c);
                    result_pos.push(offset);
                    i += 1;
                }
                let expected_peek = if early_peek {
                    text.chars().map(Some).chain([None])
                        .enumerate()
                        .filter_map(|(i, c)| if i & 1 == 0 { Some(c) } else { None })
                        .to_vec()
                } else {
                    text.chars().map(Some).chain([None])
                        .skip(1).enumerate()
                        .filter_map(|(i, c)| if i & 1 == 1 { Some(c) } else { None })
                        .to_vec()
                };
                let error = format!("test #{index} for early_peek={early_peek}");
                assert_eq!(result, *text, "{error}");
                assert_eq!(result_pos, *expected_pos, "{error}");
                assert_eq!(reader.get_status(), &CharReaderStatus::Closed, "{error}");
                assert_eq!(result_peek, expected_peek, "{error}");
            }
        }
    }

    /// Dropping the iterator halfway and continuing with `get_char` must not lose or
    /// duplicate any character.
    #[test]
    fn partial_iterations() {
        let tests = get_tests();
        for (index, (text, _)) in tests.into_iter().enumerate() {
            let mut reader = CharReader::new(Cursor::new(text));
            let length = text.chars().count();
            let mut result = reader.chars().take(length/2).map(|it| it.char).collect::<String>();
            while let Some(c) = reader.get_char() {
                result.push(c);
            }
            assert_eq!(result, text, "test #{index}");
        }
    }
}
373
/// Renders `c` for diagnostics.
///
/// The four boundary code points U+0000, U+D7FF, U+E000 and U+10FFFF are shown by
/// symbolic name (`MIN`, `LOW_MAX`, `HIGH_MIN`, `MAX`); every other character uses its
/// `char::escape_debug` form.
pub fn escape_char(c: char) -> String {
    let symbolic = match u32::from(c) {
        0x0 => Some("MIN"),
        0xd7ff => Some("LOW_MAX"),
        0xe000 => Some("HIGH_MIN"),
        0x10ffff => Some("MAX"),
        _ => None,
    };
    match symbolic {
        Some(name) => String::from(name),
        None => c.escape_debug().to_string(),
    }
}
385
386pub fn escape_string(s: &str) -> String {
387 s.chars().map(|c| escape_char(c)).collect::<String>()
388}