lexigram_core/
char_reader.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3use std::io::{BufReader, Read};
4
5// Note on UTF-8 encoding
6//
7//                         |     (hexa)    |                 UTF-8                  |                UTF-16
8//     Codepoint Value     |   min    max  | 1st byte  2nd byte  3rd byte  4th byte |     1st word           2nd word
9// ------------------------+---------------+----------------------------------------+--------------------------------------
10//       00000000_0xxxxxxx |   0000   007F | 0xxxxxxx                               | 00000000_0xxxxxxx
11//                         |               |                                        |
12//       00000yyy_yyxxxxxx |   0080   07FF | 110yyyyy  10xxxxxx                     | 00000yyy_yyxxxxxx
13//                         |               |                                        |
14//       zzzzyyyy_yyxxxxxx |   0800   FFFF | 1110zzzz  10yyyyyy  10xxxxxx           | zzzzyyyy_yyxxxxxx
15//                         |               |                                        |
16// uuuuu_zzzzyyyy_yyxxxxxx | 010000 10FFFF | 11110uuu  10uuzzzz  10yyyyyy  10xxxxxx | 110110ww_wwzzzzyy  110111yy_yyxxxxxx
17// (uuuuu: max 10000)      |               |                                        | (wwww = uuuuu-1)
18//
19// Valid codepoint values:
20//
21// 000000 - 00007f: 1 byte
22// 000080 - 0007ff: 2 bytes
23// 000800 - 00d7ff: 3 bytes
24// --------------------------
25// 00d800 - 00dfff: forbidden
26// --------------------------
27// 00e000 - 00ffff: 3 bytes
28// 010000 - 10ffff: 4 bytes
29
/// Lowest codepoint value.
pub const UTF8_MIN: u32 = 0x0000;
/// First codepoint of the forbidden range `d800..=dfff`.
pub const UTF8_GAP_MIN: u32 = 0xd800;
/// Last codepoint of the forbidden range `d800..=dfff`.
pub const UTF8_GAP_MAX: u32 = 0xdfff;
/// Last valid codepoint below the forbidden range.
pub const UTF8_LOW_MAX: u32 = UTF8_GAP_MIN - 1;
/// First valid codepoint above the forbidden range.
pub const UTF8_HIGH_MIN: u32 = UTF8_GAP_MAX + 1;
/// Highest codepoint value.
pub const UTF8_MAX: u32 = 0x0010_ffff;
36
/// Length in bytes of a UTF-8 sequence, indexed by the value of its first byte.
///
/// An entry of `0` marks a byte that cannot start a sequence: continuation bytes
/// (`80..=bf`), the lead bytes `c0` and `c1`, and everything above `f4`.
const UTF8_LENGTH: [u8; 256] = {
    let mut table = [0_u8; 256];
    let mut byte = 0_usize;
    // `for` is not usable in const evaluation, hence the manual loop.
    while byte < 256 {
        table[byte] = match byte {
            0x00..=0x7f => 1,
            0xc2..=0xdf => 2,
            0xe0..=0xef => 3,
            0xf0..=0xf4 => 4,
            _ => 0,
        };
        byte += 1;
    }
    table
};
55
56// Note: both utf8_len* functions below may be exchanged to avoid using a table
57// in production code.
58
59#[inline]
60/// Determines the number of bytes required to encode a UTF-8 character from its first byte.
61pub fn utf8_len(byte: u8) -> usize {
62    return UTF8_LENGTH[byte as usize] as usize;
63}
64
/// Table-free variant: returns the length in bytes of the UTF-8 sequence that
/// starts with `byte`, or `0` when `byte` cannot start a sequence.
#[inline]
pub fn utf8_len_notable(byte: u8) -> usize {
    if byte <= 0x7f {
        1
    } else if (0xc2..=0xdf).contains(&byte) {
        2
    } else if (0xe0..=0xef).contains(&byte) {
        3
    } else if (0xf0..=0xf4).contains(&byte) {
        4
    } else {
        // continuation bytes, c0, c1, and f5..=ff
        0
    }
}
76
/// State of a [`CharReader`].
#[derive(Clone, Debug, Default, PartialEq)]
pub enum CharReaderStatus {
    /// Characters can still be read; this is the default and initial state.
    #[default]
    Reading,
    /// Reading stopped because of an error; the payload is its description.
    Error(String),
    /// The end of the source was reached.
    Closed,
}
84
/// Errors returned by [`CharReader`] operations.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CharReaderError {
    /// `rewind` was called while a character was already pending, so there is
    /// no free slot to push another one back.
    NoRoomToRewind,
}

impl std::fmt::Display for CharReaderError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            CharReaderError::NoRoomToRewind => {
                f.write_str("no room to rewind: a character is already pending")
            }
        }
    }
}

// Lets callers use `?` into `Box<dyn Error>` and similar error containers.
impl std::error::Error for CharReaderError {}
89
90pub struct CharReader<R> {
91    reader: BufReader<R>,
92    /// offset of next character, in bytes
93    offset: u64,
94    status: CharReaderStatus,
95    peek: Option<(Option<char>, u64, CharReaderStatus)>,
96}
97
98impl<R: Read> CharReader<R> {
99    pub fn new(source: R) -> Self {
100        CharReader {
101            reader: BufReader::new(source),
102            offset: 0,
103            status: CharReaderStatus::Reading,
104            peek: None,
105        }
106    }
107
108    pub fn is_reading(&self) -> bool {
109        matches!(self.status, CharReaderStatus::Reading)
110    }
111
112    pub fn get_offset(&self) -> u64 {
113        self.offset
114    }
115
116    pub fn get_status(&self) -> &CharReaderStatus {
117        &self.status
118    }
119    
120    pub fn chars(&mut self) -> CharReaderIter<'_, R> {
121        CharReaderIter { creader: self }
122    }
123
124    pub fn get_char(&mut self) -> Option<char> {
125        if let Some(peek) = std::mem::take(&mut self.peek) {
126            self.offset = peek.1;
127            self.status = peek.2;
128            peek.0
129        } else {
130            let (c, len, status) = self.read_char();
131            self.offset += len as u64;
132            self.status = status;
133            c
134        }
135    }
136
137    pub fn rewind(&mut self, chr: char) -> Result<(), CharReaderError> {
138        if self.peek.is_none() {
139            let new_offset = self.offset - chr.len_utf8() as u64;
140            self.peek = Some((Some(chr), self.offset, std::mem::take(&mut self.status)));
141            self.offset = new_offset;
142            self.status = CharReaderStatus::Reading;
143            Ok(())
144        } else {
145            Err(CharReaderError::NoRoomToRewind)
146        }
147    }
148
149    pub fn peek(&mut self) -> Option<char> {
150        if let Some(peek) = &self.peek {
151            peek.0
152        } else {
153            let (c, len, status) = self.read_char();
154            self.peek = Some((c, self.offset + len as u64, status));
155            c
156        }
157    }
158    
159    fn read_char(&mut self) -> (Option<char>, usize, CharReaderStatus) {
160        if let CharReaderStatus::Reading = self.status {
161            let mut buffer = [0; 4];
162            let s = self.reader.read(&mut buffer[0..=0]);
163            match s {
164                Ok(0) => (None, 0, CharReaderStatus::Closed), // TODO: take 'live', expandable sources into account with an option
165                Ok(1) => {
166                    let len = utf8_len(buffer[0]);
167                    match len {
168                        0 => {
169                            return (None, 0, CharReaderStatus::Error(format!("UTF-8 encoding error at offset {}", self.offset)));
170                        }
171                        1 => {}
172                        2..=4 => {
173                            match self.reader.read(&mut buffer[1..len]) {
174                                Ok(n) => assert_eq!(n, len - 1),
175                                Err(e) => return (None, 0, CharReaderStatus::Error(e.to_string())),
176                            }
177                        }
178                        _ => panic!("Unexpected UTF-8 length {} at offset {}", len, self.offset),
179                    }
180                    let c = std::str::from_utf8(&buffer[..len]).unwrap()
181                        .chars()
182                        .next().unwrap();
183                    (Some(c), len, CharReaderStatus::Reading)
184                }
185                Ok(n) => panic!("Unexpected Read::read() result: Ok({}) at offset {}", n, self.offset),
186                Err(e) => {
187                    (None, 0, CharReaderStatus::Error(e.to_string()))
188                }
189            }
190        } else {
191            (None, 0, CharReaderStatus::Closed)
192        }
193    }
194}
195
196pub struct CharReaderIter<'a, R> {
197    creader: &'a mut CharReader<R>
198}
199
/// Item yielded when iterating over a character stream: a character together
/// with its position.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IterChar {
    /// next character from the stream
    pub char: char,
    /// offset of `char` in the stream, in bytes
    pub offset: u64,
}
206
207impl<'a, R: Read> Iterator for CharReaderIter<'a, R> {
208    type Item = IterChar;
209
210    fn next(&mut self) -> Option<Self::Item> {
211        let offset = self.creader.offset;
212        let c = self.creader.get_char();
213        c.map(|c| IterChar { char: c, offset })
214    }
215}
216
217// ---------------------------------------------------------------------------------------------
218// Macros
219
pub mod macros {
    /// Expands to a `u32` codepoint value.
    ///
    /// The identifiers `MIN`, `LOW_MAX`, `GAP_MIN`, `GAP_MAX`, `HIGH_MIN` and `MAX`
    /// expand to the corresponding boundary value; any other literal (character
    /// or integer) is cast to `u32`.
    #[macro_export]
    macro_rules! utf8 {
        // Named boundaries.
        (MIN) => {
            0_u32
        };
        (LOW_MAX) => {
            0xd7ff_u32
        };
        (GAP_MIN) => {
            0xd800_u32
        };
        (GAP_MAX) => {
            0xdfff_u32
        };
        (HIGH_MIN) => {
            0xe000_u32
        };
        (MAX) => {
            0x10ffff_u32
        };
        // Character or integer literal.
        ($a:literal) => {
            $a as u32
        };
    }
}
233
234// ---------------------------------------------------------------------------------------------
235// Tests
236// ---------------------------------------------------------------------------------------------
237
#[cfg(test)]
mod char_reader {
    use std::io::Cursor;
    use crate::CollectJoin;
    use crate::char_reader::escape_char;
    use super::*;

    /// Sample texts, each with the expected byte offset of every character.
    fn get_tests() -> Vec<(&'static str, Vec<u64>)> {
        vec![
            ("012顠abc©345𠃐ab",          vec![0, 1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 18, 19]),
            ("1234567890123456789顠abc",  vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24]),
            ("",                          vec![]),
            ("1",                         vec![0]),
            ("12",                        vec![0, 1]),
            ("©",                         vec![0]),
            ("𠃐𠃐",                      vec![0, 4])
        ]
    }

    /// The table-based and the table-free length functions must agree on every
    /// possible first byte, not only on the ASCII range.
    #[test]
    fn utf8_length() {
        for i in u8::MIN..=u8::MAX {
            assert_eq!(utf8_len(i), utf8_len_notable(i), "length of {i} (0x{i:x}) differs");
        }
    }

    /// Reads each character, rewinds it, and checks that reading it again
    /// restores the same offset and status.
    #[test]
    fn read_rewind() {
        let text = "aαbβgΔs∑z";
        let mut reader = CharReader::new(Cursor::new(text));
        assert!(reader.is_reading());
        let mut counter = 0;
        while reader.is_reading() {
            counter += 1;
            // '!' stands for "end of stream"; it isn't part of `text`
            let c = reader.get_char().unwrap_or('!');
            if c == '!' {
                assert_eq!(reader.status, CharReaderStatus::Closed);
            }
            let reader_offset = reader.offset;
            let reader_status = reader.status.clone();
            // rewinding
            assert!(reader.peek.is_none());
            reader.rewind(c).expect("rewind should be fine");
            assert!(reader.peek.is_some());
            if let Some((pc, po, ps)) = &reader.peek {
                assert_eq!(pc, &Some(c), "failed rewinding '{}'", escape_char(c));
                assert_eq!(po, &reader_offset, "failed rewinding '{}'", escape_char(c));
                assert_eq!(ps, &reader_status, "failed rewinding '{}'", escape_char(c));
            }
            // forward again
            let c_again = reader.get_char();
            assert!(reader.peek.is_none(), "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(c_again, Some(c), "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(&reader.offset, &reader_offset, "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(&reader.status, &reader_status, "failed reading after rewind for '{}'", escape_char(c));
        }
        assert_eq!(counter, text.chars().count() + 1);
        assert_eq!(reader.status, CharReaderStatus::Closed);
        assert_eq!(reader.get_char(), None);
    }

    /// Iterates over each sample and checks the characters and their offsets.
    #[test]
    fn char_iterator() {
        let tests = get_tests();
        for (index, (text, expected_pos)) in tests.iter().enumerate() {
            let mut result = String::new();
            let mut result_pos = Vec::new();
            let mut reader = CharReader::new(Cursor::new(text));
            for c in reader.chars() {
                result.push(c.char);
                result_pos.push(c.offset);
            }
            assert_eq!(result, *text, "test #{index}");
            assert_eq!(result_pos, *expected_pos, "test #{index}");
            assert_eq!(reader.get_status(), &CharReaderStatus::Closed);
        }
    }

    /// Interleaves `peek` with `get_char` (peeking after every other character,
    /// optionally once before the first read) and checks that peeking never
    /// disturbs the characters, offsets, or final status.
    #[test]
    fn char_iterator_peek() {
        for early_peek in [false, true] {
            let tests = get_tests();
            for (index, (text, expected_pos)) in tests.iter().enumerate() {
                let mut result = String::new();
                let mut result_pos = Vec::new();
                let mut reader = CharReader::new(Cursor::new(text));
                let mut result_peek = Vec::new();
                let mut i = 0;
                if early_peek {
                    result_peek.push(reader.peek());
                }
                while let (offset, Some(c)) = (reader.get_offset(), reader.get_char()) {
                    if i & 1 == 1 {
                        result_peek.push(reader.peek());
                    }
                    result.push(c);
                    result_pos.push(offset);
                    i += 1;
                }
                let expected_peek = if early_peek {
                    text.chars().map(Some).chain([None])
                        .enumerate()
                        .filter_map(|(i, c)| if i & 1 == 0 { Some(c) } else { None })
                        .to_vec()
                } else {
                    text.chars().map(Some).chain([None])
                        .skip(1)// no initial peek
                        .enumerate()
                        .filter_map(|(i, c)| if i & 1 == 1 { Some(c) } else { None })
                        .to_vec()
                };
                let error = format!("test #{index} for early_peek={early_peek}");
                assert_eq!(result, *text, "{error}");
                assert_eq!(result_pos, *expected_pos, "{error}");
                assert_eq!(reader.get_status(), &CharReaderStatus::Closed, "{error}");
                assert_eq!(result_peek, expected_peek, "{error}");
            }
        }
    }

    /// Consumes the first half through the iterator and the rest through
    /// `get_char`, to check that dropping the iterator loses nothing.
    #[test]
    fn partial_iterations() {
        let tests = get_tests();
        for (index, (text, _)) in tests.into_iter().enumerate() {
            let mut reader = CharReader::new(Cursor::new(text));
            let length = text.chars().count();
            let mut result = reader.chars().take(length/2).map(|it| it.char).collect::<String>();
            while let Some(c) = reader.get_char() {
                result.push(c);
            }
            assert_eq!(result, text, "test #{index}");
        }
    }
}
373
/// Returns a printable representation of `c`.
///
/// The four boundary codepoints `0`, `d7ff`, `e000` and `10ffff` are rendered
/// as `MIN`, `LOW_MAX`, `HIGH_MIN` and `MAX`; every other character goes
/// through `char::escape_debug`.
pub fn escape_char(c: char) -> String {
    let boundary_name = match u32::from(c) {
        0x0000 => Some("MIN"),
        0xd7ff => Some("LOW_MAX"),
        0xe000 => Some("HIGH_MIN"),
        0x10ffff => Some("MAX"),
        _ => None,
    };
    boundary_name.map_or_else(|| c.escape_debug().to_string(), String::from)
}
385
386pub fn escape_string(s: &str) -> String {
387    s.chars().map(|c| escape_char(c)).collect::<String>()
388}