Skip to main content

lexigram_core/
char_reader.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3use std::io::Read;
4
5// Note on UTF-8 encoding
6//
7//                         |     (hexa)    |                 UTF-8                  |                UTF-16
8//     Codepoint Value     |   min    max  | 1st byte  2nd byte  3rd byte  4th byte |     1st word           2nd word
9// ------------------------+---------------+----------------------------------------+--------------------------------------
10//       00000000_0xxxxxxx |   0000   007F | 0xxxxxxx                               | 00000000_0xxxxxxx
11//                         |               |                                        |
12//       00000yyy_yyxxxxxx |   0080   07FF | 110yyyyy  10xxxxxx                     | 00000yyy_yyxxxxxx
13//                         |               |                                        |
14//       zzzzyyyy_yyxxxxxx |   0800   FFFF | 1110zzzz  10yyyyyy  10xxxxxx           | zzzzyyyy_yyxxxxxx
15//                         |               |                                        |
16// uuuuu_zzzzyyyy_yyxxxxxx | 010000 10FFFF | 11110uuu  10uuzzzz  10yyyyyy  10xxxxxx | 110110ww_wwzzzzyy  110111yy_yyxxxxxx
17// (uuuuu: max 10000)      |               |                                        | (wwww = uuuuu-1)
18//
19// Valid codepoint values:
20//
21// 000000 - 00007f: 1 byte
22// 000080 - 0007ff: 2 bytes
23// 000800 - 00d7ff: 3 bytes
24// --------------------------
25// 00d800 - 00dfff: forbidden
26// --------------------------
27// 00e000 - 00ffff: 3 bytes
28// 010000 - 10ffff: 4 bytes
29
/// Lowest codepoint value.
pub const UTF8_MIN: u32 = 0x0000;
/// Last codepoint below the forbidden `d800..=dfff` gap.
pub const UTF8_LOW_MAX: u32 = 0xD7FF;
/// First value of the forbidden gap.
pub const UTF8_GAP_MIN: u32 = 0xD800;
/// Last value of the forbidden gap.
pub const UTF8_GAP_MAX: u32 = 0xDFFF;
/// First codepoint above the forbidden gap.
pub const UTF8_HIGH_MIN: u32 = 0xE000;
/// Highest codepoint value.
pub const UTF8_MAX: u32 = 0x10_FFFF;
36
/// Length in bytes of a UTF-8 sequence, indexed by the value of its first byte.
///
/// An entry of 0 marks a byte that cannot start a sequence: continuation bytes
/// (`80..=bf`) and the lead values `c0`, `c1` and `f5..=ff`.
const UTF8_LENGTH: [u8; 256] = {
    let mut table = [0_u8; 256];
    let mut lead = 0_usize;
    // `for` loops are not available in const evaluation, hence the manual counter.
    while lead < 256 {
        table[lead] = match lead {
            0x00..=0x7f => 1,
            0xc2..=0xdf => 2,
            0xe0..=0xef => 3,
            0xf0..=0xf4 => 4,
            _ => 0,
        };
        lead += 1;
    }
    table
};
55
56// Note: both utf8_len* functions below may be exchanged to avoid using a table
57// in production code.
58
59#[inline]
60/// Determines the number of bytes required to encode a UTF-8 character from its first byte.
61pub fn utf8_len(byte: u8) -> usize {
62    UTF8_LENGTH[byte as usize] as usize
63}
64
/// Returns the length in bytes of the UTF-8 sequence that starts with `byte`,
/// computed from range checks rather than a lookup table.
///
/// Returns 0 when `byte` cannot start a sequence (continuation bytes `80..=bf`
/// and the lead values `c0`, `c1` and `f5..=ff`).
#[inline]
pub fn utf8_len_no_table(byte: u8) -> usize {
    if byte <= 0x7f {
        1
    } else if (0xc2..=0xdf).contains(&byte) {
        2
    } else if (0xe0..=0xef).contains(&byte) {
        3
    } else if (0xf0..=0xf4).contains(&byte) {
        4
    } else {
        0
    }
}
76
/// State of a `CharReader`.
#[derive(Clone, Debug, Default, PartialEq)]
pub enum CharReaderStatus {
    /// Characters can still be read; this is the default state.
    #[default]
    Reading,
    /// Reading failed; the payload is a human-readable description of the failure.
    Error(String),
    /// No more characters will be produced.
    Closed,
}
84
/// Errors returned by fallible `CharReader` operations.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CharReaderError {
    /// The reader has no room left to take a character back.
    NoRoomToRewind,
}

impl std::fmt::Display for CharReaderError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            CharReaderError::NoRoomToRewind => {
                f.write_str("cannot rewind: no room left to push a character back")
            }
        }
    }
}

// Lets callers box the error or propagate it with `?` into `Box<dyn Error>`.
impl std::error::Error for CharReaderError {}
89
90pub struct CharReader<R> {
91    reader: R,
92    /// offset of next character, in bytes
93    offset: u64,
94    status: CharReaderStatus,
95    peek: Option<(Option<char>, u64, CharReaderStatus)>,
96}
97
98impl<R: Read> CharReader<R> {
99    pub fn new(source: R) -> Self {
100        CharReader {
101            reader: source,
102            offset: 0,
103            status: CharReaderStatus::Reading,
104            peek: None,
105        }
106    }
107
108    pub fn is_reading(&self) -> bool {
109        matches!(self.status, CharReaderStatus::Reading)
110    }
111
112    pub fn get_offset(&self) -> u64 {
113        self.offset
114    }
115
116    pub fn get_status(&self) -> &CharReaderStatus {
117        &self.status
118    }
119    
120    pub fn chars(&mut self) -> CharReaderIter<'_, R> {
121        CharReaderIter { creader: self }
122    }
123
124    pub fn get_char(&mut self) -> Option<char> {
125        if let Some(peek) = std::mem::take(&mut self.peek) {
126            self.offset = peek.1;
127            self.status = peek.2;
128            peek.0
129        } else {
130            let (c, len, status) = self.read_char();
131            self.offset += len as u64;
132            self.status = status;
133            c
134        }
135    }
136
137    pub fn rewind(&mut self, chr: char) -> Result<(), CharReaderError> {
138        if self.peek.is_none() {
139            let new_offset = self.offset - chr.len_utf8() as u64;
140            self.peek = Some((Some(chr), self.offset, std::mem::take(&mut self.status)));
141            self.offset = new_offset;
142            self.status = CharReaderStatus::Reading;
143            Ok(())
144        } else {
145            Err(CharReaderError::NoRoomToRewind)
146        }
147    }
148
149    pub fn peek(&mut self) -> Option<char> {
150        if let Some(peek) = &self.peek {
151            peek.0
152        } else {
153            let (c, len, status) = self.read_char();
154            self.peek = Some((c, self.offset + len as u64, status));
155            c
156        }
157    }
158    
159    fn read_char(&mut self) -> (Option<char>, usize, CharReaderStatus) {
160        if let CharReaderStatus::Reading = self.status {
161            let mut buffer = [0; 4];
162            let s = self.reader.read(&mut buffer[0..=0]);
163            match s {
164                Ok(0) => (None, 0, CharReaderStatus::Closed), // TODO: take 'live', expandable sources into account with an option
165                Ok(1) => {
166                    let len = utf8_len(buffer[0]);
167                    match len {
168                        0 => {
169                            return (None, 0, CharReaderStatus::Error(format!("UTF-8 encoding error at offset {}", self.offset)));
170                        }
171                        1 => {}
172                        2..=4 => {
173                            match self.reader.read(&mut buffer[1..len]) {
174                                Ok(n) => assert_eq!(n, len - 1),
175                                Err(e) => return (None, 0, CharReaderStatus::Error(e.to_string())),
176                            }
177                        }
178                        _ => panic!("Unexpected UTF-8 length {} at offset {}", len, self.offset),
179                    }
180                    let c = std::str::from_utf8(&buffer[..len]).unwrap()
181                        .chars()
182                        .next().unwrap();
183                    (Some(c), len, CharReaderStatus::Reading)
184                }
185                Ok(n) => panic!("Unexpected Read::read() result: Ok({}) at offset {}", n, self.offset),
186                Err(e) => {
187                    (None, 0, CharReaderStatus::Error(e.to_string()))
188                }
189            }
190        } else {
191            (None, 0, CharReaderStatus::Closed)
192        }
193    }
194}
195
196pub struct CharReaderIter<'a, R> {
197    creader: &'a mut CharReader<R>
198}
199
/// A character together with its position in the stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IterChar {
    /// next character from the stream
    pub char: char,
    /// offset of `char` in the stream, in bytes
    pub offset: u64,
}
206
207impl<'a, R: Read> Iterator for CharReaderIter<'a, R> {
208    type Item = IterChar;
209
210    fn next(&mut self) -> Option<Self::Item> {
211        let offset = self.creader.offset;
212        let c = self.creader.get_char();
213        c.map(|c| IterChar { char: c, offset })
214    }
215}
216
217// ---------------------------------------------------------------------------------------------
218// Macros
219
pub mod macros {
    /// Expands one of the identifiers `MIN`, `LOW_MAX`, `GAP_MIN`, `GAP_MAX`, `HIGH_MIN`
    /// or `MAX` to its codepoint value as a `u32`, and casts any other character or
    /// integer literal to `u32`.
    #[macro_export]
    macro_rules! utf8 {
        (MIN) => (0_u32);
        (LOW_MAX) => (0xd7ff_u32);
        (GAP_MIN) => (0xd800_u32);
        (GAP_MAX) => (0xdfff_u32);
        (HIGH_MIN) => (0xe000_u32);
        (MAX) => (0x10ffff_u32);
        ($a:literal) => ($a as u32);
    }
}
233
234// ---------------------------------------------------------------------------------------------
235// Tests
236// ---------------------------------------------------------------------------------------------
237
#[cfg(test)]
mod char_reader {
    use crate::CollectJoin;
    use crate::char_reader::escape_char;
    use super::*;

    /// Sample texts, each paired with the expected byte offset of every character.
    fn get_tests() -> Vec<(&'static str, Vec<u64>)> {
        vec![
            ("012顠abc©345𠃐ab",          vec![0, 1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 18, 19]),
            ("1234567890123456789顠abc",  vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24]),
            ("",                          vec![]),
            ("1",                         vec![0]),
            ("12",                        vec![0, 1]),
            ("©",                         vec![0]),
            ("𠃐𠃐",                      vec![0, 4]),
        ]
    }

    /// The table-based and table-free length functions must agree on every possible
    /// byte, including the non-ASCII lead and continuation bytes.
    #[test]
    fn utf8_length() {
        for i in 0..=u8::MAX {
            assert_eq!(utf8_len(i), utf8_len_no_table(i), "length of {i} (0x{i:x}) differs");
        }
    }

    /// Each character (and the final end-of-input marker) is read, rewound and read
    /// again; the second read must restore the state observed after the first one.
    #[test]
    fn read_rewind() {
        let text = "aαbβgΔs∑z";
        let mut reader = CharReader::new(text.as_bytes());
        assert!(reader.is_reading());
        let mut counter = 0;
        while reader.is_reading() {
            counter += 1;
            // '!' stands for the end of the input
            let c = reader.get_char().unwrap_or('!');
            if c == '!' {
                assert_eq!(reader.status, CharReaderStatus::Closed);
            }
            let reader_offset = reader.offset;
            let reader_status = reader.status.clone();
            // rewinding
            assert!(reader.peek.is_none());
            reader.rewind(c).expect("rewind should be fine");
            assert!(reader.peek.is_some());
            if let Some((pc, po, ps)) = &reader.peek {
                assert_eq!(pc, &Some(c), "failed rewinding '{}'", escape_char(c));
                assert_eq!(po, &reader_offset, "failed rewinding '{}'", escape_char(c));
                assert_eq!(ps, &reader_status, "failed rewinding '{}'", escape_char(c));
            }
            // forward again
            let c_again = reader.get_char();
            assert!(reader.peek.is_none(), "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(c_again, Some(c), "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(&reader.offset, &reader_offset, "failed reading after rewind for '{}'", escape_char(c));
            assert_eq!(&reader.status, &reader_status, "failed reading after rewind for '{}'", escape_char(c));
        }
        assert_eq!(counter, text.chars().count() + 1);
        assert_eq!(reader.status, CharReaderStatus::Closed);
        assert_eq!(reader.get_char(), None);
    }

    /// The iterator yields every character with its byte offset and leaves the reader closed.
    #[test]
    fn char_iterator() {
        let tests = get_tests();
        for (index, (text, expected_pos)) in tests.iter().enumerate() {
            let mut result = String::new();
            let mut result_pos = Vec::new();
            let mut reader = CharReader::new(text.as_bytes());
            for c in reader.chars() {
                result.push(c.char);
                result_pos.push(c.offset);
            }
            assert_eq!(result, *text, "test #{index}");
            assert_eq!(result_pos, *expected_pos, "test #{index}");
            assert_eq!(reader.get_status(), &CharReaderStatus::Closed);
        }
    }

    /// Peeking after every other character (optionally once before the first read) must
    /// not disturb the characters, offsets or final status seen through `get_char`.
    #[test]
    fn char_iterator_peek() {
        for early_peek in [false, true] {
            let tests = get_tests();
            for (index, (text, expected_pos)) in tests.iter().enumerate() {
                let mut result = String::new();
                let mut result_pos = Vec::new();
                let mut reader = CharReader::new(text.as_bytes());
                let mut result_peek = Vec::new();
                let mut i = 0;
                if early_peek {
                    result_peek.push(reader.peek());
                }
                // the offset is sampled before the character is consumed
                while let (offset, Some(c)) = (reader.get_offset(), reader.get_char()) {
                    if i & 1 == 1 {
                        result_peek.push(reader.peek());
                    }
                    result.push(c);
                    result_pos.push(offset);
                    i += 1;
                }
                let expected_peek = if early_peek {
                    text.chars().map(Some).chain([None])
                        .enumerate()
                        .filter_map(|(i, c)| if i & 1 == 0 { Some(c) } else { None })
                        .to_vec()
                } else {
                    text.chars().map(Some).chain([None])
                        .skip(1) // no initial peek
                        .enumerate()
                        .filter_map(|(i, c)| if i & 1 == 1 { Some(c) } else { None })
                        .to_vec()
                };
                let error = format!("test #{index} for early_peek={early_peek}");
                assert_eq!(result, *text, "{error}");
                assert_eq!(result_pos, *expected_pos, "{error}");
                assert_eq!(reader.get_status(), &CharReaderStatus::Closed, "{error}");
                assert_eq!(result_peek, expected_peek, "{error}");
            }
        }
    }

    /// Dropping the iterator halfway must leave the reader usable for the remaining characters.
    #[test]
    fn partial_iterations() {
        let tests = get_tests();
        for (index, (text, _)) in tests.into_iter().enumerate() {
            let mut reader = CharReader::new(text.as_bytes());
            let length = text.chars().count();
            let mut result = reader.chars().take(length / 2).map(|it| it.char).collect::<String>();
            while let Some(c) = reader.get_char() {
                result.push(c);
            }
            assert_eq!(result, text, "test #{index}");
        }
    }
}
372
/// Returns a printable representation of `c`.
///
/// The codepoints `0`, `d7ff`, `e000` and `10ffff` are rendered as `MIN`, `LOW_MAX`,
/// `HIGH_MIN` and `MAX` respectively; any other character goes through
/// [`char::escape_debug`].
pub fn escape_char(c: char) -> String {
    let symbolic = match u32::from(c) {
        0x0000 => Some("MIN"),
        0xd7ff => Some("LOW_MAX"),
        0xe000 => Some("HIGH_MIN"),
        0x10ffff => Some("MAX"),
        _ => None,
    };
    match symbolic {
        Some(name) => String::from(name),
        None => c.escape_debug().to_string(),
    }
}
384
385pub fn escape_string(s: &str) -> String {
386    s.chars().map(escape_char).collect::<String>()
387}