Skip to main content

qusql_parse/
byte_to_char.rs

1//! ByteToChar is a utility for mapping byte positions in a UTF-8 string to character positions,
2//! which is useful for error reporting in parsers.
3use alloc::vec::Vec;
4
/// Maps byte offsets in a UTF-8 buffer to character offsets, for error reporting.
///
/// To keep memory usage low, the cumulative character count is stored only at every
/// 128-byte block boundary. A lookup reads one stored count and then scans at most
/// 127 bytes, so no per-byte table is needed.
///
/// Counts are stored as `u32`, so the input is assumed to contain fewer than 2^32
/// characters.
#[derive(Debug, Clone)]
pub struct ByteToChar<'a> {
    /// The buffer whose byte offsets are being mapped.
    bytes: &'a [u8],
    /// `cnt[i]` is the number of characters that start before byte `i * 128`,
    /// for every `i` in `0..=bytes.len() / 128`.
    cnt: Vec<u32>,
}

impl<'a> ByteToChar<'a> {
    /// Number of bytes covered by each entry of `cnt`.
    const BLOCK_SIZE: usize = 128;

    /// Count the characters that start in `bytes`, i.e. every byte that is not a
    /// UTF-8 continuation byte (`0b10xx_xxxx`).
    fn count_chars(bytes: &[u8]) -> usize {
        bytes.iter().filter(|&&b| (b & 0xC0) != 0x80).count()
    }

    /// Create a new ByteToChar mapping for the given byte slice.
    ///
    /// The bytes are not validated; any byte that is not a UTF-8 continuation byte
    /// is counted as the start of a character.
    pub fn new(bytes: &'a [u8]) -> Self {
        // One entry per block boundary, including the boundary at (or just before)
        // the end of the input. Without that last entry, `map(bytes.len())` would
        // have no stored count when the length is an exact multiple of the block size.
        let mut cnt = Vec::with_capacity(bytes.len() / Self::BLOCK_SIZE + 1);
        let mut char_count: u32 = 0;
        cnt.push(char_count);
        for chunk in bytes.chunks_exact(Self::BLOCK_SIZE) {
            // A chunk holds at most BLOCK_SIZE characters, so the cast is lossless.
            char_count += Self::count_chars(chunk) as u32;
            cnt.push(char_count);
        }
        Self { bytes, cnt }
    }

    /// Map a byte position to a character position.
    ///
    /// Returns the number of characters that start before `byte_pos`. For a position
    /// on a character boundary this is the index of that character, and for
    /// `byte_pos == bytes.len()` it is the total character count.
    ///
    /// Positions past the end of the input are clamped to the end instead of panicking.
    pub fn map(&self, byte_pos: usize) -> usize {
        let byte_pos = byte_pos.min(self.bytes.len());
        let block_index = byte_pos / Self::BLOCK_SIZE;
        let block_start = block_index * Self::BLOCK_SIZE;
        // `block_index <= bytes.len() / BLOCK_SIZE`, which `new` guarantees is a valid
        // index into `cnt`.
        let chars_before_block = self.cnt[block_index] as usize;
        chars_before_block + Self::count_chars(&self.bytes[block_start..byte_pos])
    }

    /// Map a byte-offset span to a char-offset span.
    ///
    /// Both endpoints are converted independently with [`ByteToChar::map`].
    pub fn map_span(&self, span: core::ops::Range<usize>) -> core::ops::Range<usize> {
        self.map(span.start)..self.map(span.end)
    }
}
44
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks single-block and multi-block lookups against hand-computed offsets.
    #[test]
    fn test_byte_to_char() {
        // Byte layout: "Hello, " = 0..7, '世' = 7..10, '界' = 10..13, '!' = 13..14.
        let text = "Hello, 世界!";
        let mapper = ByteToChar::new(text.as_bytes());

        // (byte offset, expected char offset)
        let short_cases: &[(usize, usize)] = &[
            (0, 0),   // 'H'
            (7, 7),   // '世'
            (13, 9),  // '!'
            (14, 10), // end of input
        ];
        for &(byte_pos, char_pos) in short_cases {
            assert_eq!(mapper.map(byte_pos), char_pos);
        }
        assert_eq!(mapper.map_span(0..14), 0..10); // Full string

        // Input spanning more than one 128-byte block:
        // 200 one-byte 'a's followed by '世' (200..203) and '界' (203..206).
        let mut long_text = "a".repeat(200);
        long_text.push_str("世界");
        let long_mapper = ByteToChar::new(long_text.as_bytes());

        let long_cases: &[(usize, usize)] = &[
            (0, 0),     // first 'a'
            (199, 199), // last 'a'
            (200, 200), // '世'
            (206, 202), // end of input
        ];
        for &(byte_pos, char_pos) in long_cases {
            assert_eq!(long_mapper.map(byte_pos), char_pos);
        }
    }
}