Skip to main content

qusql_parse/
byte_to_char.rs

1//! ByteToChar is a utility for mapping byte positions in a UTF-8 string to character positions,
2//! which is useful for error reporting in parsers.
3use alloc::vec::Vec;
4
/// Struct to map byte positions in a UTF-8 string to character positions, for error reporting.
///
/// Memory usage is optimized by storing cumulative character counts at 128 byte intervals,
/// which allows for efficient mapping of byte positions to character positions without
/// needing to store a mapping for every single byte. A lookup reads one stored count and
/// then scans at most one block of the input.
///
/// Counts are stored as `u32`, so the input is assumed to contain fewer than 2^32 characters.
///
/// ```
/// # let sql = "SELECT 1";
/// # let byte_span = 0..1usize;
/// use qusql_parse::ByteToChar;
///
/// let b2c = ByteToChar::new(sql.as_bytes());
/// let char_span = b2c.map_span(byte_span.start..byte_span.end);
/// ```
#[derive(Debug, Clone)]
pub struct ByteToChar<'a> {
    /// The UTF-8 input the positions refer to.
    bytes: &'a [u8],
    /// `cnt[i]` is the number of characters that start before byte `i * 128`.
    /// One entry per block plus a trailing entry holding the total character count,
    /// so every block index reachable from an in-range byte position has an entry.
    cnt: Vec<u32>,
}

impl<'a> ByteToChar<'a> {
    /// Number of input bytes covered by each entry of `cnt`.
    const BLOCK_SIZE: usize = 128;

    /// Count the characters that start in `bytes`.
    ///
    /// Every byte that is not a UTF-8 continuation byte (`0b10xx_xxxx`) starts a character.
    fn count_chars(bytes: &[u8]) -> usize {
        bytes.iter().filter(|&&b| (b & 0xC0) != 0x80).count()
    }

    /// Create a new ByteToChar mapping for the given byte slice.
    ///
    /// Scans `bytes` once and records the cumulative character count at the start of
    /// every 128 byte block, followed by the total count for the whole input.
    pub fn new(bytes: &'a [u8]) -> Self {
        // One entry per (possibly partial) block plus the trailing total.
        let mut cnt = Vec::with_capacity(bytes.len() / Self::BLOCK_SIZE + 2);
        let mut char_count = 0u32;
        for chunk in bytes.chunks(Self::BLOCK_SIZE) {
            cnt.push(char_count);
            char_count += Self::count_chars(chunk) as u32;
        }
        // Without this trailing entry, an end-of-input position on an exact block
        // boundary (input length a non-zero multiple of 128) would have no entry to
        // read and would be mapped to 0 instead of the total character count.
        cnt.push(char_count);
        Self { bytes, cnt }
    }

    /// Map a byte position to a character position.
    ///
    /// Returns the number of characters that start before `byte_pos`. `byte_pos` may be
    /// equal to the input length (the end of a span); positions past the end of the input
    /// are clamped to the end and therefore map to the total character count.
    pub fn map(&self, byte_pos: usize) -> usize {
        let byte_pos = byte_pos.min(self.bytes.len());
        let block_index = byte_pos / Self::BLOCK_SIZE;
        let block_start_byte = block_index * Self::BLOCK_SIZE;
        // In bounds: `block_index <= len / 128`, and `new` stores one entry per block
        // plus the trailing total.
        let char_count_before_block = self.cnt[block_index] as usize;
        char_count_before_block + Self::count_chars(&self.bytes[block_start_byte..byte_pos])
    }

    /// Map a byte-offset span to a char-offset span.
    ///
    /// Both ends are mapped independently with [`ByteToChar::map`].
    pub fn map_span(&self, span: core::ops::Range<usize>) -> core::ops::Range<usize> {
        self.map(span.start)..self.map(span.end)
    }
}
53
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks byte-to-char mapping on a short mixed ASCII/CJK string and on an input
    /// that spans more than one 128 byte block.
    #[test]
    fn test_byte_to_char() {
        // Layout of "Hello, 世界!": "Hello, " fills bytes 0..7 (chars 0..7), '世' is
        // bytes 7..10 (char 7), '界' is bytes 10..13 (char 8), '!' is byte 13 (char 9),
        // and byte 14 / char 10 is the end of the string.
        let text = "Hello, 世界!";
        let mapper = ByteToChar::new(text.as_bytes());
        for (byte_pos, char_pos) in [(0, 0), (7, 7), (13, 9), (14, 10)] {
            assert_eq!(mapper.map(byte_pos), char_pos);
        }
        assert_eq!(mapper.map_span(0..14), 0..10); // Full string

        // 200 one-byte 'a's followed by the 6 bytes of "世界": the input crosses the
        // first block boundary, '世' starts at byte 200 and the string ends at byte 206.
        let mut long_text = "a".repeat(200);
        long_text.push_str("世界");
        let long_mapper = ByteToChar::new(long_text.as_bytes());
        for (byte_pos, char_pos) in [(0, 0), (199, 199), (200, 200), (206, 202)] {
            assert_eq!(long_mapper.map(byte_pos), char_pos);
        }
    }
}