qusql_parse/byte_to_char.rs
1//! ByteToChar is a utility for mapping byte positions in a UTF-8 string to character positions,
2//! which is useful for error reporting in parsers.
3use alloc::vec::Vec;
4
/// Maps byte positions in a UTF-8 buffer to character positions, for error reporting.
///
/// Memory usage is kept low by storing the cumulative character count only at
/// 128-byte block boundaries. A lookup reads the count for the enclosing block and
/// then counts the characters inside that single block, so no per-byte table is needed.
///
/// A "character" here is a Unicode scalar value: every byte that is not a UTF-8
/// continuation byte (`0b10xx_xxxx`) starts one character.
///
/// Counts are accumulated in a `u32`, so inputs containing more than `u32::MAX`
/// characters are not supported.
///
/// ```
/// # let sql = "SELECT 1";
/// # let byte_span = 0..1usize;
/// use qusql_parse::ByteToChar;
///
/// let b2c = ByteToChar::new(sql.as_bytes());
/// let char_span = b2c.map_span(byte_span.start..byte_span.end);
/// ```
#[derive(Debug, Clone)]
pub struct ByteToChar<'a> {
    /// The input the byte positions refer to.
    bytes: &'a [u8],
    /// `cnt[i]` is the number of characters that start before byte `i * 128`.
    /// One extra trailing entry holds the total character count, so that a position
    /// equal to the input length resolves correctly even when that length is an
    /// exact multiple of the block size.
    cnt: Vec<u32>,
}

impl<'a> ByteToChar<'a> {
    /// Number of input bytes summarized by each entry of `cnt`.
    const BLOCK_SIZE: usize = 128;

    /// Count the characters that start in `bytes`, i.e. the bytes that are not
    /// UTF-8 continuation bytes.
    fn count_chars(bytes: &[u8]) -> usize {
        bytes.iter().filter(|&&b| (b & 0xC0) != 0x80).count()
    }

    /// Create a new ByteToChar mapping for the given byte slice.
    ///
    /// Runs in a single pass over `bytes`.
    pub fn new(bytes: &'a [u8]) -> Self {
        // One entry per (possibly partial) block plus the trailing total.
        let mut cnt = Vec::with_capacity(bytes.len() / Self::BLOCK_SIZE + 2);
        let mut char_count: u32 = 0;
        for chunk in bytes.chunks(Self::BLOCK_SIZE) {
            cnt.push(char_count);
            // A chunk holds at most BLOCK_SIZE bytes, so this cast cannot truncate.
            char_count += Self::count_chars(chunk) as u32;
        }
        // Trailing total: without it, `map(bytes.len())` would find no entry when the
        // length is a multiple of BLOCK_SIZE and would wrongly report position 0.
        cnt.push(char_count);
        Self { bytes, cnt }
    }

    /// Map a byte position to a character position.
    ///
    /// Returns the number of characters that start before `byte_pos`. Positions
    /// past the end of the input are clamped to the input length, so the result
    /// is then the total character count.
    pub fn map(&self, byte_pos: usize) -> usize {
        let byte_pos = byte_pos.min(self.bytes.len());
        let block_index = byte_pos / Self::BLOCK_SIZE;
        let block_start = block_index * Self::BLOCK_SIZE;
        // In range: `byte_pos <= len`, and `new` stores `ceil(len / BLOCK_SIZE) + 1`
        // entries, which is always greater than `len / BLOCK_SIZE`.
        let chars_before_block = self.cnt[block_index] as usize;
        chars_before_block + Self::count_chars(&self.bytes[block_start..byte_pos])
    }

    /// Map a byte-offset span to a char-offset span.
    ///
    /// Both endpoints are translated independently with [`ByteToChar::map`].
    pub fn map_span(&self, span: core::ops::Range<usize>) -> core::ops::Range<usize> {
        self.map(span.start)..self.map(span.end)
    }
}
53
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks byte-to-char mapping on a short mixed ASCII/multi-byte string, on an
    /// input spanning more than one 128-byte block, and on empty input.
    #[test]
    fn test_byte_to_char() {
        // Byte layout: "Hello, " = bytes 0..7, '世' = 7..10, '界' = 10..13, '!' = 13..14.
        let s = "Hello, 世界!";
        let b2c = ByteToChar::new(s.as_bytes());

        assert_eq!(b2c.map(0), 0); // 'H'
        assert_eq!(b2c.map(5), 5); // ','
        assert_eq!(b2c.map(7), 7); // '世'
        assert_eq!(b2c.map(10), 8); // '界'
        assert_eq!(b2c.map(13), 9); // '!'
        assert_eq!(b2c.map(14), 10); // End of string
        assert_eq!(b2c.map_span(0..14), 0..10); // Full string
        assert_eq!(b2c.map_span(7..13), 7..9); // "世界"

        // Test with more than one block:
        // 200 'a' (1 byte each), then '世' = 200..203 and '界' = 203..206.
        let long_str = "a".repeat(200) + "世界";
        let b2c_long = ByteToChar::new(long_str.as_bytes());
        assert_eq!(b2c_long.map(0), 0); // First 'a'
        assert_eq!(b2c_long.map(127), 127); // Last byte of the first block
        assert_eq!(b2c_long.map(128), 128); // First byte of the second block
        assert_eq!(b2c_long.map(199), 199); // Last 'a'
        assert_eq!(b2c_long.map(200), 200); // '世'
        assert_eq!(b2c_long.map(203), 201); // '界'
        assert_eq!(b2c_long.map(206), 202); // End of string

        // Empty input: the only valid position is 0.
        let b2c_empty = ByteToChar::new(b"");
        assert_eq!(b2c_empty.map(0), 0);
    }
}