Skip to main content

sevenmark_utils/
utf16_offset_converter.rs

1//! UTF-8 to UTF-16 offset conversion utilities
2
3use serde::Serialize;
4use sevenmark_parser::ast::{Element, Span};
5
6/// UTF-16 code unit offset position (0-based)
7/// Designed for CodeMirror 6 compatibility
8#[derive(Debug, Clone, Serialize)]
9pub struct Utf16Position {
10    pub start: u32, // 0-based UTF-16 code unit offset
11    pub end: u32,   // 0-based UTF-16 code unit offset
12}
13
/// UTF-8 byte offset to UTF-16 code unit offset converter
///
/// CodeMirror 6 uses absolute UTF-16 code unit offsets for positions.
/// This converter provides O(1) lookups after O(n) preprocessing.
///
/// `Debug` and `Clone` are derived so the converter can be logged and
/// duplicated like any other plain-data public type.
#[derive(Debug, Clone)]
pub struct Utf16OffsetConverter {
    /// Maps byte position to UTF-16 code unit position
    /// Index: byte offset, Value: UTF-16 offset at that byte
    byte_to_utf16: Vec<u32>,
}
23
24impl Utf16OffsetConverter {
25    /// Creates a new converter with O(n) preprocessing
26    ///
27    /// Builds a lookup table mapping each byte position to its
28    /// corresponding UTF-16 code unit position.
29    pub fn new(input: &str) -> Self {
30        let mut map = vec![0u32; input.len() + 1];
31        let mut utf16_pos = 0u32;
32
33        for (byte_pos, ch) in input.char_indices() {
34            map[byte_pos] = utf16_pos;
35            utf16_pos += ch.len_utf16() as u32;
36        }
37        map[input.len()] = utf16_pos;
38
39        Self { byte_to_utf16: map }
40    }
41
42    /// Converts a byte offset to UTF-16 code unit offset in O(1)
43    pub fn convert(&self, byte_offset: usize) -> u32 {
44        self.byte_to_utf16
45            .get(byte_offset)
46            .copied()
47            .unwrap_or(*self.byte_to_utf16.last().unwrap_or(&0))
48    }
49
50    /// Converts a Span to UTF-16 position
51    pub fn convert_span(&self, span: &Span) -> Utf16Position {
52        Utf16Position {
53            start: self.convert(span.start),
54            end: self.convert(span.end),
55        }
56    }
57
58    /// Converts SevenMark AST elements to JSON with UTF-16 positions
59    pub fn convert_elements(&self, elements: &[Element]) -> serde_json::Value {
60        let mut json = serde_json::to_value(elements).unwrap_or(serde_json::Value::Null);
61        self.convert_spans_in_json(&mut json);
62        json
63    }
64
65    /// Recursively transforms span fields in JSON values
66    fn convert_spans_in_json(&self, value: &mut serde_json::Value) {
67        match value {
68            serde_json::Value::Object(map) => {
69                // Check for span field and convert it
70                if let Some(span_value) = map.get("span")
71                    && let Ok(span) = serde_json::from_value::<Span>(span_value.clone())
72                {
73                    let utf16_span = self.convert_span(&span);
74                    map.insert(
75                        "span".to_string(),
76                        serde_json::to_value(utf16_span).unwrap(),
77                    );
78                }
79
80                // Recursively process all other fields
81                for (_, v) in map.iter_mut() {
82                    self.convert_spans_in_json(v);
83                }
84            }
85            serde_json::Value::Array(arr) => {
86                for item in arr.iter_mut() {
87                    self.convert_spans_in_json(item);
88                }
89            }
90            _ => {}
91        }
92    }
93}
94
95/// Converts SevenMark AST to JSON with UTF-16 absolute offsets
96///
97/// Main entry point for converting parsed SevenMark elements to JSON format
98/// with 0-based UTF-16 code unit offsets (for CodeMirror 6 compatibility).
99///
100/// # Arguments
101/// * `elements` - The parsed SevenMark AST elements
102/// * `input` - The original input string used for offset calculation
103///
104/// # Returns
105/// JSON string with 0-based UTF-16 code unit offsets
106pub fn convert_ast_to_utf16_offset_json(elements: &[Element], input: &str) -> String {
107    let converter = Utf16OffsetConverter::new(input);
108    let result = converter.convert_elements(elements);
109    serde_json::to_string(&result).unwrap_or_default()
110}
111
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a converter for `input` and checks each
    /// `(byte offset, expected UTF-16 offset)` pair against it.
    fn assert_offsets(input: &str, expected: &[(usize, u32)]) {
        let converter = Utf16OffsetConverter::new(input);
        for &(byte_offset, utf16_offset) in expected {
            assert_eq!(
                converter.convert(byte_offset),
                utf16_offset,
                "byte offset {byte_offset} in {input:?}"
            );
        }
    }

    #[test]
    fn test_ascii() {
        // Pure ASCII: byte offsets and UTF-16 offsets coincide
        assert_offsets(
            "hello\nworld",
            &[
                (0, 0),   // 'h'
                (5, 5),   // '\n'
                (6, 6),   // 'w'
                (11, 11), // end
            ],
        );
    }

    #[test]
    fn test_korean() {
        // "한국어" = 9 bytes UTF-8, 3 UTF-16 code units
        assert_offsets(
            "한국어",
            &[
                (0, 0), // '한' start
                (3, 1), // '국' start
                (6, 2), // '어' start
                (9, 3), // end
            ],
        );
    }

    #[test]
    fn test_emoji() {
        // "a🚀b" = 1 + 4 + 1 = 6 bytes UTF-8
        // UTF-16: 1 + 2 (surrogate pair) + 1 = 4 code units
        assert_offsets(
            "a🚀b",
            &[
                (0, 0), // 'a'
                (1, 1), // '🚀' start
                (5, 3), // 'b'
                (6, 4), // end
            ],
        );
    }

    #[test]
    fn test_mixed() {
        // "한\n글" = 3 + 1 + 3 = 7 bytes, 1 + 1 + 1 = 3 UTF-16 units
        assert_offsets(
            "한\n글",
            &[
                (0, 0), // '한'
                (3, 1), // '\n'
                (4, 2), // '글'
                (7, 3), // end
            ],
        );
    }

    #[test]
    fn test_empty() {
        // Empty input still has a valid offset 0
        assert_offsets("", &[(0, 0)]);
    }

    #[test]
    fn test_boundary() {
        // Out of bounds returns last valid position
        assert_offsets("abc", &[(100, 3)]);
    }

    #[test]
    fn test_complex() {
        // "Hello, 세계! 🌍"
        // UTF-8: 7 + 6 + 2 + 4 = 19 bytes
        // UTF-16: 7 + 2 + 2 + 2 = 13 units
        assert_offsets(
            "Hello, 세계! 🌍",
            &[
                (0, 0),   // 'H'
                (7, 7),   // '세' start
                (13, 9),  // '!' start
                (15, 11), // '🌍' start
                (19, 13), // end
            ],
        );
    }
}