sevenmark_utils/
utf16_offset_converter.rs1use serde::Serialize;
4use sevenmark_parser::ast::{Element, Span};
5
6#[derive(Debug, Clone, Serialize)]
9pub struct Utf16Position {
10 pub start: u32, pub end: u32, }
13
/// Converts UTF-8 byte offsets of one input string into UTF-16 offsets.
///
/// The converter is built once per input and can then answer any number of
/// offset queries through a table lookup.
#[derive(Debug, Clone)]
pub struct Utf16OffsetConverter {
    /// Lookup table indexed by byte offset; each entry at a character
    /// boundary holds the number of UTF-16 code units preceding that byte.
    /// It has `input.len() + 1` entries so the end-of-input offset is
    /// addressable as well.
    byte_to_utf16: Vec<u32>,
}
23
24impl Utf16OffsetConverter {
25 pub fn new(input: &str) -> Self {
30 let mut map = vec![0u32; input.len() + 1];
31 let mut utf16_pos = 0u32;
32
33 for (byte_pos, ch) in input.char_indices() {
34 map[byte_pos] = utf16_pos;
35 utf16_pos += ch.len_utf16() as u32;
36 }
37 map[input.len()] = utf16_pos;
38
39 Self { byte_to_utf16: map }
40 }
41
42 pub fn convert(&self, byte_offset: usize) -> u32 {
44 self.byte_to_utf16
45 .get(byte_offset)
46 .copied()
47 .unwrap_or(*self.byte_to_utf16.last().unwrap_or(&0))
48 }
49
50 pub fn convert_span(&self, span: &Span) -> Utf16Position {
52 Utf16Position {
53 start: self.convert(span.start),
54 end: self.convert(span.end),
55 }
56 }
57
58 pub fn convert_elements(&self, elements: &[Element]) -> serde_json::Value {
60 let mut json = serde_json::to_value(elements).unwrap_or(serde_json::Value::Null);
61 self.convert_spans_in_json(&mut json);
62 json
63 }
64
65 fn convert_spans_in_json(&self, value: &mut serde_json::Value) {
67 match value {
68 serde_json::Value::Object(map) => {
69 if let Some(span_value) = map.get("span")
71 && let Ok(span) = serde_json::from_value::<Span>(span_value.clone())
72 {
73 let utf16_span = self.convert_span(&span);
74 map.insert(
75 "span".to_string(),
76 serde_json::to_value(utf16_span).unwrap(),
77 );
78 }
79
80 for (_, v) in map.iter_mut() {
82 self.convert_spans_in_json(v);
83 }
84 }
85 serde_json::Value::Array(arr) => {
86 for item in arr.iter_mut() {
87 self.convert_spans_in_json(item);
88 }
89 }
90 _ => {}
91 }
92 }
93}
94
95pub fn convert_ast_to_utf16_offset_json(elements: &[Element], input: &str) -> String {
107 let converter = Utf16OffsetConverter::new(input);
108 let result = converter.convert_elements(elements);
109 serde_json::to_string(&result).unwrap_or_default()
110}
111
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a converter for `input` and checks every
    /// `(byte_offset, expected_utf16_offset)` pair against it.
    fn assert_offsets(input: &str, expected: &[(usize, u32)]) {
        let converter = Utf16OffsetConverter::new(input);
        for &(byte_offset, utf16_offset) in expected {
            assert_eq!(
                converter.convert(byte_offset),
                utf16_offset,
                "byte offset {} in {:?}",
                byte_offset,
                input
            );
        }
    }

    #[test]
    fn test_ascii() {
        // ASCII: byte offsets and UTF-16 offsets coincide.
        assert_offsets("hello\nworld", &[(0, 0), (5, 5), (6, 6), (11, 11)]);
    }

    #[test]
    fn test_korean() {
        // Each syllable is 3 UTF-8 bytes but a single UTF-16 code unit.
        assert_offsets("한국어", &[(0, 0), (3, 1), (6, 2), (9, 3)]);
    }

    #[test]
    fn test_emoji() {
        // The rocket is 4 UTF-8 bytes and 2 UTF-16 code units.
        assert_offsets("a🚀b", &[(0, 0), (1, 1), (5, 3), (6, 4)]);
    }

    #[test]
    fn test_mixed() {
        // 3-byte syllable, 1-byte newline, 3-byte syllable.
        assert_offsets("한\n글", &[(0, 0), (3, 1), (4, 2), (7, 3)]);
    }

    #[test]
    fn test_empty() {
        assert_offsets("", &[(0, 0)]);
    }

    #[test]
    fn test_boundary() {
        // An offset past the end clamps to the total UTF-16 length.
        assert_offsets("abc", &[(100, 3)]);
    }

    #[test]
    fn test_complex() {
        // ASCII prefix, two Korean syllables, punctuation, then an emoji.
        assert_offsets(
            "Hello, 세계! 🌍",
            &[(0, 0), (7, 7), (13, 9), (15, 11), (19, 13)],
        );
    }
}