/// Direction in which an offset is snapped when it does not fall on a
/// character boundary.
///
/// The offset converters in this file consult this value when the requested
/// position lands inside a single character: between the two halves of a
/// UTF-16 surrogate pair, or in the middle of a multi-byte UTF-8 sequence.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UtfIndexClamp {
    /// Snap to the start of the character that contains the offset.
    Down,
    /// Snap to the end of the character that contains the offset.
    Up,
}
16
17pub fn utf16_offset_to_utf8_byte_offset(
24 text: &str,
25 utf16_offset: usize,
26 clamp: UtfIndexClamp,
27) -> usize {
28 let target = utf16_offset;
29 let mut utf16_units = 0usize;
30 let mut last_byte = 0usize;
31
32 for (byte, ch) in text.char_indices() {
33 if utf16_units == target {
34 return byte;
35 }
36 if utf16_units > target {
37 return match clamp {
38 UtfIndexClamp::Down => last_byte,
39 UtfIndexClamp::Up => byte,
40 };
41 }
42
43 last_byte = byte;
44 utf16_units = utf16_units.saturating_add(ch.len_utf16());
45 if utf16_units == target {
46 return byte + ch.len_utf8();
47 }
48 if utf16_units > target {
49 return match clamp {
50 UtfIndexClamp::Down => byte,
51 UtfIndexClamp::Up => byte + ch.len_utf8(),
52 };
53 }
54 }
55
56 if utf16_units <= target {
58 text.len()
59 } else {
60 match clamp {
62 UtfIndexClamp::Down => text.len(),
63 UtfIndexClamp::Up => text.len(),
64 }
65 }
66}
67
68pub fn utf8_byte_offset_to_utf16_offset(
74 text: &str,
75 utf8_offset: usize,
76 clamp: UtfIndexClamp,
77) -> usize {
78 let target = utf8_offset.min(text.len());
79 if target == 0 {
80 return 0;
81 }
82
83 let mut utf16_units = 0usize;
84 for (byte_start, ch) in text.char_indices() {
85 let byte_end = byte_start + ch.len_utf8();
86 let utf16_start = utf16_units;
87 let utf16_end = utf16_start + ch.len_utf16();
88
89 if target == byte_start {
90 return utf16_start;
91 }
92 if target > byte_start && target < byte_end {
93 return match clamp {
94 UtfIndexClamp::Down => utf16_start,
95 UtfIndexClamp::Up => utf16_end,
96 };
97 }
98
99 utf16_units = utf16_end;
100 }
101
102 utf16_units
103}
104
105pub fn utf16_range_to_utf8_byte_range(
109 text: &str,
110 start_utf16: usize,
111 end_utf16: usize,
112) -> (usize, usize) {
113 let start = utf16_offset_to_utf8_byte_offset(text, start_utf16, UtfIndexClamp::Down);
114 let end = utf16_offset_to_utf8_byte_offset(text, end_utf16, UtfIndexClamp::Up);
115 (start.min(end), end.max(start))
116}
117
118pub fn utf8_byte_range_to_utf16_range(
122 text: &str,
123 start_utf8: usize,
124 end_utf8: usize,
125) -> (usize, usize) {
126 let start = utf8_byte_offset_to_utf16_offset(text, start_utf8, UtfIndexClamp::Down);
127 let end = utf8_byte_offset_to_utf16_offset(text, end_utf8, UtfIndexClamp::Up);
128 (start.min(end), end.max(start))
129}
130
#[cfg(test)]
mod tests {
    use super::*;

    /// Mixed-width fixture: `a` and `b` take one byte and one UTF-16 unit
    /// each, while the emoji takes four bytes and a two-unit surrogate pair.
    const SAMPLE: &str = "a😀b";

    #[test]
    fn utf16_to_utf8_ascii_roundtrips() {
        // In pure ASCII every offset is identical in both encodings.
        let text = "hello";
        for offset in 0..=5 {
            let bytes = utf16_offset_to_utf8_byte_offset(text, offset, UtfIndexClamp::Down);
            assert_eq!(bytes, offset);
            let units = utf8_byte_offset_to_utf16_offset(text, bytes, UtfIndexClamp::Down);
            assert_eq!(units, offset);
        }
    }

    #[test]
    fn utf16_to_utf8_surrogate_pair_clamps() {
        let down = |units| utf16_offset_to_utf8_byte_offset(SAMPLE, units, UtfIndexClamp::Down);
        let up = |units| utf16_offset_to_utf8_byte_offset(SAMPLE, units, UtfIndexClamp::Up);
        let after_emoji = 1 + "😀".len();

        // Offsets on character boundaries map directly.
        assert_eq!(down(0), 0);
        assert_eq!(down(1), 1);
        // Offset 2 splits the surrogate pair, so the clamp picks a side.
        assert_eq!(down(2), 1);
        assert_eq!(up(2), after_emoji);
        // Past the pair we are back on boundaries.
        assert_eq!(down(3), after_emoji);
        assert_eq!(down(4), SAMPLE.len());
    }

    #[test]
    fn utf16_range_converts_to_valid_utf8_range() {
        // A range covering exactly the surrogate pair selects the emoji.
        let (start, end) = utf16_range_to_utf8_byte_range(SAMPLE, 1, 3);
        assert_eq!(&SAMPLE[start..end], "😀");

        // An empty range inside the pair must still be ordered and sliceable.
        let (start, end) = utf16_range_to_utf8_byte_range(SAMPLE, 2, 2);
        assert!(start <= end);
        assert!(SAMPLE.is_char_boundary(start));
        assert!(SAMPLE.is_char_boundary(end));
    }

    #[test]
    fn utf8_to_utf16_clamps_inside_codepoint() {
        // Byte 2 is the second byte of the four-byte emoji.
        let mid_emoji = 2;
        let snapped_down = utf8_byte_offset_to_utf16_offset(SAMPLE, mid_emoji, UtfIndexClamp::Down);
        let snapped_up = utf8_byte_offset_to_utf16_offset(SAMPLE, mid_emoji, UtfIndexClamp::Up);
        assert_eq!(snapped_down, 1);
        assert_eq!(snapped_up, 3);
    }
}