Skip to main content

fret_core/
utf.rs

1//! UTF-8 / UTF-16 index conversion helpers.
2//!
3//! These utilities are primarily intended for platform bridges (e.g. wasm DOM IME/text input),
4//! where selection and composition ranges are typically expressed in UTF-16 code unit offsets.
5//!
6//! Conversions are deterministic and clamp to valid UTF-8 char boundaries.
7
/// Rounding direction used when a requested offset does not fall on a character boundary.
///
/// An offset can land in the middle of a character when it points between the two halves of a
/// UTF-16 surrogate pair, or at a continuation byte of a multi-byte UTF-8 sequence. The
/// conversion helpers in this module use this value to pick which neighbouring boundary to
/// return.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UtfIndexClamp {
    /// Round towards the start of the text: use the boundary just before the offset (floor).
    Down,
    /// Round towards the end of the text: use the boundary just after the offset (ceil).
    Up,
}
16
17/// Convert a UTF-16 code unit offset into a UTF-8 byte offset.
18///
19/// Notes:
20/// - Offsets are clamped to `[0, text.len()]`.
21/// - If `utf16_offset` lands inside a scalar value that encodes to 2 UTF-16 code units (surrogate
22///   pair), the result is clamped according to `clamp`.
23pub fn utf16_offset_to_utf8_byte_offset(
24    text: &str,
25    utf16_offset: usize,
26    clamp: UtfIndexClamp,
27) -> usize {
28    let target = utf16_offset;
29    let mut utf16_units = 0usize;
30    let mut last_byte = 0usize;
31
32    for (byte, ch) in text.char_indices() {
33        if utf16_units == target {
34            return byte;
35        }
36        if utf16_units > target {
37            return match clamp {
38                UtfIndexClamp::Down => last_byte,
39                UtfIndexClamp::Up => byte,
40            };
41        }
42
43        last_byte = byte;
44        utf16_units = utf16_units.saturating_add(ch.len_utf16());
45        if utf16_units == target {
46            return byte + ch.len_utf8();
47        }
48        if utf16_units > target {
49            return match clamp {
50                UtfIndexClamp::Down => byte,
51                UtfIndexClamp::Up => byte + ch.len_utf8(),
52            };
53        }
54    }
55
56    // Target is at/after end.
57    if utf16_units <= target {
58        text.len()
59    } else {
60        // Should not happen, but keep deterministic.
61        match clamp {
62            UtfIndexClamp::Down => text.len(),
63            UtfIndexClamp::Up => text.len(),
64        }
65    }
66}
67
68/// Convert a UTF-8 byte offset into a UTF-16 code unit offset.
69///
70/// Notes:
71/// - `utf8_offset` is clamped to `[0, text.len()]`.
72/// - If `utf8_offset` lands inside a UTF-8 code point, the result is clamped according to `clamp`.
73pub fn utf8_byte_offset_to_utf16_offset(
74    text: &str,
75    utf8_offset: usize,
76    clamp: UtfIndexClamp,
77) -> usize {
78    let target = utf8_offset.min(text.len());
79    if target == 0 {
80        return 0;
81    }
82
83    let mut utf16_units = 0usize;
84    for (byte_start, ch) in text.char_indices() {
85        let byte_end = byte_start + ch.len_utf8();
86        let utf16_start = utf16_units;
87        let utf16_end = utf16_start + ch.len_utf16();
88
89        if target == byte_start {
90            return utf16_start;
91        }
92        if target > byte_start && target < byte_end {
93            return match clamp {
94                UtfIndexClamp::Down => utf16_start,
95                UtfIndexClamp::Up => utf16_end,
96            };
97        }
98
99        utf16_units = utf16_end;
100    }
101
102    utf16_units
103}
104
105/// Convert a UTF-16 range to a UTF-8 byte range.
106///
107/// Start is clamped down, end is clamped up, so the resulting byte range is always valid.
108pub fn utf16_range_to_utf8_byte_range(
109    text: &str,
110    start_utf16: usize,
111    end_utf16: usize,
112) -> (usize, usize) {
113    let start = utf16_offset_to_utf8_byte_offset(text, start_utf16, UtfIndexClamp::Down);
114    let end = utf16_offset_to_utf8_byte_offset(text, end_utf16, UtfIndexClamp::Up);
115    (start.min(end), end.max(start))
116}
117
118/// Convert a UTF-8 byte range to a UTF-16 range.
119///
120/// Start is clamped down, end is clamped up, so the resulting UTF-16 range is always valid.
121pub fn utf8_byte_range_to_utf16_range(
122    text: &str,
123    start_utf8: usize,
124    end_utf8: usize,
125) -> (usize, usize) {
126    let start = utf8_byte_offset_to_utf16_offset(text, start_utf8, UtfIndexClamp::Down);
127    let end = utf8_byte_offset_to_utf16_offset(text, end_utf8, UtfIndexClamp::Up);
128    (start.min(end), end.max(start))
129}
130
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture with one astral character.
    ///
    /// UTF-16 code units: a(1) 😀(2) b(1). UTF-8 bytes: a(1) 😀(4) b(1).
    const EMOJI_TEXT: &str = "a😀b";

    #[test]
    fn utf16_to_utf8_ascii_roundtrips() {
        // In pure ASCII every UTF-16 offset equals its UTF-8 byte offset, in both directions.
        let text = "hello";
        for offset in 0..=text.len() {
            let byte = utf16_offset_to_utf8_byte_offset(text, offset, UtfIndexClamp::Down);
            assert_eq!(byte, offset);
            let units = utf8_byte_offset_to_utf16_offset(text, byte, UtfIndexClamp::Down);
            assert_eq!(units, offset);
        }
    }

    #[test]
    fn utf16_to_utf8_surrogate_pair_clamps() {
        let emoji_start = 1;
        let emoji_end = emoji_start + "😀".len();

        // (UTF-16 offset, clamp, expected UTF-8 byte offset)
        let cases = [
            (0, UtfIndexClamp::Down, 0),
            (1, UtfIndexClamp::Down, emoji_start),
            // Offset 2 splits the surrogate pair: down snaps to the emoji start, up to its end.
            (2, UtfIndexClamp::Down, emoji_start),
            (2, UtfIndexClamp::Up, emoji_end),
            (3, UtfIndexClamp::Down, emoji_end),
            (4, UtfIndexClamp::Down, EMOJI_TEXT.len()),
        ];

        for &(utf16_offset, clamp, expected) in cases.iter() {
            assert_eq!(
                utf16_offset_to_utf8_byte_offset(EMOJI_TEXT, utf16_offset, clamp),
                expected,
                "utf16 offset {} clamped {:?}",
                utf16_offset,
                clamp
            );
        }
    }

    #[test]
    fn utf16_range_converts_to_valid_utf8_range() {
        // Selecting only the emoji in UTF-16 is the half-open range [1, 3).
        let (start, end) = utf16_range_to_utf8_byte_range(EMOJI_TEXT, 1, 3);
        assert_eq!(&EMOJI_TEXT[start..end], "😀");

        // If the DOM reports an invalid split [2, 2), the result must still be an ordered span
        // on char boundaries (it may or may not be empty).
        let (start, end) = utf16_range_to_utf8_byte_range(EMOJI_TEXT, 2, 2);
        assert!(start <= end);
        assert!(EMOJI_TEXT.is_char_boundary(start));
        assert!(EMOJI_TEXT.is_char_boundary(end));
    }

    #[test]
    fn utf8_to_utf16_clamps_inside_codepoint() {
        // 😀 occupies bytes [1, 5); byte 2 is strictly inside the code point.
        let inside = 2;

        // (clamp, expected UTF-16 offset)
        let cases = [(UtfIndexClamp::Down, 1), (UtfIndexClamp::Up, 3)];

        for &(clamp, expected) in cases.iter() {
            assert_eq!(
                utf8_byte_offset_to_utf16_offset(EMOJI_TEXT, inside, clamp),
                expected,
                "byte offset {} clamped {:?}",
                inside,
                clamp
            );
        }
    }
}