/// Builds a lookup table from UTF-16 code-unit offsets to UTF-8 byte offsets.
///
/// Entry `i` of the returned vector is the byte offset within `text` of the
/// character that UTF-16 code unit `i` belongs to. A character encoded as a
/// surrogate pair contributes two entries, both holding the byte offset at
/// which that character starts.
///
/// The vector has one more entry than `text` has UTF-16 code units: the last
/// entry is `text.len()`, so the end-of-string offset can be looked up too.
///
/// Offsets are narrowed to `u32` with `as`, so byte offsets that do not fit in
/// a `u32` are truncated.
pub(crate) fn utf16_to_utf8_byte_map(text: &str) -> Vec<u32> {
    // One slot per UTF-16 code unit, plus the trailing end-of-string sentinel.
    let total_units = text.encode_utf16().count();
    let mut offsets: Vec<u32> = Vec::with_capacity(total_units + 1);

    for (byte_pos, ch) in text.char_indices() {
        // Append the character's start byte once per UTF-16 unit it occupies.
        let grown_len = offsets.len() + ch.len_utf16();
        offsets.resize(grown_len, byte_pos as u32);
    }

    offsets.push(text.len() as u32);
    offsets
}
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII characters take one UTF-16 unit and one UTF-8 byte each, so the
    /// table is the identity followed by the end-of-string sentinel.
    #[test]
    fn ascii_one_to_one() {
        let expected: Vec<u32> = vec![0, 1, 2, 3];
        assert_eq!(utf16_to_utf8_byte_map("abc"), expected);
    }

    /// Each character here is one UTF-16 unit but three UTF-8 bytes.
    #[test]
    fn hiragana_three_byte_chars() {
        let expected: Vec<u32> = vec![0, 3, 6];
        assert_eq!(utf16_to_utf8_byte_map("こん"), expected);
    }

    /// The middle character occupies two UTF-16 units (a surrogate pair) and
    /// four UTF-8 bytes; both units map to its start byte.
    #[test]
    fn emoji_supplementary_pair() {
        let input = "a🇯b";
        let expected: Vec<u32> = vec![0, 1, 1, 5, 6];
        assert_eq!(utf16_to_utf8_byte_map(input), expected);
    }

    /// An empty string yields only the end-of-string sentinel.
    #[test]
    fn empty_string() {
        let expected: Vec<u32> = vec![0];
        assert_eq!(utf16_to_utf8_byte_map(""), expected);
    }

    /// Two surrogate-pair characters joined by U+200D (one UTF-16 unit, three
    /// UTF-8 bytes), with ASCII on either side.
    #[test]
    fn zwj_sequence() {
        let input = "a\u{1F468}\u{200D}\u{1F466}b";
        let expected: Vec<u32> = vec![0, 1, 1, 5, 8, 8, 12, 13];
        assert_eq!(utf16_to_utf8_byte_map(input), expected);
    }
}