use std::collections::HashMap;
use std::marker::PhantomData;
/// Lookup table from UTF-8 byte offsets to UTF-16 code-unit offsets of one string.
///
/// The table holds one entry for the byte offset at which each character of
/// the source text starts, plus one entry for the offset one past the end of
/// the text (`text.len()`), so both ends of any character span can be
/// translated.
#[derive(Debug, Clone)]
pub struct Utf16IndexMap<'t> {
    /// UTF-8 byte offset of a character boundary -> UTF-16 code-unit offset.
    map: HashMap<usize, usize>,
    /// Carries the lifetime of the mapped text; no reference is actually stored.
    marker: PhantomData<&'t str>,
}

impl<'t> Utf16IndexMap<'t> {
    /// Builds the offset table for `text`.
    ///
    /// Every character start and the end of the text are recorded. The end
    /// entry is inserted unconditionally, so the empty string yields a map
    /// containing the single entry `0 -> 0` rather than an empty map.
    pub fn new(text: &'t str) -> Self {
        let mut map = HashMap::new();
        let mut utf16_index = 0;

        for (utf8_index, ch) in text.char_indices() {
            map.insert(utf8_index, utf16_index);
            utf16_index += ch.len_utf16();
        }

        // One-past-the-end boundary. For non-empty text this equals the last
        // character's start plus its UTF-8 length; for empty text it is 0.
        map.insert(text.len(), utf16_index);

        Utf16IndexMap {
            map,
            marker: PhantomData,
        }
    }

    /// Returns the UTF-16 code-unit offset corresponding to `utf8_index`.
    ///
    /// # Panics
    ///
    /// Panics if `utf8_index` is not a recorded boundary, i.e. it is neither
    /// the start of a character in the mapped text nor the text's length.
    #[inline]
    pub fn get_index(&self, utf8_index: usize) -> usize {
        match self.map.get(&utf8_index) {
            Some(&utf16_index) => utf16_index,
            None => panic!(
                "no UTF-16 index recorded for UTF-8 index {}: not a character boundary of the mapped text",
                utf8_index,
            ),
        }
    }
}
#[cfg(test)]
mod test {
use super::*;
use proptest::prelude::*;
/// Spot-checks hand-computed UTF-16 spans against the map for a few sample
/// strings covering 1-, 2-, 3- and 4-byte UTF-8 characters.
#[test]
fn utf16_indices() {
    /// Compares the expected `(start, end)` UTF-16 span of each character in
    /// `text` with what the map reports. All start offsets are verified
    /// before any end offset.
    fn assert_spans(text: &str, spans: &[(usize, usize)]) {
        let index_map = Utf16IndexMap::new(text);

        // Pass 1: the byte offset where each character begins.
        for (position, ((byte_offset, _), &(expected_start, _))) in
            text.char_indices().zip(spans).enumerate()
        {
            let actual_start = index_map.get_index(byte_offset);
            assert_eq!(
                expected_start,
                actual_start,
                "Actual UTF-16 start index doesn't match expected (char #{})",
                position + 1,
            );
        }

        // Pass 2: the byte offset just past each character.
        for (position, ((byte_offset, ch), &(_, expected_end))) in
            text.char_indices().zip(spans).enumerate()
        {
            let actual_end = index_map.get_index(byte_offset + ch.len_utf8());
            assert_eq!(
                expected_end,
                actual_end,
                "Actual UTF-16 end index doesn't match expected (char #{})",
                position + 1,
            );
        }
    }

    assert_spans("", &[]);
    assert_spans("abc", &[(0, 1), (1, 2), (2, 3)]);
    assert_spans("aßc", &[(0, 1), (1, 2), (2, 3)]);
    assert_spans("aℝc", &[(0, 1), (1, 2), (2, 3)]);
    assert_spans("a🦀c", &[(0, 1), (1, 3), (3, 4)]);
    assert_spans("x💣yßz", &[(0, 1), (1, 3), (3, 4), (4, 5), (5, 6)]);
}
/// Round-trip check: for every character of `text`, the UTF-16 range the map
/// reports must decode to the same character, and the character must encode
/// to exactly that range of UTF-16 code units.
fn check(text: &str) {
    let index_map = Utf16IndexMap::new(text);
    let code_units: Vec<u16> = text.encode_utf16().collect();

    for (byte_start, ch) in text.char_indices() {
        let byte_end = byte_start + ch.len_utf8();
        let utf8_piece = &text[byte_start..byte_end];

        // Translate both ends of the character's byte span into UTF-16 offsets.
        let unit_range = index_map.get_index(byte_start)..index_map.get_index(byte_end);
        let utf16_piece = &code_units[unit_range];

        let decoded = String::from_utf16(utf16_piece).expect("UTF-16 slice wasn't valid");
        assert_eq!(
            utf8_piece, decoded,
            "Converted UTF-16 -> UTF-8 slice didn't match",
        );

        let reencoded: Vec<u16> = utf8_piece.encode_utf16().collect();
        assert_eq!(
            utf16_piece, reencoded,
            "Converted UTF-8 -> UTF-16 slice didn't match",
        );
    }
}
/// Runs the round-trip `check` over a fixed table of strings mixing
/// characters of every UTF-8 width in various positions and repetitions.
#[test]
fn utf16_slices() {
    const SAMPLES: &[&str] = &[
        // Empty and basic three-character strings.
        "",
        "a",
        "abc",
        "aßc",
        "aℝc",
        "a🦀c",
        // Single characters of each width.
        "b",
        "ß",
        "ℝ",
        "🦀",
        // ASCII prefix.
        "1b",
        "1ß",
        "1ℝ",
        "1🦀",
        // ASCII suffix.
        "b1",
        "ß1",
        "ℝ1",
        "🦀1",
        // Doubled characters.
        "bb",
        "ßß",
        "ℝℝ",
        "🦀🦀",
        // Doubled characters with ASCII prefix.
        "2bb",
        "2ßß",
        "2ℝℝ",
        "2🦀🦀",
        // Doubled characters with ASCII suffix.
        "bb2",
        "ßß2",
        "ℝℝ2",
        "🦀🦀2",
        // Mixed widths.
        "bßℝ🦀",
        "🦀ℝßb",
        "b_ß_ℝ_🦀",
        "b__ß__ℝ__🦀",
        // Long runs of a single character.
        "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
        "ßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßßß",
        "ℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝℝ",
        "🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀",
    ];

    for &sample in SAMPLES {
        check(sample);
    }
}
proptest! {
    // Only the number of cases is overridden; every other setting keeps its default.
    #![proptest_config(ProptestConfig {
        cases: 4096,
        ..ProptestConfig::default()
    })]

    // Property: the round-trip `check` holds for strings generated from the regex `.*`.
    #[test]
    fn utf16_prop(text in ".*") {
        check(&text);
    }
}
}