use std::ops::Range;
use unicode_segmentation::UnicodeSegmentation;
/// Returns the byte range of the word-boundary segment of `text` that
/// contains the byte offset `byte`.
///
/// Segments are the ones produced by
/// [`UnicodeSegmentation::split_word_bound_indices`]; every piece of the
/// text (words as well as the whitespace or punctuation between them) belongs
/// to exactly one segment. `byte` does not have to sit on a `char` boundary:
/// the first segment whose end lies strictly past `byte` is selected.
///
/// If `byte` is at or beyond the end of `text`, the last segment is returned.
/// For empty `text` the result is always the empty range `0..0`, so the
/// returned range can be used to slice `text` without going out of bounds.
pub(crate) fn word_range_at(text: &str, byte: usize) -> Range<usize> {
    // Fallback for the case where no segment is produced at all (empty text).
    // Clamping keeps the range inside `text` even when the caller hands in a
    // stale offset that is larger than the text.
    let clamped = byte.min(text.len());
    let mut last = clamped..clamped;

    for (start, seg) in text.split_word_bound_indices() {
        let range = start..start + seg.len();
        if byte < range.end {
            // Segments are visited in order, so the first one ending past
            // `byte` is the one that contains it.
            return range;
        }
        last = range;
    }

    // `byte` is at or past the end of the text: snap to the final segment.
    last
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An offset inside a word, or at the very end of the text, selects the
    /// word it touches.
    #[test]
    fn inside_word_selects_that_word() {
        let t = "hello world";
        assert_eq!(word_range_at(t, 2), 0..5, "byte 2 inside 'hello'");
        assert_eq!(word_range_at(t, 7), 6..11, "byte 7 inside 'world'");
        // Offset == text.len(): no segment ends past it, so the last one wins.
        assert_eq!(word_range_at(t, 11), 6..11);
    }

    /// An offset on whitespace selects the whitespace segment, not a
    /// neighbouring word.
    #[test]
    fn on_whitespace_selects_the_whitespace_run() {
        assert_eq!(word_range_at("hello world", 5), 5..6);

        // Three consecutive spaces occupy bytes 1..4; an offset in the middle
        // of the run must select the whole run.
        let t = "a   b";
        assert_eq!(word_range_at(t, 2), 1..4, "click in the run → whole run");
    }

    /// For CJK text the result must be one of the segments reported by the
    /// segmenter itself, and it must contain the queried offset.
    #[test]
    fn cjk_run_matches_unicode_word_rules() {
        let t = "東京タワー";
        let r = word_range_at(t, 6);
        let segs: Vec<_> = t
            .split_word_bound_indices()
            .map(|(s, w)| s..s + w.len())
            .collect();
        assert!(
            segs.contains(&r),
            "{r:?} must be a word-bound segment of {segs:?}"
        );
        assert!(
            r.start <= 6 && 6 < r.end,
            "range must contain the click byte"
        );
    }

    /// An offset in the middle of an emoji ZWJ sequence selects the complete
    /// sequence rather than a single code point.
    #[test]
    fn emoji_zwj_sequence_is_one_unit() {
        let family = "👨\u{200D}👩\u{200D}👧";
        let t = format!("{family}!");
        let mid = family.len() / 2;
        let r = word_range_at(&t, mid);
        assert_eq!(r, 0..family.len(), "ZWJ sequence selected as one unit");
    }

    /// Empty text has no segments; the result is the empty range at 0.
    #[test]
    fn empty_text_is_empty_range() {
        assert_eq!(word_range_at("", 0), 0..0);
    }
}