use crate::data::word_break::{is_extended_pictographic, wb, Wb};
/// Returns the UTF-8 byte offsets in `s` at which a word boundary occurs.
///
/// The result always begins with `0` (start of text). Each further entry is the
/// byte offset of a character in front of which a break was found; the
/// end-of-text offset is not appended. An empty string yields `vec![0]`.
///
/// Characters are classified with [`wb`] and the checks are applied in the order
/// of the Unicode word-break rules (UAX #29): the line-break, ZWJ-emoji and
/// whitespace rules look at the raw neighbouring characters, after which
/// `Extend`/`Format`/`Zwj` characters are skipped and the remaining rules are
/// delegated to `apply_wb_rules`.
///
/// `_locale` is accepted for API compatibility and is currently ignored.
#[must_use]
pub fn word_boundaries(s: &str, _locale: Option<&str>) -> Vec<usize> {
    if s.is_empty() {
        return vec![0];
    }

    // Byte offset and scalar value of every character, index-aligned.
    let (offsets, chars): (Vec<usize>, Vec<char>) = s.char_indices().unzip();
    let wbs: Vec<Wb> = chars.iter().map(|&c| wb(c)).collect();

    let is_line_break = |w: Wb| matches!(w, Wb::Newline | Wb::Cr | Wb::Lf);
    // Length of the regional-indicator run that starts at a character of class `w`.
    let ri_run_start = |w: Wb| u32::from(w == Wb::RegionalIndicator);

    let mut out = vec![0];
    // Number of consecutive regional indicators ending at the most recent
    // non-skipped character; its parity decides whether a flag pair is complete.
    let mut ri_count = ri_run_start(wbs[0]);

    for i in 1..wbs.len() {
        let prev_raw = wbs[i - 1];
        let next = wbs[i];

        // WB3: CR LF is never split.
        if prev_raw == Wb::Cr && next == Wb::Lf {
            continue;
        }
        // WB3a / WB3b: always break after and before a line break.
        if is_line_break(prev_raw) || is_line_break(next) {
            out.push(offsets[i]);
            // A regional indicator directly after a line break opens a new run of
            // length one. Resetting to zero here would make the following
            // indicator look like the start of a pair and split the flag.
            ri_count = ri_run_start(next);
            continue;
        }
        // WB3c: keep ZWJ emoji sequences together.
        if prev_raw == Wb::Zwj && is_extended_pictographic(chars[i]) {
            continue;
        }
        // WB3d: keep runs of horizontal whitespace together.
        if prev_raw == Wb::WSegSpace && next == Wb::WSegSpace {
            continue;
        }
        // WB4: Extend/Format/ZWJ attach to whatever precedes them.
        if is_transparent(next) {
            continue;
        }

        // Remaining rules see the text with transparent characters skipped.
        // With no non-transparent character before `i`, nothing can bind to
        // `next`, so it is a break.
        let brk = resolve_prev(&wbs, i).map_or(true, |prev| {
            apply_wb_rules(
                prev,
                resolve_prev_prev(&wbs, i),
                next,
                resolve_next_next(&wbs, i),
                ri_count,
            )
        });

        ri_count = if next != Wb::RegionalIndicator {
            0
        } else if brk {
            1
        } else {
            ri_count + 1
        };

        if brk {
            out.push(offsets[i]);
        }
    }
    out
}
/// Reports whether `w` is one of the classes (`Extend`, `Format`, `Zwj`) that the
/// neighbour-resolution helpers skip over.
fn is_transparent(w: Wb) -> bool {
    match w {
        Wb::Extend | Wb::Format | Wb::Zwj => true,
        _ => false,
    }
}
/// Returns the class of the nearest non-transparent entry strictly before index
/// `i`, or `None` when every entry in `wbs[..i]` is transparent (or `i` is 0).
fn resolve_prev(wbs: &[Wb], i: usize) -> Option<Wb> {
    wbs[..i]
        .iter()
        .rev()
        .copied()
        .find(|&class| !is_transparent(class))
}
/// Returns the class of the second-nearest non-transparent entry strictly before
/// index `i` — i.e. the one preceding what `resolve_prev` would return — or
/// `None` when fewer than two such entries exist in `wbs[..i]`.
fn resolve_prev_prev(wbs: &[Wb], i: usize) -> Option<Wb> {
    wbs[..i]
        .iter()
        .rev()
        .copied()
        .filter(|&class| !is_transparent(class))
        .nth(1)
}
/// Returns the class of the first non-transparent entry strictly after index
/// `i`, or `None` when there is none (including when `i + 1` is past the end).
fn resolve_next_next(wbs: &[Wb], i: usize) -> Option<Wb> {
    wbs.iter()
        .skip(i + 1)
        .copied()
        .find(|&class| !is_transparent(class))
}
fn apply_wb_rules(
prev: Wb,
prev_prev: Option<Wb>,
next: Wb,
next_next: Option<Wb>,
ri_count: u32,
) -> bool {
let ah_letter = |w: Wb| matches!(w, Wb::ALetter | Wb::HebrewLetter);
let mid_letter = |w: Wb| matches!(w, Wb::MidLetter | Wb::MidNumLet | Wb::SingleQuote);
let mid_num = |w: Wb| matches!(w, Wb::MidNum | Wb::MidNumLet | Wb::SingleQuote);
if ah_letter(prev) && ah_letter(next) {
return false;
}
if ah_letter(prev) && mid_letter(next) {
if let Some(nn) = next_next {
if ah_letter(nn) {
return false;
}
}
}
if let Some(pp) = prev_prev {
if ah_letter(pp) && mid_letter(prev) && ah_letter(next) {
return false;
}
}
if prev == Wb::HebrewLetter && next == Wb::SingleQuote {
return false;
}
if prev == Wb::HebrewLetter && next == Wb::DoubleQuote {
if let Some(nn) = next_next {
if nn == Wb::HebrewLetter {
return false;
}
}
}
if let Some(pp) = prev_prev {
if pp == Wb::HebrewLetter && prev == Wb::DoubleQuote && next == Wb::HebrewLetter {
return false;
}
}
if prev == Wb::Numeric && next == Wb::Numeric {
return false;
}
if ah_letter(prev) && next == Wb::Numeric {
return false;
}
if prev == Wb::Numeric && ah_letter(next) {
return false;
}
if let Some(pp) = prev_prev {
if pp == Wb::Numeric && mid_num(prev) && next == Wb::Numeric {
return false;
}
}
if prev == Wb::Numeric && mid_num(next) {
if let Some(nn) = next_next {
if nn == Wb::Numeric {
return false;
}
}
}
if prev == Wb::Katakana && next == Wb::Katakana {
return false;
}
if next == Wb::ExtendNumLet
&& matches!(
prev,
Wb::ALetter
| Wb::HebrewLetter
| Wb::Numeric
| Wb::Katakana
| Wb::ExtendNumLet
)
{
return false;
}
if prev == Wb::ExtendNumLet
&& matches!(
next,
Wb::ALetter | Wb::HebrewLetter | Wb::Numeric | Wb::Katakana
)
{
return false;
}
if prev == Wb::RegionalIndicator && next == Wb::RegionalIndicator {
if ri_count % 2 == 1 {
return false; }
}
true
}