use crate::data::sentence_break::{sb, Sb};
/// Scanner state describing how much of a "terminator, closers, spaces" run has been consumed
/// immediately before the current position.
///
/// `Debug` and `Eq` are derived in addition to the original traits so that states can be
/// printed and compared in assertions; every field is a plain `bool`, so both are free.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SbCtx {
    /// No sentence terminator is pending; no break can be triggered from this state.
    None,
    /// A terminator has been consumed, possibly followed by closing punctuation.
    SATerm {
        /// `true` when the terminator was an `ATerm`, `false` when it was an `STerm`.
        is_aterm: bool,
        /// `true` once at least one `Close` has been consumed after the terminator.
        after_close: bool,
    },
    /// A terminator (and optional closers) has been followed by at least one `Sp`.
    SATermSp {
        /// `true` when the terminator was an `ATerm`, `false` when it was an `STerm`.
        is_aterm: bool,
    },
}
/// Returns `true` when `s` is `Sb::Extend` or `Sb::Format`.
///
/// The boundary scan skips these two classes when they follow another character, so they
/// never change the scanner state on their own.
fn is_sb_transparent(s: Sb) -> bool {
    s == Sb::Extend || s == Sb::Format
}
/// Looks ahead from index `start` and reports whether a `Lower` is reached before any class
/// that ends the lookahead.
///
/// The classes that end the lookahead without success are `OLetter`, `Upper`, `Sep`, `Cr`,
/// `Lf`, `STerm` and `ATerm`. Every other class (including `Extend` and `Format`) is stepped
/// over. Returns `false` when the slice is exhausted first or when `start` is past the end.
fn sb8_lookahead_lower(sbs: &[Sb], start: usize) -> bool {
    // Find the first class that decides the outcome one way or the other...
    let decisive = sbs.iter().skip(start).find(|&&class| {
        matches!(
            class,
            Sb::Lower
                | Sb::OLetter
                | Sb::Upper
                | Sb::Sep
                | Sb::Cr
                | Sb::Lf
                | Sb::STerm
                | Sb::ATerm
        )
    });
    // ...and succeed only if that class is `Lower`.
    matches!(decisive, Some(Sb::Lower))
}
#[must_use]
pub fn sentence_boundaries(s: &str, _locale: Option<&str>) -> Vec<usize> {
let chars: Vec<char> = s.chars().collect();
let n = chars.len();
if n == 0 {
return vec![0];
}
let sbs: Vec<Sb> = chars.iter().map(|&c| sb(c)).collect();
let offsets: Vec<usize> = {
let mut v = Vec::with_capacity(n);
let mut off = 0usize;
for &c in &chars {
v.push(off);
off += c.len_utf8();
}
v
};
let mut out = vec![0]; let mut ctx = SbCtx::None;
let mut before_saterm: Option<Sb> = Option::None;
let mut last_resolved: Option<Sb> = if !is_sb_transparent(sbs[0]) {
Some(sbs[0])
} else {
Option::None
};
if !is_sb_transparent(sbs[0]) {
match sbs[0] {
Sb::ATerm => {
before_saterm = Option::None;
ctx = SbCtx::SATerm {
is_aterm: true,
after_close: false,
};
}
Sb::STerm => {
before_saterm = Option::None;
ctx = SbCtx::SATerm {
is_aterm: false,
after_close: false,
};
}
_ => {}
}
}
for i in 1..n {
let prev_raw = sbs[i - 1];
let next_raw = sbs[i];
if prev_raw == Sb::Cr && next_raw == Sb::Lf {
continue; }
if matches!(prev_raw, Sb::Sep | Sb::Cr | Sb::Lf) {
out.push(offsets[i]);
ctx = SbCtx::None;
if is_sb_transparent(next_raw) {
last_resolved = Option::None;
} else {
last_resolved = Some(next_raw);
match next_raw {
Sb::ATerm => {
before_saterm = Option::None;
ctx = SbCtx::SATerm {
is_aterm: true,
after_close: false,
};
}
Sb::STerm => {
before_saterm = Option::None;
ctx = SbCtx::SATerm {
is_aterm: false,
after_close: false,
};
}
_ => {}
}
}
continue;
}
if is_sb_transparent(next_raw) {
continue; }
let next = next_raw;
let should_break = match ctx {
SbCtx::SATerm {
is_aterm,
after_close,
} => {
if is_aterm && !after_close && next == Sb::Numeric {
false
} else if is_aterm
&& !after_close
&& next == Sb::Upper
&& matches!(before_saterm, Some(Sb::Upper) | Some(Sb::Lower))
{
false
} else if is_aterm && sb8_lookahead_lower(&sbs, i) {
false
} else if matches!(next, Sb::SContinue | Sb::STerm | Sb::ATerm) {
false
} else if matches!(next, Sb::Close | Sb::Sp | Sb::Sep | Sb::Cr | Sb::Lf) {
false
} else {
true
}
}
SbCtx::SATermSp { is_aterm } => {
if is_aterm && sb8_lookahead_lower(&sbs, i) {
false
} else if matches!(next, Sb::SContinue | Sb::STerm | Sb::ATerm) {
false
} else if matches!(next, Sb::Sp | Sb::Sep | Sb::Cr | Sb::Lf) {
false
} else {
true
}
}
SbCtx::None => {
false
}
};
if should_break {
out.push(offsets[i]);
}
match next {
Sb::ATerm => {
before_saterm = last_resolved;
ctx = SbCtx::SATerm {
is_aterm: true,
after_close: false,
};
}
Sb::STerm => {
before_saterm = last_resolved;
ctx = SbCtx::SATerm {
is_aterm: false,
after_close: false,
};
}
Sb::Close => {
if !should_break && matches!(ctx, SbCtx::SATerm { .. }) {
if let SbCtx::SATerm { is_aterm, .. } = ctx {
ctx = SbCtx::SATerm {
is_aterm,
after_close: true,
};
}
} else {
ctx = SbCtx::None;
}
}
Sb::Sp => match ctx {
SbCtx::SATerm { is_aterm, .. } if !should_break => {
ctx = SbCtx::SATermSp { is_aterm };
}
SbCtx::SATermSp { .. } if !should_break => {
}
_ => {
ctx = SbCtx::None;
}
},
_ => {
ctx = SbCtx::None;
}
}
last_resolved = Some(next);
}
out
}