use crate::data::grapheme_break::{gcb, Gcb};
/// Returns the byte offset at which each grapheme cluster of `s` starts.
///
/// The offsets are produced by [`grapheme_cluster_boundaries`] and appear in
/// increasing order. An empty string yields an empty vector, and the end of
/// the string (`s.len()`) is never included.
#[must_use]
pub fn grapheme_boundaries(s: &str) -> Vec<usize> {
    grapheme_cluster_boundaries(s).collect()
}
/// Iterator over the grapheme cluster boundaries of a string slice.
///
/// Each item is the byte offset at which a grapheme cluster starts. The end
/// of the string (`s.len()`) is not yielded, and an empty string yields
/// nothing.
#[derive(Clone)]
pub struct GraphemeClusterBoundaries<'a> {
/// The string being segmented.
s: &'a str,
/// Byte offset in `s` where the next cluster starts. It is set to
/// `s.len() + 1` once the iterator is exhausted.
next_byte: usize,
}
impl<'a> GraphemeClusterBoundaries<'a> {
#[must_use]
pub const fn new(s: &'a str) -> Self {
Self { s, next_byte: 0 }
}
}
impl Iterator for GraphemeClusterBoundaries<'_> {
    type Item = usize;

    /// Returns the byte offset at which the next grapheme cluster starts, or
    /// `None` once the whole string has been consumed.
    ///
    /// Each call scans forward from the current position until a boundary is
    /// found, stores that position for the following call, and yields the
    /// offset the scan started from. The end of the string is never yielded.
    fn next(&mut self) -> Option<Self::Item> {
        let len = self.s.len();
        let start = self.next_byte;
        // `len + 1` marks an exhausted iterator: `get` returns `None` for any
        // start past the end (and for a start that is not a char boundary).
        let rest = self.s.get(start..)?;
        if rest.is_empty() {
            // Only reachable for an empty input string; mark as exhausted.
            self.next_byte = len + 1;
            return None;
        }

        // All state below is local to the cluster being scanned; it restarts
        // from scratch at every boundary.
        let mut prev_gcb: Option<Gcb> = None;
        let mut prev_prev_gcb: Option<Gcb> = None;
        let mut prev_ch: Option<char> = None;
        // Length of the run of Regional Indicators ending at the previous
        // code point; its parity tells `break_between` whether a pair is
        // already complete.
        let mut ri_run = 0u8;
        // True while the cluster scanned so far ends in
        // `ExtendedPictographic Extend*`.
        let mut pict_run = false;
        // True when the previous code point is a ZWJ that directly follows
        // such a run, i.e. the left-hand side of UAX #29 rule GB11 matched.
        let mut zwj_after_pict = false;

        for (byte_off, ch) in rest.char_indices() {
            let next_gcb = gcb(ch);

            if let Some(p) = prev_gcb {
                let is_break = if p == Gcb::Zwj && next_gcb == Gcb::ExtendedPictographic {
                    // GB11 (`ExtPict Extend* ZWJ x ExtPict`) needs unbounded
                    // look-behind, which the two-class window passed to
                    // `break_between` cannot express: it would also join
                    // `Extend ZWJ ExtPict` when no pictograph precedes the
                    // `Extend` run. No other no-break rule applies to a
                    // ZWJ / ExtendedPictographic pair, so decide it here.
                    // NOTE(review): only `Gcb::Extend` continues the run; if
                    // the data table classifies some Extend characters as
                    // `ConjunctLinker`, confirm whether they should too.
                    !zwj_after_pict
                } else {
                    break_between(prev_prev_gcb, p, next_gcb, prev_ch, Some(ch), ri_run)
                };
                if is_break {
                    self.next_byte = start + byte_off;
                    return Some(start);
                }
            }

            ri_run = if next_gcb == Gcb::RegionalIndicator {
                ri_run.saturating_add(1)
            } else {
                0
            };
            // Order matters: the ZWJ flag must see the run state from
            // *before* the current code point is folded in.
            zwj_after_pict = pict_run && next_gcb == Gcb::Zwj;
            pict_run = next_gcb == Gcb::ExtendedPictographic
                || (pict_run && next_gcb == Gcb::Extend);

            prev_prev_gcb = prev_gcb;
            prev_gcb = Some(next_gcb);
            prev_ch = Some(ch);
        }

        // Reached the end of the string without another boundary: this is the
        // final cluster, so the next call must report exhaustion.
        self.next_byte = len + 1;
        Some(start)
    }
}
/// Decides whether a grapheme cluster boundary lies between two adjacent code
/// points, following the rule order of UAX #29.
///
/// * `prev_prev` - break class of the code point before `prev`, if any.
/// * `prev` / `next` - break classes on either side of the candidate boundary.
/// * `prev_cp` - the code point whose class is `prev`; consulted only for two
///   special-cased conjunct linkers.
/// * `_next_cp` - currently unused; kept so the signature stays stable.
/// * `ri_count_before_next` - number of consecutive Regional Indicators that
///   immediately precede `next`.
///
/// Returns `true` when a boundary exists between `prev` and `next`.
///
/// The checks are order-sensitive: every "never break" rule must run after
/// the unconditional break following controls, and the conjunct special cases
/// must run last.
///
/// NOTE(review): the emoji and Indic rules only look one class further back
/// (`prev_prev`), so they approximate the `Extend*` look-behind of GB11 and
/// GB9c; e.g. an `Extend` before the ZWJ is accepted without checking that
/// the run began at a pictograph.
fn break_between(
    prev_prev: Option<Gcb>,
    prev: Gcb,
    next: Gcb,
    prev_cp: Option<char>,
    _next_cp: Option<char>,
    ri_count_before_next: u8,
) -> bool {
    // GB3: CR LF stays together.
    if prev == Gcb::Cr && next == Gcb::Lf {
        return false;
    }
    // GB4: always break after Control, CR or LF. (GB5, breaking before them,
    // is covered by the final fallthrough since no rule below matches them.)
    if matches!(prev, Gcb::Control | Gcb::Cr | Gcb::Lf) {
        return true;
    }

    // GB6-GB8: Hangul syllable sequences.
    if prev == Gcb::L && matches!(next, Gcb::L | Gcb::V | Gcb::Lv | Gcb::Lvt) {
        return false;
    }
    if matches!(prev, Gcb::Lv | Gcb::V) && matches!(next, Gcb::V | Gcb::T) {
        return false;
    }
    if matches!(prev, Gcb::Lvt | Gcb::T) && next == Gcb::T {
        return false;
    }

    // GB12/GB13: Regional Indicators pair up, so break only when an even
    // number of them already precedes `next`.
    if prev == Gcb::RegionalIndicator && next == Gcb::RegionalIndicator {
        return ri_count_before_next % 2 == 0;
    }

    // GB9/GB9a: never break before Extend, ZWJ or SpacingMark. Conjunct
    // linkers are treated the same way.
    if matches!(
        next,
        Gcb::Extend | Gcb::Zwj | Gcb::SpacingMark | Gcb::ConjunctLinker
    ) {
        return false;
    }

    // GB9b: never break after Prepend (controls were handled above, so this
    // guard only matters for `next`).
    if prev == Gcb::Prepend && !matches!(next, Gcb::Cr | Gcb::Lf | Gcb::Control) {
        return false;
    }

    // GB11 (approximated): ZWJ joins two pictographs when the code point
    // before the ZWJ is a pictograph or an Extend.
    if prev == Gcb::Zwj
        && next == Gcb::ExtendedPictographic
        && matches!(prev_prev, Some(Gcb::ExtendedPictographic | Gcb::Extend))
    {
        return false;
    }

    // ZWJ inside an Indic conjunct: keep the following letter attached when
    // the ZWJ itself followed a linker, a letter or an Extend.
    if prev == Gcb::Zwj
        && next == Gcb::IndicLetter
        && matches!(
            prev_prev,
            Some(Gcb::ConjunctLinker | Gcb::IndicLetter | Gcb::Extend)
        )
    {
        return false;
    }

    // GB9c (approximated): a conjunct linker followed by an Indic letter.
    // Every other class that may follow a linker was already resolved by the
    // "never break before" check above.
    if prev == Gcb::ConjunctLinker && next == Gcb::IndicLetter {
        // U+1B01 (BALINESE SIGN ULU CANDRA) always breaks before the letter
        // and U+1B44 (BALINESE ADEG ADEG) never does.
        // NOTE(review): the rationale for these two exceptions is not visible
        // in this file - confirm against the data tables / conformance tests.
        if prev_cp == Some('\u{1B01}') {
            return true;
        }
        if prev_cp == Some('\u{1B44}') {
            return false;
        }
        // Otherwise the linker joins only when something Indic-looking
        // precedes it; a linker that opens the cluster or follows an
        // unrelated class does not.
        let has_indic_base = matches!(
            prev_prev,
            Some(
                Gcb::IndicLetter
                    | Gcb::ConjunctLinker
                    | Gcb::Extend
                    | Gcb::Zwj
                    | Gcb::SpacingMark
            )
        );
        if has_indic_base {
            return false;
        }
    }

    // GB999: break everywhere else.
    true
}
/// Creates an iterator over the grapheme cluster boundaries of `s`.
///
/// This is a thin wrapper around [`GraphemeClusterBoundaries::new`].
#[must_use]
pub const fn grapheme_cluster_boundaries(s: &str) -> GraphemeClusterBoundaries<'_> {
GraphemeClusterBoundaries::new(s)
}