use super::generated::segmentation as gen;
/// Break class of a code point for grapheme-cluster segmentation.
///
/// This is the type produced by `gen::grapheme_break` (see `gcb`). The variant
/// names appear to mirror the Unicode `Grapheme_Cluster_Break` property values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// NOTE(review): `dead_code` is presumably allowed because some variants are
// only ever constructed by the generated tables — confirm.
#[allow(clippy::upper_case_acronyms, dead_code)]
pub(crate) enum Gcb {
// Fallback class; `is_break` has no special rule for it.
Other,
// `is_break` never splits a CR immediately followed by LF.
CR,
LF,
// `is_break` always breaks on either side of Control, CR and LF otherwise.
Control,
// Extend, ZWJ and SpacingMark never start a new cluster in `is_break`.
Extend,
ZWJ,
// Consecutive regional indicators are paired up by `is_break`.
RegionalIndicator,
// `is_break` never breaks directly after Prepend.
Prepend,
SpacingMark,
// L / V / T / LV / LVT are joined by the three syllable rules in `is_break`
// (presumably the Hangul syllable types — confirm).
L,
V,
T,
LV,
LVT,
}
/// Class of a code point as returned by `gen::indic_conjunct_break` (see
/// `incb`).
///
/// The names appear to mirror the Unicode `Indic_Conjunct_Break` property
/// values. `State::advance` and `is_break` use them to keep a
/// consonant / linker / consonant sequence inside one cluster.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(dead_code)]
pub(crate) enum Incb {
// No role in a conjunct sequence; resets the conjunct tracking.
None,
// Starts (or restarts) a conjunct sequence.
Consonant,
// Marks the running sequence as "linker seen" when one is in progress.
Linker,
// Keeps the running sequence in its current state when one is in progress.
Extend,
}
/// Break class of a code point for word segmentation.
///
/// This is the type produced by `gen::word_break` (see `wb`). The variant
/// names appear to mirror the Unicode `Word_Break` property values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(clippy::upper_case_acronyms, dead_code)]
pub(crate) enum Wb {
// Fallback class; also used by `Words::next` as the "nothing there" value
// for the look-behind and look-ahead slots.
Other,
// CR, LF and Newline are never extended by `wb_unit` and force a break on
// both sides in `word_break` (except inside a CR LF pair).
CR,
LF,
Newline,
// Extend, Format and ZWJ are absorbed into the preceding unit by `wb_unit`.
Extend,
ZWJ,
RegionalIndicator,
Format,
Katakana,
// HebrewLetter and ALetter are the two classes accepted by `ah`.
HebrewLetter,
ALetter,
SingleQuote,
DoubleQuote,
MidNumLet,
MidLetter,
MidNum,
Numeric,
ExtendNumLet,
WSegSpace,
}
/// Break class of a code point for sentence segmentation.
///
/// This is the type produced by `gen::sentence_break` (see `sb`). The variant
/// names appear to mirror the Unicode `Sentence_Break` property values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(clippy::upper_case_acronyms, dead_code)]
pub(crate) enum Sb {
// Fallback class; also the initial look-behind value in `Sentences::next`.
Other,
// CR, LF and Sep are never extended by `sb_unit`; `sentence_break` always
// breaks after them (except inside a CR LF pair).
CR,
LF,
// Extend and Format are absorbed into the preceding unit by `sb_unit`.
Extend,
Sep,
Format,
Sp,
Lower,
Upper,
OLetter,
Numeric,
// ATerm and STerm start a terminator sequence in `term_next`.
ATerm,
SContinue,
STerm,
Close,
}
/// Break class of a code point for line breaking.
///
/// This is the type produced by `gen::line_break` (see `lb`). The two-letter
/// names appear to follow the Unicode `Line_Break` property value aliases;
/// consult UAX #14 for the meaning of each one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(clippy::upper_case_acronyms, dead_code)]
pub(crate) enum Lb {
AI,
AK,
// `lb_unit` substitutes AL for a unit whose base is CM or ZWJ.
AL,
AP,
AS,
B2,
BA,
BB,
// BK, CR, LF, NL, SP and ZW never absorb following CM / ZWJ in `lb_unit`.
BK,
CB,
CJ,
CL,
// CM and ZWJ are absorbed into the preceding unit by `lb_unit`.
CM,
CP,
CR,
EB,
EM,
EX,
GL,
H2,
H3,
HH,
HL,
HY,
ID,
IN,
IS,
JL,
JT,
JV,
LF,
NL,
NS,
NU,
OP,
PO,
PR,
QU,
RI,
SA,
SG,
SP,
SY,
VF,
VI,
WJ,
// Also used by `LineBreaks::next` as the placeholder for "nothing seen yet".
XX,
ZW,
ZWJ,
}
/// Returns the grapheme-break class that the generated table assigns to `c`.
#[inline]
fn gcb(c: char) -> Gcb {
    let code_point = u32::from(c);
    gen::grapheme_break(code_point)
}
/// Reports whether the generated table flags `c` as extended-pictographic.
#[inline]
fn pictographic(c: char) -> bool {
    let code_point = u32::from(c);
    gen::extended_pictographic(code_point)
}
/// Returns the conjunct-break class that the generated table assigns to `c`.
#[inline]
fn incb(c: char) -> Incb {
    let code_point = u32::from(c);
    gen::indic_conjunct_break(code_point)
}
/// Progress through a "pictographic, extenders, ZWJ" sequence, tracked by
/// `State` while scanning a grapheme cluster.
#[derive(Clone, Copy, PartialEq)]
enum Emoji {
// Not inside such a sequence.
None,
// `Pictographic`: the last scalar was pictographic, or an `Extend` that
// followed the `Pictographic` state.
// `Zwj`: a ZWJ was seen while in the `Pictographic` state; `is_break` then
// refuses to break before a following pictographic scalar.
Pictographic, Zwj, }
/// Progress through a "consonant, linker, consonant" sequence, tracked by
/// `State` while scanning a grapheme cluster.
#[derive(Clone, Copy, PartialEq)]
enum Conjunct {
// Not inside such a sequence.
None,
// `Consonant`: a consonant has been seen and no linker since.
// `LinkerSeen`: a linker followed the consonant; `is_break` then refuses to
// break before a following consonant.
Consonant, LinkerSeen, }
/// Context carried along a grapheme cluster for the rules in `is_break` that
/// need more than the two adjacent scalars.
#[derive(Clone, Copy)]
struct State {
// Length of the run of consecutive `RegionalIndicator` scalars ending at the
// most recently consumed scalar (0 when that scalar is not one).
ri: u32,
// Position inside a pictographic / ZWJ sequence.
emoji: Emoji,
// Position inside a consonant / linker sequence.
conjunct: Conjunct,
}
impl State {
    /// Builds the state describing a cluster that so far consists of `c` only.
    fn start(c: char) -> Self {
        let emoji = if pictographic(c) {
            Emoji::Pictographic
        } else {
            Emoji::None
        };
        let conjunct = match incb(c) {
            Incb::Consonant => Conjunct::Consonant,
            _ => Conjunct::None,
        };
        let ri = match gcb(c) {
            Gcb::RegionalIndicator => 1,
            _ => 0,
        };
        State {
            ri,
            emoji,
            conjunct,
        }
    }

    /// Folds the next scalar `c` of the current cluster into the state.
    fn advance(&mut self, c: char) {
        let class = gcb(c);

        // Regional-indicator run length: grows on RI, resets on anything else.
        self.ri = match class {
            Gcb::RegionalIndicator => self.ri + 1,
            _ => 0,
        };

        // Pictographic sequence: a pictographic scalar always (re)starts it;
        // Extend keeps it alive; ZWJ moves it to the "joiner seen" state.
        let after_pictographic = self.emoji == Emoji::Pictographic;
        self.emoji = if pictographic(c) {
            Emoji::Pictographic
        } else {
            match class {
                Gcb::Extend if after_pictographic => Emoji::Pictographic,
                Gcb::ZWJ if after_pictographic => Emoji::Zwj,
                _ => Emoji::None,
            }
        };

        // Conjunct sequence: linkers and extenders only count while a
        // consonant-led sequence is already running.
        let in_sequence = self.conjunct != Conjunct::None;
        self.conjunct = match (incb(c), in_sequence) {
            (Incb::Consonant, _) => Conjunct::Consonant,
            (Incb::Linker, true) => Conjunct::LinkerSeen,
            (Incb::Extend, true) => self.conjunct,
            _ => Conjunct::None,
        };
    }
}
/// Decides whether a grapheme-cluster boundary lies between `prev` and `cur`.
///
/// `st` must describe the cluster up to and including `prev`. Returns `true`
/// when a boundary is present.
fn is_break(prev: char, cur: char, st: &State) -> bool {
    use Gcb::*;
    let before = gcb(prev);
    let after = gcb(cur);

    // Rules that depend only on the two adjacent classes, in priority order.
    match (before, after) {
        // CR LF stays together.
        (CR, LF) => return false,
        // Otherwise controls and line endings are isolated on both sides.
        (Control | CR | LF, _) | (_, Control | CR | LF) => return true,
        // Syllable sequences built from L / V / T / LV / LVT stay together.
        (L, L | V | LV | LVT) | (LV | V, V | T) | (LVT | T, T) => return false,
        // Never break before an extender, joiner or spacing mark, nor after
        // a prepend character.
        (_, Extend | ZWJ | SpacingMark) | (Prepend, _) => return false,
        _ => {}
    }

    // Rules that need the accumulated state; each one suppresses the break.
    let conjunct_joins = st.conjunct == Conjunct::LinkerSeen && incb(cur) == Incb::Consonant;
    let emoji_joins = st.emoji == Emoji::Zwj && pictographic(cur);
    let flag_joins =
        before == RegionalIndicator && after == RegionalIndicator && st.ri % 2 == 1;

    !(conjunct_joins || emoji_joins || flag_joins)
}
/// Iterator over the grapheme clusters of a string slice.
///
/// Created by [`graphemes`]. Each item is a sub-slice of the original string;
/// concatenating the items in order reproduces the input.
#[derive(Debug, Clone)]
pub struct Graphemes<'a> {
    /// The complete input string.
    s: &'a str,
    /// Byte offset into `s` at which the next cluster starts.
    pos: usize,
}
impl<'a> Iterator for Graphemes<'a> {
    type Item = &'a str;

    /// Returns the next grapheme cluster, or `None` once the input is used up.
    fn next(&mut self) -> Option<&'a str> {
        let remaining = &self.s[self.pos..];
        let mut scalars = remaining.char_indices();
        // An empty remainder means iteration is finished.
        let (_, head) = scalars.next()?;

        let mut state = State::start(head);
        let mut left = head;
        // Walk forward until the first boundary; scalars that stay inside the
        // cluster are folded into `state` as we go.
        let boundary = scalars.find_map(|(offset, right)| {
            if is_break(left, right, &state) {
                Some(offset)
            } else {
                state.advance(right);
                left = right;
                None
            }
        });

        // No boundary found: the cluster runs to the end of the string.
        let len = boundary.unwrap_or(remaining.len());
        self.pos += len;
        Some(&remaining[..len])
    }
}
/// Returns an iterator over the grapheme clusters of `s`, starting at its
/// first byte.
#[must_use]
pub fn graphemes(s: &str) -> Graphemes<'_> {
    let pos = 0;
    Graphemes { s, pos }
}
/// Returns the word-break class that the generated table assigns to `c`.
#[inline]
fn wb(c: char) -> Wb {
    let code_point = u32::from(c);
    gen::word_break(code_point)
}
/// One unit of word segmentation as built by `wb_unit`: a base scalar plus
/// any `Extend` / `Format` / `ZWJ` scalars absorbed after it.
#[derive(Clone, Copy)]
struct WbUnit {
// Word-break class of the base scalar.
cat: Wb,
// `pictographic`: whether the base scalar is extended-pictographic.
// `ends_zwj`: whether the last scalar of the unit (the base itself or the
//   last absorbed one) has class `ZWJ`.
// `bare`: whether nothing was absorbed after the base.
// `end`: byte offset in the source string one past the unit.
pictographic: bool, ends_zwj: bool, bare: bool, end: usize, }
/// Reads the word-segmentation unit that starts at byte offset `i` of `s`.
///
/// The unit is the scalar at `i` followed by every directly following
/// `Extend`, `Format` or `ZWJ` scalar, unless the base is `CR`, `LF` or
/// `Newline`, in which case nothing is absorbed.
///
/// # Panics
///
/// Panics if `i` is not a char boundary of `s` or if no scalar starts at `i`
/// (i.e. `i == s.len()`).
fn wb_unit(s: &str, i: usize) -> WbUnit {
    let mut tail = s[i..].chars();
    let base = tail.next().unwrap();
    let cat = wb(base);
    let base_end = i + base.len_utf8();

    let mut end = base_end;
    let mut ends_zwj = matches!(cat, Wb::ZWJ);
    let absorbs_followers = !matches!(cat, Wb::CR | Wb::LF | Wb::Newline);
    if absorbs_followers {
        // `tail` now starts right after the base scalar.
        for follower in tail {
            let class = wb(follower);
            if !matches!(class, Wb::Extend | Wb::Format | Wb::ZWJ) {
                break;
            }
            ends_zwj = class == Wb::ZWJ;
            end += follower.len_utf8();
        }
    }

    let bare = end == base_end;
    WbUnit {
        cat,
        pictographic: pictographic(base),
        ends_zwj,
        bare,
        end,
    }
}
/// Returns `true` when `w` is one of the two letter classes, `ALetter` or
/// `HebrewLetter`.
#[inline]
fn ah(w: Wb) -> bool {
    w == Wb::ALetter || w == Wb::HebrewLetter
}
/// Decides whether a word boundary lies between the units `prev` and `cur`.
///
/// * `prev2` – class of the unit before `prev` (`Wb::Other` when there is none).
/// * `next` – class of the unit after `cur` (`Wb::Other` when there is none).
/// * `ri` – length of the run of `RegionalIndicator` units ending at `prev`.
///
/// Returns `true` when a boundary is present.
#[allow(clippy::too_many_arguments)]
fn word_break(prev2: Wb, prev: &WbUnit, cur: &WbUnit, next: Wb, ri: u32) -> bool {
    use Wb::*;
    let left = prev.cat;
    let right = cur.cat;

    // CR LF stays together; any other contact with a line ending breaks.
    if (left, right) == (CR, LF) {
        return false;
    }
    let line_ending = |w: Wb| matches!(w, Newline | CR | LF);
    if line_ending(left) || line_ending(right) {
        return true;
    }

    let mid_letter = |w: Wb| matches!(w, MidLetter | MidNumLet | SingleQuote);
    let mid_num = |w: Wb| matches!(w, MidNum | MidNumLet | SingleQuote);
    let word_like = |w: Wb| matches!(w, ALetter | HebrewLetter | Numeric | Katakana);

    // Every remaining rule suppresses the break, so they can be combined.
    let joined =
        // ZWJ followed by a pictographic base.
        (prev.ends_zwj && cur.pictographic)
        // Runs of white space, only when the left unit absorbed nothing.
        || (left == WSegSpace && right == WSegSpace && prev.bare)
        // Never break before an extender, format character or joiner.
        || matches!(right, Extend | Format | ZWJ)
        // Letter sequences, with an optional single punctuation in between.
        || (ah(left) && ah(right))
        || (ah(left) && mid_letter(right) && ah(next))
        || (ah(prev2) && mid_letter(left) && ah(right))
        // Hebrew letters with quotes.
        || (left == HebrewLetter && right == SingleQuote)
        || (left == HebrewLetter && right == DoubleQuote && next == HebrewLetter)
        || (prev2 == HebrewLetter && left == DoubleQuote && right == HebrewLetter)
        // Digit sequences, digits next to letters, digits with separators.
        || (left == Numeric && right == Numeric)
        || (ah(left) && right == Numeric)
        || (left == Numeric && ah(right))
        || (prev2 == Numeric && mid_num(left) && right == Numeric)
        || (left == Numeric && mid_num(right) && next == Numeric)
        // Katakana sequences.
        || (left == Katakana && right == Katakana)
        // Connector punctuation glued to word-like units on either side.
        || ((word_like(left) || left == ExtendNumLet) && right == ExtendNumLet)
        || (left == ExtendNumLet && word_like(right))
        // Regional indicators are paired up from the start of their run.
        || (left == RegionalIndicator && right == RegionalIndicator && ri % 2 == 1);

    !joined
}
/// Iterator over the word-boundary segments of a string slice.
///
/// Created by [`words`]. Each item is a sub-slice of the original string
/// running from one word boundary to the next; concatenating the items in
/// order reproduces the input.
#[derive(Debug, Clone)]
pub struct Words<'a> {
    /// The complete input string.
    s: &'a str,
    /// Byte offset into `s` at which the next segment starts.
    pos: usize,
}
impl<'a> Iterator for Words<'a> {
    type Item = &'a str;

    /// Returns the next segment between two word boundaries, or `None` once
    /// the input is used up.
    fn next(&mut self) -> Option<&'a str> {
        let text = self.s;
        let total = text.len();
        let start = self.pos;
        if start >= total {
            return None;
        }

        // `left` is the unit just before the candidate boundary and
        // `before_left` the class of the unit preceding it.
        let mut left = wb_unit(text, start);
        let mut before_left = Wb::Other;
        let mut ri_run = u32::from(left.cat == Wb::RegionalIndicator);
        let mut cursor = left.end;

        while cursor < total {
            let right = wb_unit(text, cursor);
            // One unit of look-ahead; `Other` stands in for end of text.
            let lookahead = match right.end {
                after if after < total => wb_unit(text, after).cat,
                _ => Wb::Other,
            };
            if word_break(before_left, &left, &right, lookahead, ri_run) {
                break;
            }
            ri_run = match right.cat {
                Wb::RegionalIndicator => ri_run + 1,
                _ => 0,
            };
            before_left = left.cat;
            cursor = right.end;
            left = right;
        }

        self.pos = cursor;
        Some(&text[start..cursor])
    }
}
/// Returns an iterator over the word-boundary segments of `s`, starting at
/// its first byte.
#[must_use]
pub fn words(s: &str) -> Words<'_> {
    let pos = 0;
    Words { s, pos }
}
/// Returns the sentence-break class that the generated table assigns to `c`.
#[inline]
fn sb(c: char) -> Sb {
    let code_point = u32::from(c);
    gen::sentence_break(code_point)
}
/// Reads the sentence-segmentation unit that starts at byte offset `i` of `s`.
///
/// Returns the class of the scalar at `i` together with the byte offset one
/// past the unit. The unit also covers every directly following `Extend` or
/// `Format` scalar, unless the base is `CR`, `LF` or `Sep`.
///
/// # Panics
///
/// Panics if `i` is not a char boundary of `s` or if no scalar starts at `i`
/// (i.e. `i == s.len()`).
fn sb_unit(s: &str, i: usize) -> (Sb, usize) {
    let mut tail = s[i..].chars();
    let base = tail.next().unwrap();
    let cat = sb(base);
    let after_base = i + base.len_utf8();

    // Line endings and separators stand alone.
    if matches!(cat, Sb::CR | Sb::LF | Sb::Sep) {
        return (cat, after_base);
    }

    // `tail` now starts right after the base scalar.
    let absorbed: usize = tail
        .take_while(|&c| matches!(sb(c), Sb::Extend | Sb::Format))
        .map(char::len_utf8)
        .sum();
    (cat, after_base + absorbed)
}
/// Position inside a "terminator, closers, spaces" sequence, as advanced by
/// `term_next`. The `A*` states follow an `ATerm`, the `S*` states an `STerm`.
#[derive(Clone, Copy, PartialEq)]
enum Term {
// Not inside a terminator sequence.
None,
// The last unit was an `ATerm`.
A,
// `ATerm` followed by one or more `Close` units.
AClose,
// `ATerm`, optional `Close` units, then one or more `Sp` units.
ASp,
// The last unit was an `STerm`.
S,
// `STerm` followed by one or more `Close` units.
SClose,
// `STerm`, optional `Close` units, then one or more `Sp` units.
SSp,
}
/// Advances the terminator-sequence state `t` over a unit of class `c`.
///
/// A terminator always starts a fresh sequence; `Close` and `Sp` extend a
/// running one (closers may not follow spaces); anything else ends it.
fn term_next(t: Term, c: Sb) -> Term {
    match (c, t) {
        (Sb::ATerm, _) => Term::A,
        (Sb::STerm, _) => Term::S,
        (Sb::Close, Term::A | Term::AClose) => Term::AClose,
        (Sb::Close, Term::S | Term::SClose) => Term::SClose,
        (Sb::Sp, Term::A | Term::AClose | Term::ASp) => Term::ASp,
        (Sb::Sp, Term::S | Term::SClose | Term::SSp) => Term::SSp,
        _ => Term::None,
    }
}
/// Returns `true` while inside a sequence that was started by an `ATerm`.
#[inline]
fn in_aterm_seq(t: Term) -> bool {
    match t {
        Term::A | Term::AClose | Term::ASp => true,
        Term::None | Term::S | Term::SClose | Term::SSp => false,
    }
}
/// Returns `true` while inside any terminator sequence (`ATerm`- or
/// `STerm`-led).
#[inline]
fn in_term_seq(t: Term) -> bool {
    t != Term::None
}
/// Returns `true` while inside a terminator sequence that has not yet reached
/// its trailing spaces (terminator, optionally followed by closers).
#[inline]
fn in_close_phase(t: Term) -> bool {
    match t {
        Term::A | Term::AClose | Term::S | Term::SClose => true,
        Term::None | Term::ASp | Term::SSp => false,
    }
}
/// Scans forward from byte offset `at` for the first "decisive" unit.
///
/// Returns `(true, offset)` when that unit is `Lower`, and `(false, offset)`
/// when it is `OLetter`, `Upper`, `Sep`, `CR`, `LF`, `STerm` or `ATerm`;
/// `offset` is where the decisive unit starts. Every other class is skipped.
/// If the end of `s` is reached first, the result is `(false, s.len())`.
fn sb8_lower_ahead_at(s: &str, mut at: usize) -> (bool, usize) {
    let limit = s.len();
    loop {
        if at >= limit {
            return (false, limit);
        }
        let (cat, end) = sb_unit(s, at);
        if cat == Sb::Lower {
            return (true, at);
        }
        let stops_scan = matches!(
            cat,
            Sb::OLetter | Sb::Upper | Sb::Sep | Sb::CR | Sb::LF | Sb::STerm | Sb::ATerm
        );
        if stops_scan {
            return (false, at);
        }
        at = end;
    }
}
/// Decides whether a sentence boundary lies between the units `prev` and
/// `cur`.
///
/// * `prev2` – class of the unit before `prev`.
/// * `term` – terminator-sequence state up to and including `prev`.
/// * `lower_ahead` – whether the forward scan from `cur` found a `Lower`
///   before any other decisive class.
///
/// Returns `true` when a boundary is present.
fn sentence_break(prev2: Sb, prev: Sb, term: Term, cur: Sb, lower_ahead: bool) -> bool {
    use Sb::*;

    // CR LF stays together; otherwise always break after a line ending or
    // separator.
    if (prev, cur) == (CR, LF) {
        return false;
    }
    if matches!(prev, Sep | CR | LF) {
        return true;
    }

    // Outside a terminator sequence none of the remaining rules can yield a
    // break.
    if !in_term_seq(term) {
        return false;
    }

    // Inside a terminator sequence we break unless one of these holds.
    let suppressed =
        // Terminator directly followed by a digit.
        (prev == ATerm && cur == Numeric)
        // Cased letter, terminator, uppercase letter.
        || (matches!(prev2, Upper | Lower) && prev == ATerm && cur == Upper)
        // An ATerm-led sequence that is followed by lowercase text.
        || (in_aterm_seq(term) && lower_ahead)
        // Continuation punctuation or another terminator.
        || matches!(cur, SContinue | STerm | ATerm)
        // Closers are only accepted before the trailing spaces have started.
        || (in_close_phase(term) && cur == Close)
        // Trailing spaces and the final separator stay with the sentence.
        || matches!(cur, Sp | Sep | CR | LF);

    !suppressed
}
/// Iterator over the sentences of a string slice.
///
/// Created by [`sentences`]. Each item is a sub-slice of the original string
/// running from one sentence boundary to the next; concatenating the items
/// in order reproduces the input.
#[derive(Debug, Clone)]
pub struct Sentences<'a> {
    /// The complete input string.
    s: &'a str,
    /// Byte offset into `s` at which the next sentence starts.
    pos: usize,
}
impl<'a> Iterator for Sentences<'a> {
    type Item = &'a str;

    /// Returns the next sentence, or `None` once the input is used up.
    fn next(&mut self) -> Option<&'a str> {
        let text = self.s;
        let total = text.len();
        let start = self.pos;
        if start >= total {
            return None;
        }

        let (first, first_end) = sb_unit(text, start);
        let mut left = first;
        let mut before_left = Sb::Other;
        let mut term = term_next(Term::None, first);
        let mut cursor = first_end;

        // Memoised result of the forward scan: `(found_lower, decided_at)`.
        // It stays valid for every cursor position before `decided_at`, which
        // keeps repeated look-aheads from rescanning the same text.
        let mut scan: (bool, usize) = (false, 0);

        while cursor < total {
            let (right, right_end) = sb_unit(text, cursor);

            // The look-ahead is only consulted inside an ATerm-led sequence.
            let lower_ahead = if in_aterm_seq(term) {
                if cursor >= scan.1 {
                    scan = sb8_lower_ahead_at(text, cursor);
                }
                scan.0
            } else {
                false
            };

            if sentence_break(before_left, left, term, right, lower_ahead) {
                break;
            }

            term = term_next(term, right);
            before_left = left;
            left = right;
            cursor = right_end;
        }

        self.pos = cursor;
        Some(&text[start..cursor])
    }
}
/// Returns an iterator over the sentences of `s`, starting at its first byte.
#[must_use]
pub fn sentences(s: &str) -> Sentences<'_> {
    let pos = 0;
    Sentences { s, pos }
}
/// Returns the line-break class that the generated table assigns to `c`.
#[inline]
fn lb(c: char) -> Lb {
    let code_point = u32::from(c);
    gen::line_break(code_point)
}
/// One unit of line breaking as built by `lb_unit`: a base scalar plus any
/// `CM` / `ZWJ` scalars absorbed after it.
struct LbUnit {
// Line-break class of the base scalar, with `CM` and `ZWJ` replaced by `AL`.
cls: Lb,
// The base scalar itself; used for width and general-category lookups.
base: char,
// Whether the last scalar of the unit (the base itself or the last absorbed
// one) has class `ZWJ`.
ends_zwj: bool,
// Byte offset in the source string one past the unit.
end: usize,
}
impl LbUnit {
fn wide(&self) -> bool {
use super::width::EastAsianWidth::*;
matches!(
super::width::east_asian_width(self.base),
Fullwidth | Wide | Halfwidth
)
}
fn pi(&self) -> bool {
super::generated::general_category::general_category(self.base as u32)
== super::category::GeneralCategory::InitialPunctuation
}
fn pf(&self) -> bool {
super::generated::general_category::general_category(self.base as u32)
== super::category::GeneralCategory::FinalPunctuation
}
}
fn lb_unit(s: &str, i: usize) -> LbUnit {
use Lb::*;
let base = s[i..].chars().next().unwrap();
let raw = lb(base);
let cls = if matches!(raw, CM | ZWJ) { AL } else { raw };
let mut end = i + base.len_utf8();
let mut ends_zwj = raw == ZWJ;
if !matches!(raw, BK | CR | LF | NL | SP | ZW) {
for c in s[end..].chars() {
match lb(c) {
t @ (CM | ZWJ) => {
ends_zwj = t == ZWJ;
end += c.len_utf8();
}
_ => break,
}
}
}
LbUnit {
cls,
base,
ends_zwj,
end,
}
}
/// Context carried along a line segment by `LineBreaks::next` for the rules
/// in `line_break_before` that look past the two adjacent units.
///
/// Fields, as maintained by `LineBreaks::next`:
/// * `regional` – class of the most recent unit that is not `SP` (`XX` when
///   the segment starts with `SP` and no other unit has been seen yet).
/// * `r_pi` – whether that unit's base is initial punctuation.
/// * `before` – the value `regional` held before its latest update (`XX`
///   initially).
/// * `sp` – whether the most recent unit was `SP`.
/// * `open_ri` – length of the run of `RI` units ending at the most recent
///   unit.
/// * `num` – set by `NU`, kept across `SY` / `IS` / `CL` / `CP`, cleared by
///   any other class.
struct LbState {
regional: Lb, r_pi: bool, before: Lb, sp: bool, open_ri: u32, num: bool, }
/// Returns `true` when `c` is `AL` or `HL`.
#[inline]
fn al_hl(c: Lb) -> bool {
    c == Lb::AL || c == Lb::HL
}
/// Decides whether a line may (or must) be broken between `prev` and `cur`.
///
/// * `prev2` – the unit before `prev`, if any has been seen in this segment.
/// * `next` – the unit after `cur`, or `None` at the end of the text.
/// * `st` – context accumulated up to and including `prev`.
///
/// Returns `(break, mandatory)`: `break` is `true` when a break is present
/// before `cur`, and `mandatory` is `true` only for the hard breaks after
/// `BK`, `CR`, `LF` and `NL`.
///
/// The checks run strictly in order and the first one that matches wins, so
/// the sequence of the `if` statements is significant. The rule labels in the
/// comments below are the reviewer's mapping onto UAX #14 and should be
/// confirmed against the Unicode version the generated tables target.
fn line_break_before(
prev2: Option<&LbUnit>,
prev: &LbUnit,
cur: &LbUnit,
next: Option<&LbUnit>,
st: &LbState,
) -> (bool, bool) {
use Lb::*;
let p = prev.cls;
let c = cur.cls;
// `l` is the class of the last non-space unit, i.e. what stands to the left
// once any run of spaces before `cur` is skipped.
let l = st.regional;
// Hard breaks: always break after BK, LF, NL ...
if p == BK || p == LF || p == NL {
return (true, true); }
// ... and after CR, unless it is immediately followed by LF.
if p == CR {
return if c == LF {
(false, false)
} else {
(true, true)
}; }
// Never break before a hard line break.
if matches!(c, BK | CR | LF | NL) {
return (false, false); }
// Never break before a space or zero-width space.
if matches!(c, SP | ZW) {
return (false, false); }
// Break after a zero-width space, even with spaces in between.
if l == ZW {
return (true, false); }
// No break after a zero-width joiner.
if prev.ends_zwj {
return (false, false); }
// Word joiner: no break on either side.
if c == WJ || p == WJ {
return (false, false); }
// Glue: no break after it ...
if p == GL {
return (false, false); }
// ... nor before it, except after a space or a hyphen-like class.
if c == GL && !st.sp && !matches!(l, BA | HY | HH) {
return (false, false); }
// No break before closing punctuation, exclamation or solidus.
if matches!(c, CL | CP | EX | SY) {
return (false, false); }
// No break after opening punctuation, even with spaces in between.
if l == OP {
return (false, false); }
// No break after an initial quotation mark (spaces allowed) when the quote
// itself follows one of the listed classes or starts the segment (`XX`).
if l == QU && st.r_pi && matches!(st.before, XX | OP | QU | GL | SP | ZW) {
return (false, false);
}
// No break before a final quotation mark when what follows is one of the
// listed classes, an East Asian wide unit, or the end of the text.
if c == QU && cur.pf() {
let ok = next.is_none_or(|n| {
matches!(
n.cls,
SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW
) || n.wide()
});
if ok {
return (false, false);
}
}
// Break between a space and an infix separator that introduces a number.
if st.sp && c == IS && next.is_some_and(|n| n.cls == NU) {
return (true, false);
}
// Otherwise never break before an infix separator.
if c == IS {
return (false, false);
}
// No break between closing punctuation and a non-starter, spaces allowed.
if matches!(l, CL | CP) && c == NS {
return (false, false); }
// No break inside a B2 pair, spaces allowed.
if l == B2 && c == B2 {
return (false, false); }
// Break after spaces.
if st.sp {
return (true, false); }
// Quotation marks: no break before or after unless the quote is surrounded
// by East Asian wide units on the relevant sides.
if c == QU && (!prev.wide() || next.is_none_or(|n| !n.wide())) {
return (false, false);
}
if p == QU && (!cur.wide() || prev2.is_none_or(|n| !n.wide())) {
return (false, false);
}
// Contingent break opportunity on either side of CB.
if c == CB || p == CB {
return (true, false); }
// No break after a hyphen that starts a word (nothing, or one of the listed
// classes, in front of it) when a letter follows.
if matches!(p, HY | HH)
&& al_hl(c)
&& prev2.is_none_or(|u| matches!(u.cls, BK | CR | LF | NL | SP | ZW | CB | GL))
{
return (false, false);
}
// No break before BA / HY / NS / HH, nor after BB.
if matches!(c, BA | HY | NS | HH) || p == BB {
return (false, false); }
// No break after the hyphen in "Hebrew letter, hyphen, non-Hebrew".
// NOTE(review): only `HY` is tested here, whereas the neighbouring hyphen
// rules accept `HY | HH` — confirm against UAX #14 LB21a whether `HH` should
// be included as well.
if prev2.is_some_and(|u| u.cls == HL) && p == HY && c != HL {
return (false, false); }
// No break between a solidus and a Hebrew letter.
if p == SY && c == HL {
return (false, false); }
// No break before an inseparable character.
if c == IN {
return (false, false); }
// No break between letters and digits.
if (al_hl(p) && c == NU) || (p == NU && al_hl(c)) {
return (false, false); }
// No break between a numeric prefix and ideographs / emoji, nor between
// those and a numeric postfix.
if (p == PR && matches!(c, ID | EB | EM)) || (matches!(p, ID | EB | EM) && c == PO) {
return (false, false); }
// No break between numeric prefix / postfix and letters.
if (matches!(p, PR | PO) && al_hl(c)) || (al_hl(p) && matches!(c, PR | PO)) {
return (false, false); }
// Numeric expressions: no break directly before a digit after the listed
// classes, nor between a prefix / postfix and an OP or HY leading a digit.
if (matches!(p, PR | PO | OP | HY | IS) && c == NU)
|| (matches!(p, PR | PO) && matches!(c, OP | HY) && next.is_some_and(|n| n.cls == NU))
{
return (false, false);
}
// Inside a running number (`st.num`), keep digits, separators and
// prefix / postfix symbols attached.
if st.num && matches!(c, NU | SY | IS | PO | PR) {
return (false, false);
}
// Syllable-block sequences built from JL / JV / JT / H2 / H3 stay together.
if (p == JL && matches!(c, JL | JV | H2 | H3))
|| (matches!(p, JV | H2) && matches!(c, JV | JT))
|| (matches!(p, JT | H3) && c == JT)
{
return (false, false);
}
// The same classes stay attached to a following IN / PO and a preceding PR.
if (matches!(p, JL | JV | JT | H2 | H3) && matches!(c, IN | PO))
|| (p == PR && matches!(c, JL | JV | JT | H2 | H3))
{
return (false, false);
}
// No break between letters.
if al_hl(p) && al_hl(c) {
return (false, false); }
// `dc` recognises a unit that is U+25CC DOTTED CIRCLE or has class AK / AS.
let dc = |u: &LbUnit| u.base == '\u{25CC}' || matches!(u.cls, AK | AS);
let dc_prev = dc(prev);
let dc_cur = dc(cur);
// Sequences built around AP / AK / AS / VI / VF stay together.
if (p == AP && (dc_cur || c == AK || c == AS)) || (dc_prev && matches!(c, VF | VI)) || (p == VI && prev2.is_some_and(dc) && (c == AK || cur.base == '\u{25CC}')) || (dc_prev && dc_cur && next.is_some_and(|n| n.cls == VF))
{
return (false, false);
}
// No break between an infix separator and a letter.
if p == IS && al_hl(c) {
return (false, false); }
// No break between letters / digits and a non-wide opening parenthesis ...
if matches!(p, AL | HL | NU) && c == OP && !cur.wide() {
return (false, false);
}
// ... nor between a non-wide closing parenthesis and letters / digits.
if p == CP && matches!(c, AL | HL | NU) && !prev.wide() {
return (false, false);
}
// Regional indicators are paired up from the start of their run.
if p == RI && c == RI && st.open_ri % 2 == 1 {
return (false, false); }
// No break between an emoji base (or an unassigned pictographic scalar) and
// an emoji modifier.
if c == EM
&& (p == EB
|| (pictographic(prev.base)
&& super::generated::general_category::general_category(prev.base as u32)
== super::category::GeneralCategory::Unassigned))
{
return (false, false);
}
// Everything else: an ordinary (non-mandatory) break opportunity.
(true, false) }
/// One item produced by `LineBreaks`: a slice of the input that ends at a
/// line-break position.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LineBreak<'a> {
/// The text from the previous break position up to this one.
pub text: &'a str,
/// `true` when the break that ends `text` was reported as mandatory by the
/// break rules. It stays `false` when `text` simply runs to the end of the
/// input without a break having been found.
pub mandatory: bool,
}
/// Iterator over the line-break segments of a string slice.
///
/// Created by [`line_breaks`]. Each item is a [`LineBreak`] whose `text` is a
/// sub-slice of the original string ending at a break position;
/// concatenating the `text` fields in order reproduces the input.
#[derive(Debug, Clone)]
pub struct LineBreaks<'a> {
    /// The complete input string.
    s: &'a str,
    /// Byte offset into `s` at which the next segment starts.
    pos: usize,
}
impl<'a> Iterator for LineBreaks<'a> {
    type Item = LineBreak<'a>;

    /// Returns the next segment up to a break position, or `None` once the
    /// input is used up.
    fn next(&mut self) -> Option<LineBreak<'a>> {
        let text = self.s;
        let total = text.len();
        let start = self.pos;
        if start >= total {
            return None;
        }

        // Seed the context from the first unit of the segment. A leading
        // space is recorded in `sp` only and leaves `regional` at `XX`.
        let mut left = lb_unit(text, start);
        let starts_with_space = left.cls == Lb::SP;
        let mut st = LbState {
            regional: if starts_with_space { Lb::XX } else { left.cls },
            r_pi: !starts_with_space && left.pi(),
            before: Lb::XX,
            sp: starts_with_space,
            open_ri: u32::from(left.cls == Lb::RI),
            num: left.cls == Lb::NU,
        };

        let mut before_left: Option<LbUnit> = None;
        let mut cursor = left.end;
        let mut mandatory = false;

        while cursor < total {
            let right = lb_unit(text, cursor);
            // One unit of look-ahead; `None` stands for end of text.
            let lookahead = if right.end < total {
                Some(lb_unit(text, right.end))
            } else {
                None
            };

            let (brk, mand) =
                line_break_before(before_left.as_ref(), &left, &right, lookahead.as_ref(), &st);
            if brk {
                mandatory = mand;
                break;
            }

            // No break: fold `right` into the context.
            match right.cls {
                Lb::SP => st.sp = true,
                class => {
                    st.before = st.regional;
                    st.regional = class;
                    st.r_pi = right.pi();
                    st.sp = false;
                }
            }
            st.open_ri = match right.cls {
                Lb::RI => st.open_ri + 1,
                _ => 0,
            };
            st.num = right.cls == Lb::NU
                || (st.num && matches!(right.cls, Lb::SY | Lb::IS | Lb::CL | Lb::CP));

            cursor = right.end;
            before_left = Some(core::mem::replace(&mut left, right));
        }

        self.pos = cursor;
        Some(LineBreak {
            text: &text[start..cursor],
            mandatory,
        })
    }
}
/// Returns an iterator over the line-break segments of `s`, starting at its
/// first byte.
#[must_use]
pub fn line_breaks(s: &str) -> LineBreaks<'_> {
    let pos = 0;
    LineBreaks { s, pos }
}