extern crate unicode_segmentation;
use core::{cmp::Ordering, iter::Peekable, str::FromStr};
use LineBreakClass::*;
use self::unicode_segmentation::UnicodeSegmentation;
use crate::text_processing::{tables::LINE_BREAK_RULES, types::LineBreakClass};
/// Kind of line-break opportunity reported by the line-break iterator.
///
/// The value is paired with a byte offset into the scanned text. The type is
/// a plain fieldless tag, so it is cheap to copy and supports full equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineBreakCandidate {
    /// The line has to end at the reported offset (emitted after hard line
    /// terminators and at the end of the text).
    MandatoryBreak,
    /// The line may be wrapped at the reported offset, but does not have to be.
    BreakAllowed,
}
use LineBreakCandidate::*;
/// Iterator over the line-break candidates of a string slice.
///
/// Yields `(usize, LineBreakCandidate)` pairs, where the `usize` is a byte
/// offset into the text the iterator was created with.
pub struct LineBreakCandidateIter<'a> {
/// The text being scanned.
text: &'a str,
/// Peekable grapheme iterator over `text`; each item is the grapheme's byte
/// index together with the grapheme itself.
iter: Peekable<unicode_segmentation::GraphemeIndices<'a>>,
/// Byte offset into `text` that the scan has advanced to.
pos: usize,
/// Number of consecutive graphemes of class `RI` seen so far; reset to zero
/// whenever a grapheme of any other class is processed.
reg_ind_streak: u32,
}
impl<'a> LineBreakCandidateIter<'a> {
pub fn new(text: &'a str) -> Self {
LineBreakCandidateIter {
text,
pos: 0,
iter: UnicodeSegmentation::grapheme_indices(text, true).peekable(),
reg_ind_streak: 0,
}
}
}
/// Parses the first `char` of a string slice (typically a grapheme cluster).
///
/// Expands to a `Result<char, ParseCharError>`: `Ok` with the leading
/// character, or `Err` when the slice is empty. The previous implementation
/// probed byte ranges `0..1` to `0..4` and unwrapped the last one, which
/// panicked on an empty slice; here the length of the first character is
/// taken from `chars()` so the empty case is reported as an error instead.
///
/// The argument is evaluated exactly once and may be any expression that
/// coerces to `&str` (an identifier is an expression too, so a single arm
/// covers both former arms).
macro_rules! get_base_character {
    ($grapheme:expr) => {{
        let s: &str = $grapheme;
        // Byte length of the leading character; 0 for an empty slice, which
        // makes `from_str` below return its "empty string" error.
        let first_len = s.chars().next().map_or(0, |c| c.len_utf8());
        char::from_str(&s[..first_len])
    }};
}
/// Resolves the line-break class of a grapheme from its leading character.
///
/// The leading character is obtained with `get_base_character!` and looked up
/// in `LINE_BREAK_RULES` via `search_table`; when no leading character can be
/// parsed the class is `XX`.
macro_rules! get_class {
    ($grapheme:expr) => {{
        match get_base_character!($grapheme) {
            Ok(base) => search_table(base as u32, LINE_BREAK_RULES),
            Err(_) => XX,
        }
    }};
}
/// Helpers for inspecting the class of an upcoming grapheme.
///
/// * `next_grapheme_class!(iter, grapheme)` advances `iter`; on success it
///   stores the new grapheme in `grapheme` and evaluates to `Some(class)`,
///   otherwise to `None`.
/// * `next_grapheme_class!((peeked is CLASS))` evaluates to `true` when the
///   already-peeked optional item holds a grapheme of class `CLASS`.
/// * `next_grapheme_class!((peeked is A, B, ...))` is the same test against
///   any of the listed classes.
macro_rules! next_grapheme_class {
    ($graph_iter:ident, $grapheme:ident) => {{
        match $graph_iter.next() {
            Some((_, g)) => {
                $grapheme = g;
                Some(get_class!(g))
            }
            None => None,
        }
    }};
    (($next_char:ident is $class:expr)) => {{
        match $next_char {
            Some(peeked) => get_class!((peeked.1)) == $class,
            None => false,
        }
    }};
    (($next_char:ident is $($class:ident),+)) => {{
        match $next_char {
            Some(peeked) => $(get_class!((peeked.1)) == $class)||+,
            None => false,
        }
    }};
}
impl<'a> Iterator for LineBreakCandidateIter<'a> {
type Item = (usize, LineBreakCandidate);
fn next(&mut self) -> Option<Self::Item> {
if self.pos >= self.text.len() {
return None;
}
if self.pos + 1 == self.text.len() {
self.pos += 1;
return Some((self.pos, MandatoryBreak));
}
let (idx, mut grapheme) = self.iter.next().unwrap();
let LineBreakCandidateIter {
ref mut iter,
text,
ref mut reg_ind_streak,
ref mut pos,
} = self;
let iter = iter.by_ref();
debug_assert_eq!(idx, *pos);
if idx == 0 {
*pos += grapheme.len();
return self.next();
}
let class = get_class!(grapheme);
if class != RI {
*reg_ind_streak = 0;
}
let next_char: Option<&(usize, &str)> = iter.peek();
match class {
BK => {
*pos += grapheme.len();
return Some((*pos, MandatoryBreak));
}
CR if next_grapheme_class!((next_char is LF)) => {
*pos += grapheme.len();
assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
*pos += grapheme.len();
return Some((*pos, MandatoryBreak));
}
CR | LF | NL => {
*pos += grapheme.len();
return Some((*pos, MandatoryBreak));
}
_ => {}
}
if let Some((_, next_grapheme)) = next_char {
let next_class = get_class!(next_grapheme);
match next_class {
BK | CR | LF | NL => {
*pos += grapheme.len();
return self.next();
}
SP | ZW => {
*pos += grapheme.len();
return self.next();
}
_ => {}
}
}
match class {
ZW => {
*pos += grapheme.len();
while Some(SP) == next_grapheme_class!(iter, grapheme) {
*pos += grapheme.len();
}
return Some((*pos, MandatoryBreak));
}
ZWJ => {
*pos += grapheme.len();
return self.next();
}
CM => {
unreachable!();
}
WJ => {
*pos += grapheme.len();
if next_grapheme_class!(iter, grapheme).is_some() {
*pos += grapheme.len();
}
return self.next();
}
GL => {
*pos += grapheme.len();
return self.next();
}
_ => {}
}
if let Some((next_idx, next_grapheme)) = next_char {
let next_class = get_class!(next_grapheme);
match next_class {
GL if ![SP, BA, HY].contains(&class) => {
*pos += grapheme.len();
return self.next();
}
CL | CP | EX | IS | SY => {
*pos = *next_idx;
return self.next();
}
_ => {}
}
}
match class {
SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
*pos += grapheme.len();
while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
{
*pos += grapheme.len();
}
*pos += grapheme.len();
return self.next();
}
OP => {
for (idx, grapheme) in self.iter.by_ref() {
*pos = idx + grapheme.len();
if !(get_class!(grapheme) == SP) {
break;
}
}
return self.next();
}
QU if get_class!(text[idx..].trim_start()) == OP => {
*pos += grapheme.len();
while Some(SP) == next_grapheme_class!(iter, grapheme) {
*pos += grapheme.len();
}
*pos = idx;
return self.next();
}
QU => {
*pos += grapheme.len();
if let Some((_, g)) = self.iter.next() {
*pos += g.len();
}
return self.next();
}
LineBreakClass::CL | LineBreakClass::CP
if get_class!(text[idx..].trim_start()) == NS =>
{
*pos += grapheme.len();
while Some(SP) == next_grapheme_class!(iter, grapheme) {
*pos += grapheme.len();
}
return self.next();
}
B2 if get_class!(text[idx..].trim_start()) == B2 => {
*pos += grapheme.len();
while Some(SP) == next_grapheme_class!(iter, grapheme) {
*pos += grapheme.len();
}
return self.next();
}
SP => {
*pos += 1;
return Some((*pos, BreakAllowed));
}
_ => {}
}
if let Some((next_idx, next_grapheme)) = next_char {
let next_class = get_class!(next_grapheme);
match next_class {
QU if class != SP => {
*pos = *next_idx + next_grapheme.len();
self.iter.next();
return self.next();
}
_ => {}
}
}
match class {
CB => {
*pos += grapheme.len();
return Some((*pos - 1, BreakAllowed));
}
BB => {
*pos += grapheme.len();
return self.next();
}
_ => {}
}
if let Some((_, next_grapheme)) = next_char {
let next_class = get_class!(next_grapheme);
match next_class {
BA | HY | NS => {
*pos += grapheme.len();
return self.next();
}
_ => {}
}
}
match class {
HL if next_grapheme_class!((next_char is HY, BA)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
SY if next_grapheme_class!((next_char is HL)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next().unwrap();
if let Some((idx, next_grapheme)) = self.iter.next() {
*pos = idx + next_grapheme.len();
}
self.next()
}
AL | HL if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
EX if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
EX => {
*pos += grapheme.len();
self.next()
}
ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
IN if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
NU if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
AL | HL if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
NU if next_grapheme_class!((next_char is AL, HL)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
PR | PO if next_grapheme_class!((next_char is AL, HL)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
AL | HL if next_grapheme_class!((next_char is PR, PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
CL if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
CP if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
CL if next_grapheme_class!((next_char is PR)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
CP if next_grapheme_class!((next_char is PR)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
NU if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
NU if next_grapheme_class!((next_char is PR)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
PO if next_grapheme_class!((next_char is OP)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
PO if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
PR if next_grapheme_class!((next_char is OP)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
PR if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
HY if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
IS if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
NU if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
SY if next_grapheme_class!((next_char is NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
JV | H2 if next_grapheme_class!((next_char is JV, JT)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
JT | H3 if next_grapheme_class!((next_char is JT)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
AL | HL if next_grapheme_class!((next_char is AL, HL)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
IS if next_grapheme_class!((next_char is AL, HL)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
AL | HL | NU if next_grapheme_class!((next_char is OP)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
CP if next_grapheme_class!((next_char is AL, HL , NU)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
EB if next_grapheme_class!((next_char is EM)) => {
let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
self.iter.next();
self.next()
}
RI => {
*reg_ind_streak += 1;
*pos += grapheme.len();
if *reg_ind_streak % 2 == 1 {
return Some((*pos - grapheme.len(), BreakAllowed));
}
self.iter.next();
self.next()
}
_ => {
*pos += grapheme.len();
Some((*pos - grapheme.len(), BreakAllowed))
}
}
}
}
/// Finds the class of code point `c` in the range table `t`.
///
/// Each entry of `t` is `(first, last, class)` with an inclusive range; the
/// table is binary-searched, so its entries must be sorted by range. Code
/// points that fall into no range are classified as `XX`.
fn search_table(c: u32, t: &'static [(u32, u32, LineBreakClass)]) -> LineBreakClass {
    t.binary_search_by(|&(lo, hi, _)| {
        if hi < c {
            // The whole range lies below the code point.
            Ordering::Less
        } else if c < lo {
            // The whole range lies above the code point.
            Ordering::Greater
        } else {
            Ordering::Equal
        }
    })
    .map(|found| t[found].2)
    .unwrap_or(XX)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: splits a sample text at every reported candidate offset and
    /// prints the pieces. Slicing panics if an offset is not a char boundary
    /// or the offsets go backwards; nothing else is asserted.
    #[test]
    fn test_line_breaks() {
        let s = "Fell past it.\n\n‘Well!’ thought Alice to herself.";
        let breaks: Vec<(usize, LineBreakCandidate)> = LineBreakCandidateIter::new(s).collect();
        let mut start = 0;
        for (offset, _) in breaks {
            println!("{:?}", &s[start..offset]);
            start = offset;
        }
        // Whatever remains after the last candidate.
        println!("{:?}", &s[start..]);
    }
}