use core::cmp;
use crate::tables::grapheme::GraphemeCat;
/// Iterator over the grapheme clusters of a string slice together with the
/// byte offset at which each cluster starts.
///
/// Items are `(offset, grapheme)` pairs; values are created by
/// `new_grapheme_indices`.
#[derive(Debug, Clone)]
pub struct GraphemeIndices<'a> {
// Address of the first byte of the string being iterated; subtracted from a
// grapheme's address to recover its byte offset.
start_offset: usize,
// Underlying grapheme iterator that performs the segmentation.
iter: Graphemes<'a>,
}
impl<'a> GraphemeIndices<'a> {
/// Returns the part of the underlying string that has not been yielded yet,
/// as reported by the wrapped `Graphemes` iterator.
#[inline]
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for GraphemeIndices<'a> {
    type Item = (usize, &'a str);

    /// Yields the next grapheme cluster from the front, paired with its byte
    /// offset relative to the start of the iterated string.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        let grapheme = self.iter.next()?;
        // The slice borrows from the iterated string, so the distance between
        // the two addresses is the grapheme's byte offset.
        let offset = grapheme.as_ptr() as usize - self.start_offset;
        Some((offset, grapheme))
    }

    /// Forwards the size hint of the wrapped grapheme iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
    /// Yields the next grapheme cluster from the back, paired with its byte
    /// offset relative to the start of the iterated string.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        let grapheme = self.iter.next_back()?;
        // Same address arithmetic as the forward direction.
        let offset = grapheme.as_ptr() as usize - self.start_offset;
        Some((offset, grapheme))
    }
}
/// Double-ended iterator over the grapheme clusters of a string slice.
///
/// Values are created by `new_graphemes`.
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
// The complete string being segmented.
string: &'a str,
// Cursor marking the front of the not-yet-yielded region.
cursor: GraphemeCursor,
// Cursor marking the back of the not-yet-yielded region.
cursor_back: GraphemeCursor,
}
impl<'a> Graphemes<'a> {
    /// Returns the part of the string that lies between the front and back
    /// cursors, i.e. the text that has not been yielded from either end.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        let front = self.cursor.cur_cursor();
        let back = self.cursor_back.cur_cursor();
        &self.string[front..back]
    }
}
impl<'a> Iterator for Graphemes<'a> {
    type Item = &'a str;

    /// Bounds the number of remaining clusters: at least one if any bytes are
    /// left, and at most one cluster per remaining byte.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let front = self.cursor.cur_cursor();
        let back = self.cursor_back.cur_cursor();
        let remaining_bytes = back - front;
        let lower = cmp::min(remaining_bytes, 1);
        (lower, Some(remaining_bytes))
    }

    /// Yields the next grapheme cluster from the front, or `None` once the
    /// front cursor has met the back cursor.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        let begin = self.cursor.cur_cursor();
        if begin == self.cursor_back.cur_cursor() {
            None
        } else {
            // The whole string is supplied as a single chunk starting at 0.
            let boundary = self.cursor.next_boundary(self.string, 0);
            let end = boundary.unwrap().unwrap();
            Some(&self.string[begin..end])
        }
    }
}
impl<'a> DoubleEndedIterator for Graphemes<'a> {
    /// Yields the next grapheme cluster from the back, or `None` once the
    /// back cursor has met the front cursor.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        let upper = self.cursor_back.cur_cursor();
        if upper == self.cursor.cur_cursor() {
            None
        } else {
            // The whole string is supplied as a single chunk starting at 0.
            let boundary = self.cursor_back.prev_boundary(self.string, 0);
            let lower = boundary.unwrap().unwrap();
            Some(&self.string[lower..upper])
        }
    }
}
/// Builds a `Graphemes` iterator over `s`.
///
/// The front cursor starts at offset 0 and the back cursor at `s.len()`;
/// `is_extended` is passed through to both cursors.
#[inline]
pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> {
    let end = s.len();
    let cursor = GraphemeCursor::new(0, end, is_extended);
    let cursor_back = GraphemeCursor::new(end, end, is_extended);
    Graphemes {
        string: s,
        cursor,
        cursor_back,
    }
}
/// Builds a `GraphemeIndices` iterator over `s`.
///
/// The address of `s` is recorded so that each yielded grapheme can be
/// converted back into a byte offset within `s`.
#[inline]
pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> {
    let start_offset = s.as_ptr() as usize;
    let iter = new_graphemes(s, is_extended);
    GraphemeIndices { start_offset, iter }
}
// Decision state of a `GraphemeCursor` for its current offset.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
// No decision has been made yet for the current offset.
Unknown,
// The current offset is known not to be a boundary.
NotBreak,
// The current offset is known to be a boundary.
Break,
// The character after the offset is an InCB consonant; the decision is
// pending a backward scan by `handle_incb_consonant`.
InCbConsonant,
// The character after the offset is a regional indicator; the decision is
// pending a backward count by `handle_regional`.
Regional,
// The character after the offset is Extended_Pictographic; the decision is
// pending a backward scan by `handle_emoji`.
Emoji,
}
/// Cursor that tracks a byte offset in a text and decides whether that offset
/// is a grapheme cluster boundary.
///
/// The text may be supplied in chunks: methods take a `chunk` plus the byte
/// offset `chunk_start` at which that chunk begins, and report through
/// `GraphemeIncomplete` when they need a different chunk.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
// Current byte offset of the cursor.
offset: usize,
// Total byte length of the text; an offset equal to `len` is always a boundary.
len: usize,
// Whether the extended-cluster-only pairings (see `PairResult::Extended` and
// `handle_incb_consonant`) suppress a break.
is_extended: bool,
// Decision state for the current offset.
state: GraphemeState,
// Cached category of the character immediately before the offset, if known.
cat_before: Option<GraphemeCat>,
// Cached category of the character immediately after the offset, if known.
cat_after: Option<GraphemeCat>,
// When set, the caller must supply (via `provide_context`) a chunk that ends
// at this byte offset before a decision can be made.
pre_context_offset: Option<usize>,
// Number of InCB linker characters counted in the run before the offset, if
// known; consulted by `handle_incb_consonant`.
incb_linker_count: Option<usize>,
// Number of consecutive regional indicators before the offset, if known.
ris_count: Option<usize>,
// Set while a boundary search was interrupted by an error so that the next
// call continues the pending check instead of moving the offset again.
resuming: bool,
// Last table lookup result as (first code point, last code point, category);
// characters inside that range reuse the category without a new lookup.
grapheme_cat_cache: (u32, u32, GraphemeCat),
}
/// Reasons a `GraphemeCursor` operation could not be completed with the chunk
/// it was given.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
/// Text before the chunk is required: call `provide_context` with a chunk
/// that ends at the contained byte offset, then retry.
PreContext(usize),
/// `prev_boundary` ran out of text at the start of the chunk; retry with the
/// preceding chunk.
PrevChunk,
/// `next_boundary` ran out of text at the end of the chunk; retry with the
/// following chunk.
NextChunk,
/// The supplied chunk does not cover the cursor's offset.
InvalidOffset,
}
// Outcome of looking only at the two categories adjacent to a position
// (see `check_pair`).
#[derive(PartialEq, Eq)]
enum PairResult {
// Definitely not a boundary.
NotBreak,
// Definitely a boundary.
Break,
// Not a boundary in extended mode, a boundary otherwise.
Extended,
// Needs the backward scan in `handle_incb_consonant`.
InCbConsonant,
// Needs the regional-indicator count from `handle_regional`.
Regional,
// Needs the backward scan in `handle_emoji`.
Emoji,
}
/// Classifies a candidate boundary from the grapheme categories of the
/// characters directly `before` and `after` it.
///
/// Arms are tried top to bottom, so earlier arms take priority over later
/// ones. Results other than `NotBreak`/`Break` tell the caller that further
/// inspection of the preceding text (or the extended-mode flag) is required.
#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
    use self::PairResult::*;
    use crate::tables::grapheme::GraphemeCat::*;
    match (before, after) {
        // GB3: never split CR LF.
        (GC_CR, GC_LF) => NotBreak,
        // GB4 / GB5: otherwise always break around controls and line endings.
        (GC_Control | GC_CR | GC_LF, _) | (_, GC_Control | GC_CR | GC_LF) => Break,
        // GB6 - GB8: Hangul syllable sequences stay together.
        // GB9: never break before Extend or ZWJ.
        (GC_L, GC_L | GC_V | GC_LV | GC_LVT)
        | (GC_LV | GC_V, GC_V | GC_T)
        | (GC_LVT | GC_T, GC_T)
        | (_, GC_Extend | GC_ZWJ) => NotBreak,
        // GB9a / GB9b: SpacingMark and Prepend only join in extended mode.
        (_, GC_SpacingMark) | (GC_Prepend, _) => Extended,
        // GB9c: Indic conjunct candidates need a backward scan.
        (_, GC_InCB_Consonant) => InCbConsonant,
        // GB11: emoji ZWJ sequences need a backward scan.
        (GC_ZWJ, GC_Extended_Pictographic) => Emoji,
        // GB12 / GB13: regional indicators pair up, so they must be counted.
        (GC_Regional_Indicator, GC_Regional_Indicator) => Regional,
        // GB999: break everywhere else.
        (_, _) => Break,
    }
}
impl GraphemeCursor {
/// Creates a cursor positioned at byte `offset` in a text of `len` bytes.
///
/// `is_extended` selects extended grapheme clusters. An offset at the very
/// start or very end of the text is immediately known to be a boundary.
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
    let at_text_edge = offset == 0 || offset == len;
    GraphemeCursor {
        offset,
        len,
        is_extended,
        state: if at_text_edge {
            GraphemeState::Break
        } else {
            GraphemeState::Unknown
        },
        cat_before: None,
        cat_after: None,
        pre_context_offset: None,
        incb_linker_count: None,
        ris_count: None,
        resuming: false,
        // Range 0..=0 never matches a non-ASCII lookup, so the cache starts cold.
        grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
    }
}
/// Returns the grapheme category of `ch`.
///
/// Characters up to U+007E are classified directly without a table lookup.
/// Everything else goes through a one-entry range cache in front of the
/// category table.
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
    use crate::tables::grapheme as gr;
    use crate::tables::grapheme::GraphemeCat::*;
    match ch {
        '\n' => GC_LF,
        '\r' => GC_CR,
        '\u{20}'..='\u{7e}' => GC_Any,
        // Remaining code points below U+0020.
        '\u{0}'..='\u{1f}' => GC_Control,
        _ => {
            let code = ch as u32;
            let (lo, hi, _) = self.grapheme_cat_cache;
            // Refresh the cache only when `ch` is outside the cached range.
            if code < lo || code > hi {
                self.grapheme_cat_cache = gr::grapheme_category(ch);
            }
            self.grapheme_cat_cache.2
        }
    }
}
/// Moves the cursor to byte `offset`.
///
/// Does nothing when the cursor is already there. Otherwise the boundary
/// decision and the cached categories and counters tied to the old position
/// are discarded.
pub fn set_cursor(&mut self, offset: usize) {
    if offset == self.offset {
        return;
    }
    let at_text_edge = offset == 0 || offset == self.len;
    self.offset = offset;
    self.state = if at_text_edge {
        GraphemeState::Break
    } else {
        GraphemeState::Unknown
    };
    // Everything below was derived from the text around the old offset.
    self.cat_before = None;
    self.cat_after = None;
    self.incb_linker_count = None;
    self.ris_count = None;
}
/// Returns the cursor's current byte offset.
#[inline]
pub fn cur_cursor(&self) -> usize {
self.offset
}
/// Supplies text that precedes the chunk previously given to the cursor, in
/// response to a `GraphemeIncomplete::PreContext(n)` error.
///
/// `chunk` starts at byte `chunk_start` and must end exactly at `n`.
///
/// # Panics
///
/// Panics if no pre-context request is pending, if `chunk` does not end at
/// the requested offset, or if `chunk` is empty on a path that reads its last
/// character.
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
// The supplied chunk must end exactly where context was requested.
assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap());
self.pre_context_offset = None;
// In extended mode, a Prepend character directly before the cursor means
// "no break" (GB9b) regardless of what follows.
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().next_back().unwrap();
if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); return;
}
}
// Continue whichever backward scan was waiting for more text.
match self.state {
GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => {
// No scan pending: only the category of the character right before the
// cursor is needed, and only if this chunk ends at the cursor.
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().next_back().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
}
}
}
// Records the final boundary decision for the current offset in `state`.
#[inline]
fn decide(&mut self, is_break: bool) {
self.state = if is_break {
GraphemeState::Break
} else {
GraphemeState::NotBreak
};
}
#[inline]
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
self.decide(is_break);
Ok(is_break)
}
// Converts the cursor state left behind by one of the `handle_*` scans into
// the result of `is_boundary`: a decision if one was reached, otherwise the
// pending pre-context request.
#[inline]
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
    match self.state {
        GraphemeState::Break => Ok(true),
        GraphemeState::NotBreak => Ok(false),
        _ => match self.pre_context_offset {
            Some(pre_context_offset) => Err(GraphemeIncomplete::PreContext(pre_context_offset)),
            // An undecided scan always records where it needs more text.
            None => unreachable!("inconsistent state"),
        },
    }
}
// Resolves a candidate boundary that is followed by an InCB consonant (GB9c).
//
// `chunk` is the text before the cursor, starting at byte `chunk_start`. It
// is scanned backwards: linkers are counted, InCB extends are skipped, and
// the first other character settles the question. The break is suppressed
// only if that character is an InCB consonant and at least one linker was
// seen. If the chunk runs out first, more context is requested unless the
// chunk starts at the beginning of the text.
#[inline]
fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::{self, grapheme as gr};
    // This rule only applies to extended grapheme clusters.
    if !self.is_extended {
        self.decide(true);
        return;
    }
    let mut linkers_seen = self.incb_linker_count.unwrap_or(0);
    for ch in chunk.chars().rev() {
        if tables::is_incb_linker(ch) {
            linkers_seen += 1;
            self.incb_linker_count = Some(linkers_seen);
            continue;
        }
        if tables::derived_property::InCB_Extend(ch) {
            continue;
        }
        // Neither linker nor extend: this character decides the outcome.
        let joins = self.incb_linker_count.unwrap_or(0) > 0
            && self.grapheme_category(ch) == gr::GC_InCB_Consonant;
        self.decide(!joins);
        return;
    }
    if chunk_start == 0 {
        // Reached the start of the text without finding a consonant.
        self.decide(true);
    } else {
        self.pre_context_offset = Some(chunk_start);
        self.state = GraphemeState::InCbConsonant;
    }
}
// Resolves a candidate boundary between two regional indicators (GB12/GB13).
//
// `chunk` is the text before the cursor, starting at byte `chunk_start`. The
// run of regional indicators at its end is added to any count carried over
// from earlier chunks; the position is a boundary iff the total is even. If
// the whole chunk is regional indicators and it does not start the text,
// more context is requested.
#[inline]
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::grapheme as gr;
    let mut run_len = 0;
    let mut run_ended = false;
    for ch in chunk.chars().rev() {
        if self.grapheme_category(ch) == gr::GC_Regional_Indicator {
            run_len += 1;
        } else {
            run_ended = true;
            break;
        }
    }
    let total = self.ris_count.unwrap_or(0) + run_len;
    self.ris_count = Some(total);
    if run_ended || chunk_start == 0 {
        self.decide((total % 2) == 0);
    } else {
        self.pre_context_offset = Some(chunk_start);
        self.state = GraphemeState::Regional;
    }
}
// Resolves a candidate boundary that is followed by an Extended_Pictographic
// character (GB11).
//
// `chunk` is the text before the cursor, starting at byte `chunk_start`. Its
// last character must be a ZWJ, otherwise this is a boundary. Scanning
// further back, Extend characters are skipped and the first other character
// decides: Extended_Pictographic suppresses the break, anything else is a
// boundary. If the chunk runs out first, more context is requested unless
// the chunk starts at the beginning of the text.
#[inline]
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::grapheme as gr;
    let mut rev_chars = chunk.chars().rev();
    let last_is_not_zwj = match rev_chars.next() {
        Some(ch) => self.grapheme_category(ch) != gr::GC_ZWJ,
        None => false,
    };
    if last_is_not_zwj {
        self.decide(true);
        return;
    }
    for ch in rev_chars {
        let cat = self.grapheme_category(ch);
        if cat == gr::GC_Extend {
            continue;
        }
        self.decide(cat != gr::GC_Extended_Pictographic);
        return;
    }
    if chunk_start == 0 {
        self.decide(true);
    } else {
        self.pre_context_offset = Some(chunk_start);
        self.state = GraphemeState::Emoji;
    }
}
/// Determines whether the cursor's current offset is a grapheme cluster
/// boundary.
///
/// `chunk` is a piece of the text that begins at byte `chunk_start`.
///
/// # Errors
///
/// * `InvalidOffset` - the chunk does not cover the cursor's offset.
/// * `PreContext(n)` - text before the chunk is needed; call
///   `provide_context` with a chunk ending at `n`, then call this again.
#[inline]
pub fn is_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
use crate::tables::grapheme as gr;
// A decision already made for this offset is simply reported again.
if self.state == GraphemeState::Break {
return Ok(true);
}
if self.state == GraphemeState::NotBreak {
return Ok(false);
}
// Reject an offset outside the chunk, unless the category of the following
// character is already cached and the offset is not past the chunk's end.
if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len()))
&& (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none())
{
return Err(GraphemeIncomplete::InvalidOffset);
}
// A previous request for context has not been answered yet.
if let Some(pre_context_offset) = self.pre_context_offset {
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
}
let offset_in_chunk = self.offset.saturating_sub(chunk_start);
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
self.cat_after = Some(self.grapheme_category(ch));
}
// The cursor sits at the very start of the chunk, so the preceding
// character is not available here.
if self.offset == chunk_start {
let mut need_pre_context = true;
// Remember which backward scan `provide_context` has to run; for any other
// category only the (possibly already cached) preceding category is needed.
match self.cat_after.unwrap() {
gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
self.pre_context_offset = Some(chunk_start);
return Err(GraphemeIncomplete::PreContext(chunk_start));
}
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().next_back().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
// Decide from the adjacent pair; some pairs need a look further back.
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => self.decision(false),
PairResult::Break => self.decision(true),
PairResult::Extended => {
let is_extended = self.is_extended;
self.decision(!is_extended)
}
PairResult::InCbConsonant => {
self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
PairResult::Regional => {
// A known count of preceding regional indicators settles it directly:
// an even count means the pair starts here.
if let Some(ris_count) = self.ris_count {
return self.decision((ris_count % 2) == 0);
}
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
PairResult::Emoji => {
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
}
}
/// Advances the cursor to the next grapheme cluster boundary and returns its
/// byte offset.
///
/// `chunk` is a piece of the text that begins at byte `chunk_start`.
/// Returns `Ok(None)` when the cursor is already at the end of the text.
///
/// # Errors
///
/// * `NextChunk` - the chunk ended before a boundary was found; call again
///   with the following chunk.
/// * Any error from `is_boundary` (e.g. `PreContext`); after resolving it,
///   call again and the pending check is resumed without advancing twice.
#[inline]
pub fn next_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars();
let mut ch = match iter.next() {
Some(ch) => ch,
None => return Err(GraphemeIncomplete::NextChunk),
};
loop {
if self.resuming {
// The offset was already advanced before the interruption; `ch` is the
// character after it, so only its category may be missing.
if self.cat_after.is_none() {
self.cat_after = Some(self.grapheme_category(ch));
}
} else {
// Step over `ch`: it becomes the character before the cursor.
self.offset = self.offset.saturating_add(ch.len_utf8());
self.state = GraphemeState::Unknown;
self.cat_before = self.cat_after.take();
if self.cat_before.is_none() {
self.cat_before = Some(self.grapheme_category(ch));
}
// Keep the linker count in step: linkers add one, InCB extends leave it
// alone, anything else resets it.
if crate::tables::is_incb_linker(ch) {
self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1));
} else if !crate::tables::derived_property::InCB_Extend(ch) {
self.incb_linker_count = Some(0);
}
// Likewise for the run of regional indicators before the cursor; an
// unknown count stays unknown.
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
self.ris_count = self.ris_count.map(|c| c + 1);
} else {
self.ris_count = Some(0);
}
if let Some(next_ch) = iter.next() {
ch = next_ch;
self.cat_after = Some(self.grapheme_category(ch));
} else if self.offset == self.len {
// End of text is always a boundary.
self.decide(true);
} else {
// Chunk exhausted mid-text: remember that the step is already done.
self.resuming = true;
return Err(GraphemeIncomplete::NextChunk);
}
}
// Stay in resuming mode while `is_boundary` may bail out with an error.
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
/// Moves the cursor back to the previous grapheme cluster boundary and
/// returns its byte offset.
///
/// `chunk` is a piece of the text that begins at byte `chunk_start` and must
/// contain the text immediately before the cursor. Returns `Ok(None)` when
/// the cursor is already at the start of the text.
///
/// # Errors
///
/// * `PrevChunk` - the chunk started before a boundary was found; call again
///   with the preceding chunk.
/// * Any error from `is_boundary` (e.g. `PreContext`); after resolving it,
///   call again and the pending check is resumed without stepping twice.
pub fn prev_boundary(
    &mut self,
    chunk: &str,
    chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
    if self.offset == 0 {
        return Ok(None);
    }
    if self.offset == chunk_start {
        return Err(GraphemeIncomplete::PrevChunk);
    }
    let mut iter = chunk[..self.offset.saturating_sub(chunk_start)]
        .chars()
        .rev();
    let mut ch = iter.next().unwrap();
    loop {
        if self.offset == chunk_start {
            self.resuming = true;
            return Err(GraphemeIncomplete::PrevChunk);
        }
        if self.resuming {
            // The offset was already moved before the interruption; `ch` is
            // the character before it.
            self.cat_before = Some(self.grapheme_category(ch));
        } else {
            // Step back over `ch`: it becomes the character after the cursor.
            self.offset -= ch.len_utf8();
            self.cat_after = self.cat_before.take();
            self.state = GraphemeState::Unknown;
            // Keep the InCB linker count in step with the new position.
            // This must update `incb_linker_count` itself: writing the result
            // into `ris_count` would leave a stale linker count behind (which
            // can wrongly join two consonants later) and clobber the
            // regional-indicator count.
            if let Some(incb_linker_count) = self.incb_linker_count {
                self.incb_linker_count =
                    if incb_linker_count > 0 && crate::tables::is_incb_linker(ch) {
                        Some(incb_linker_count - 1)
                    } else if crate::tables::derived_property::InCB_Extend(ch) {
                        Some(incb_linker_count)
                    } else {
                        None
                    };
            }
            // One fewer regional indicator precedes the cursor; once the
            // known count is exhausted it becomes unknown again.
            if let Some(ris_count) = self.ris_count {
                self.ris_count = if ris_count > 0 {
                    Some(ris_count - 1)
                } else {
                    None
                };
            }
            if let Some(prev_ch) = iter.next() {
                ch = prev_ch;
                self.cat_before = Some(self.grapheme_category(ch));
            } else if self.offset == 0 {
                // Start of text is always a boundary.
                self.decide(true);
            } else {
                // Chunk exhausted mid-text: remember that the step is already
                // done and keep the category of the character just passed.
                self.resuming = true;
                self.cat_after = Some(self.grapheme_category(ch));
                return Err(GraphemeIncomplete::PrevChunk);
            }
        }
        // Stay in resuming mode while `is_boundary` may bail out with an error.
        self.resuming = true;
        if self.is_boundary(chunk, chunk_start)? {
            self.resuming = false;
            return Ok(Some(self.offset));
        }
        self.resuming = false;
    }
}
}
/// A boundary check between regional indicators that cannot see the start of
/// the run must ask for pre-context and succeed once it is provided.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    // Six regional indicators, four bytes each.
    let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let (pre, chunk) = s.split_at(4);
    let mut cursor = GraphemeCursor::new(8, s.len(), true);
    assert_eq!(
        cursor.is_boundary(chunk, 4),
        Err(GraphemeIncomplete::PreContext(4))
    );
    cursor.provide_context(pre, 0);
    assert_eq!(cursor.is_boundary(chunk, 4), Ok(true));
}
/// A cursor sitting at the very start of a chunk needs the preceding text
/// before it can tell that CR LF must not be split.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let s = "\r\n";
    let (pre, chunk) = s.split_at(1);
    let mut cursor = GraphemeCursor::new(1, s.len(), true);
    assert_eq!(
        cursor.is_boundary(chunk, 1),
        Err(GraphemeIncomplete::PreContext(1))
    );
    cursor.provide_context(pre, 0);
    assert_eq!(cursor.is_boundary(chunk, 1), Ok(false));
}
/// Stepping back inside a chunk that runs out asks for the previous chunk,
/// and the retry reports the boundary at the chunk seam.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let s = "abcd";
    let (first, second) = s.split_at(2);
    let mut cursor = GraphemeCursor::new(3, s.len(), true);
    assert_eq!(
        cursor.prev_boundary(second, 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(cursor.prev_boundary(first, 0), Ok(Some(2)));
}
/// A cursor located exactly at the start of the supplied chunk immediately
/// asks for the previous chunk, then finds the boundary inside it.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let s = "abcd";
    let (first, second) = s.split_at(2);
    let mut cursor = GraphemeCursor::new(2, s.len(), true);
    assert_eq!(
        cursor.prev_boundary(second, 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(cursor.prev_boundary(first, 0), Ok(Some(1)));
}