use crate::complex::*;
use crate::indices::*;
use crate::language::*;
use crate::provider::*;
use crate::symbols::*;
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use core::char;
use core::str::CharIndices;
use icu_locid::{locale, Locale};
use icu_provider::prelude::*;
/// Strictness level applied when searching for line break opportunities.
///
/// The variant names presumably mirror the values of the CSS `line-break`
/// property (NOTE(review): confirm against the crate documentation). Within
/// this file the rule influences both the property lookup and the iterator's
/// per-pair tailoring.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum LineBreakRule {
    /// The `CJ` class is looked up as `ID`, and additional break opportunities
    /// are decided by the loose-rule check before the state table is consulted.
    Loose,
    /// The `CJ` class is looked up as `ID`; when `ja_zh` is set, a break is
    /// reported before U+301C and U+30A0.
    Normal,
    /// No tailoring in this file: properties are used as stored and only the
    /// state table decides. This is the default in [`LineBreakOptions`].
    Strict,
    /// A break is reported before every following character that reaches the
    /// line-break-rule step of the iterator.
    Anywhere,
}
/// Word-level tailoring applied on top of the line break rules.
///
/// The variant names presumably mirror the values of the CSS `word-break`
/// property (NOTE(review): confirm against the crate documentation).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum WordBreakRule {
    /// No word-level tailoring is applied by the iterator.
    Normal,
    /// The `CJ` class is looked up as `ID`, a left-hand `AL`, `NU` or `SA`
    /// property is treated as `ID`, and complex (dictionary/LSTM) breaking is
    /// skipped.
    BreakAll,
    /// Pairs of letter-like classes (see `is_non_break_by_keepall`) never
    /// produce a break.
    KeepAll,
}
/// Options that tailor the behaviour of [`LineBreakSegmenter`].
///
/// The struct is `#[non_exhaustive]`; `Default` yields strict line breaking,
/// normal word breaking and `ja_zh == false`.
#[non_exhaustive]
#[derive(Clone, PartialEq, Eq)]
pub struct LineBreakOptions {
/// Strictness of the line breaking rules.
pub line_break_rule: LineBreakRule,
/// Word-level tailoring applied before the line break rules.
pub word_break_rule: WordBreakRule,
/// Flag forwarded to the `Normal` and `Loose` rule checks; when `true`, those
/// checks allow breaks before certain CJK punctuation.
/// NOTE(review): the name suggests "content is Japanese or Chinese" — confirm.
pub ja_zh: bool,
}
impl Default for LineBreakOptions {
fn default() -> Self {
Self {
line_break_rule: LineBreakRule::Strict,
word_break_rule: WordBreakRule::Normal,
ja_zh: false,
}
}
}
/// Line break iterator over a `&str` input, as returned by
/// `LineBreakSegmenter::segment_str`.
pub type LineBreakIteratorUtf8<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf8>;
/// Line break iterator over a Latin-1 byte slice, as returned by
/// `LineBreakSegmenter::segment_latin1`.
pub type LineBreakIteratorLatin1<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeLatin1>;
/// Line break iterator over a UTF-16 code unit slice, as returned by
/// `LineBreakSegmenter::segment_utf16`.
pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf16>;
/// Owns the options and loaded data needed to create line break iterators
/// over `str`, Latin-1 and UTF-16 inputs.
pub struct LineBreakSegmenter {
// Tailoring options, borrowed by every iterator created from this segmenter.
options: LineBreakOptions,
// Rule data (property table, break state table, ...) loaded from the provider.
payload: DataPayload<LineBreakDataV1Marker>,
// Dictionary payloads; populated only when the `lstm` feature is disabled.
dictionary: Dictionary,
// LSTM payloads; populated only when the `lstm` feature is enabled.
lstm: LstmPayloads,
}
impl LineBreakSegmenter {
    /// Creates a segmenter with [`LineBreakOptions::default`], loading the
    /// rule data and the LSTM models from `provider`.
    ///
    /// # Errors
    ///
    /// Returns the provider's error if the rule data cannot be loaded.
    #[cfg(feature = "lstm")]
    pub fn try_new<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<LineBreakDataV1Marker> + DataProvider<LstmDataV1Marker> + ?Sized,
    {
        Self::try_new_with_options(provider, LineBreakOptions::default())
    }

    /// Creates a segmenter with [`LineBreakOptions::default`], loading the
    /// rule data and the break dictionaries from `provider`.
    ///
    /// # Errors
    ///
    /// Returns the provider's error if the rule data cannot be loaded.
    #[cfg(not(feature = "lstm"))]
    pub fn try_new<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<LineBreakDataV1Marker>
            + DataProvider<UCharDictionaryBreakDataV1Marker>
            + ?Sized,
    {
        Self::try_new_with_options(provider, LineBreakOptions::default())
    }

    /// Creates a segmenter with the given `options`, using LSTM models for
    /// complex scripts.
    ///
    /// The rule data is mandatory. Each LSTM model (`my`, `km`, `lo`, `th`)
    /// is optional: a failed load is recorded as `None` instead of failing
    /// construction.
    ///
    /// # Errors
    ///
    /// Returns the provider's error if the rule data cannot be loaded.
    #[cfg(feature = "lstm")]
    pub fn try_new_with_options<D>(
        provider: &D,
        options: LineBreakOptions,
    ) -> Result<Self, DataError>
    where
        D: DataProvider<LineBreakDataV1Marker> + DataProvider<LstmDataV1Marker> + ?Sized,
    {
        let payload = provider.load(Default::default())?.take_payload()?;

        // Field initialisers run in the order written, which keeps the
        // provider requests in the order my, km, lo, th.
        let lstm = LstmPayloads {
            burmese: Self::load_lstm(provider, locale!("my")).ok(),
            khmer: Self::load_lstm(provider, locale!("km")).ok(),
            lao: Self::load_lstm(provider, locale!("lo")).ok(),
            thai: Self::load_lstm(provider, locale!("th")).ok(),
        };

        Ok(Self {
            options,
            payload,
            dictionary: Dictionary::default(),
            lstm,
        })
    }

    /// Creates a segmenter with the given `options`, using break dictionaries
    /// for complex scripts.
    ///
    /// The rule data is mandatory. Each dictionary (`km`, `lo`, `my`, `th`)
    /// is optional: a failed load is recorded as `None` instead of failing
    /// construction. No CJ dictionary is loaded.
    ///
    /// # Errors
    ///
    /// Returns the provider's error if the rule data cannot be loaded.
    #[cfg(not(feature = "lstm"))]
    pub fn try_new_with_options<D>(
        provider: &D,
        options: LineBreakOptions,
    ) -> Result<Self, DataError>
    where
        D: DataProvider<LineBreakDataV1Marker>
            + DataProvider<UCharDictionaryBreakDataV1Marker>
            + ?Sized,
    {
        let payload = provider.load(Default::default())?.take_payload()?;

        // Field initialisers run in the order written, which keeps the
        // provider requests in the order km, lo, my, th.
        let dictionary = Dictionary {
            khmer: Self::load_dictionary(provider, locale!("km")).ok(),
            lao: Self::load_dictionary(provider, locale!("lo")).ok(),
            burmese: Self::load_dictionary(provider, locale!("my")).ok(),
            thai: Self::load_dictionary(provider, locale!("th")).ok(),
            cj: None,
        };

        Ok(Self {
            options,
            payload,
            dictionary,
            lstm: LstmPayloads::default(),
        })
    }

    /// Loads the break dictionary payload for `locale` from `provider`.
    #[cfg(not(feature = "lstm"))]
    fn load_dictionary<D: DataProvider<UCharDictionaryBreakDataV1Marker> + ?Sized>(
        provider: &D,
        locale: Locale,
    ) -> Result<DataPayload<UCharDictionaryBreakDataV1Marker>, DataError> {
        let data_locale = DataLocale::from(locale);
        let request = DataRequest {
            locale: &data_locale,
            metadata: Default::default(),
        };
        provider.load(request)?.take_payload()
    }

    /// Loads the LSTM model payload for `locale` from `provider`.
    #[cfg(feature = "lstm")]
    fn load_lstm<D: DataProvider<LstmDataV1Marker> + ?Sized>(
        provider: &D,
        locale: Locale,
    ) -> Result<DataPayload<LstmDataV1Marker>, DataError> {
        let data_locale = DataLocale::from(locale);
        let request = DataRequest {
            locale: &data_locale,
            metadata: Default::default(),
        };
        provider.load(request)?.take_payload()
    }

    /// Creates a line break iterator over a `&str`.
    ///
    /// Reported positions are the indices yielded by `str::char_indices`,
    /// with `input.len()` reported at the end of the text.
    pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
        let len = input.len();
        let iter = input.char_indices();
        LineBreakIterator {
            iter,
            len,
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            options: &self.options,
            dictionary: &self.dictionary,
            lstm: &self.lstm,
        }
    }

    /// Creates a line break iterator over a Latin-1 byte slice.
    pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> LineBreakIteratorLatin1<'l, 's> {
        let len = input.len();
        let iter = Latin1Indices::new(input);
        LineBreakIterator {
            iter,
            len,
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            options: &self.options,
            dictionary: &self.dictionary,
            lstm: &self.lstm,
        }
    }

    /// Creates a line break iterator over a slice of UTF-16 code units.
    pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
        let len = input.len();
        let iter = Utf16Indices::new(input);
        LineBreakIterator {
            iter,
            len,
            current_pos_data: None,
            result_cache: Vec::new(),
            data: self.payload.get(),
            options: &self.options,
            dictionary: &self.dictionary,
            lstm: &self.lstm,
        }
    }
}
fn get_linebreak_property_utf32_with_rule(
property_table: &RuleBreakPropertyTable<'_>,
codepoint: u32,
line_break_rule: LineBreakRule,
word_break_rule: WordBreakRule,
) -> u8 {
let prop = property_table.0.get(codepoint);
if word_break_rule == WordBreakRule::BreakAll
|| line_break_rule == LineBreakRule::Loose
|| line_break_rule == LineBreakRule::Normal
{
return match prop {
CJ => ID, _ => prop,
};
}
prop
}
/// Looks up the line break property of a Latin-1 code point; no rule-based
/// tailoring is applied.
#[inline]
fn get_linebreak_property_latin1(property_table: &RuleBreakPropertyTable<'_>, codepoint: u8) -> u8 {
    let scalar = u32::from(codepoint);
    property_table.0.get(scalar)
}
/// `char` convenience wrapper around
/// `get_linebreak_property_utf32_with_rule`.
#[inline]
fn get_linebreak_property_with_rule(
    property_table: &RuleBreakPropertyTable<'_>,
    codepoint: char,
    linebreak_rule: LineBreakRule,
    wordbreak_rule: WordBreakRule,
) -> u8 {
    let scalar = u32::from(codepoint);
    get_linebreak_property_utf32_with_rule(property_table, scalar, linebreak_rule, wordbreak_rule)
}
/// `LineBreakRule::Normal` tailoring: reports a break before U+301C and
/// U+30A0, but only when `ja_zh` is set. Every other code point yields
/// `false`.
#[inline]
fn is_break_utf32_by_normal(codepoint: u32, ja_zh: bool) -> bool {
    ja_zh && matches!(codepoint, 0x301C | 0x30A0)
}
/// `LineBreakRule::Loose` tailoring for the pair (`left_prop`, `right_prop`).
///
/// Returns `Some(true)` to force a break before `right_codepoint`,
/// `Some(false)` to forbid it, and `None` when the loose rules have no
/// opinion and the caller should continue with the regular rules. Results
/// given as `Some(ja_zh)` are breaks that are only allowed when `ja_zh` is
/// set.
#[inline]
fn is_break_utf32_by_loose(
    right_codepoint: u32,
    left_prop: u8,
    right_prop: u8,
    ja_zh: bool,
) -> Option<bool> {
    if right_prop == BA {
        // U+2010 / U+2013 following an ID character.
        if left_prop == ID && matches!(right_codepoint, 0x2010 | 0x2013) {
            return Some(true);
        }
    } else if right_prop == NS {
        match right_codepoint {
            0x301C | 0x30A0 => return Some(ja_zh),
            0x3005 | 0x303B | 0x309D | 0x309E | 0x30FD | 0x30FE => return Some(true),
            0x30FB | 0xFF1A | 0xFF1B | 0xFF65 | 0x203C | 0x2047..=0x2049 => {
                return Some(ja_zh)
            }
            _ => {}
        }
    } else if right_prop == IN {
        return Some(true);
    } else if right_prop == EX && matches!(right_codepoint, 0xFF01 | 0xFF1F) {
        return Some(ja_zh);
    }

    // Reached whenever none of the branches above returned: a right-hand
    // PO_EAW or a left-hand PR_EAW property allows a break only with `ja_zh`.
    if right_prop == PO_EAW || left_prop == PR_EAW {
        Some(ja_zh)
    } else {
        None
    }
}
/// Returns `true` when the state table entry for (`left`, `right`) is a
/// definite break.
///
/// `KEEP_RULE` and any non-negative entry are both reported as "no break".
#[inline]
fn is_break_from_table(
    break_state_table: &RuleBreakStateTable<'_>,
    property_count: u8,
    left: u8,
    right: u8,
) -> bool {
    let rule = get_break_state_from_table(break_state_table, property_count, left, right);
    !(rule == KEEP_RULE || rule >= 0)
}
/// `WordBreakRule::KeepAll` check: returns `true` when both `left` and
/// `right` belong to the set of classes that must be kept together
/// (`AI`, `AL`, `ID`, `NU`, `HY`, `H2`, `H3`, `JL`, `JV`, `JT`, `CJ`).
#[inline]
fn is_non_break_by_keepall(left: u8, right: u8) -> bool {
    let keep_together = [AI, AL, ID, NU, HY, H2, H3, JL, JV, JT, CJ];
    keep_together.contains(&left) && keep_together.contains(&right)
}
/// Reads the state table entry for the pair (`left`, `right`).
///
/// The table is addressed row-major as `left * property_count + right`. An
/// out-of-range index falls back to `KEEP_RULE` instead of panicking.
#[inline]
fn get_break_state_from_table(
    break_state_table: &RuleBreakStateTable<'_>,
    property_count: u8,
    left: u8,
    right: u8,
) -> i8 {
    let row_start = usize::from(left) * usize::from(property_count);
    let idx = row_start + usize::from(right);
    break_state_table.0.get(idx).unwrap_or(KEEP_RULE)
}
/// Returns `true` when `codepoint` should be handled by the complex
/// (dictionary/LSTM) segmenter rather than the rule table.
///
/// This requires the untailored (`Strict`/`Normal`) property to be `SA` and
/// the code point's language to be Thai or Burmese.
// NOTE(review): the segmenter also loads Khmer and Lao models, but those
// languages are not matched here — confirm whether that is intentional.
#[inline]
fn use_complex_breaking_utf32(property_table: &RuleBreakPropertyTable<'_>, codepoint: u32) -> bool {
    let untailored_prop = get_linebreak_property_utf32_with_rule(
        property_table,
        codepoint,
        LineBreakRule::Strict,
        WordBreakRule::Normal,
    );
    if untailored_prop != SA {
        return false;
    }
    matches!(get_language(codepoint), Language::Thai | Language::Burmese)
}
/// Encoding-specific hooks used by [`LineBreakIterator`].
///
/// Implemented in this file for UTF-8 (`str`), Latin-1 and UTF-16 inputs.
pub trait LineBreakType<'l, 's> {
/// Underlying iterator yielding `(index, character)` pairs; it must be
/// cloneable so the break iterator can save and restore its position.
type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;
/// Character type produced by `IterAttr`; convertible to a `u32` code point.
type CharType: Copy + Into<u32>;
/// Returns whether `c` should be handled by the complex-language path
/// instead of the rule table.
fn use_complex_breaking(iterator: &LineBreakIterator<'l, 's, Self>, c: Self::CharType) -> bool;
/// Returns the line break property of `c` for this encoding, using the
/// iterator's data and options.
fn get_linebreak_property_with_rule(
iterator: &LineBreakIterator<'l, 's, Self>,
c: Self::CharType,
) -> u8;
/// Returns the length of the character at the iterator's current position,
/// measured in units of the input encoding. Called only while consuming
/// cached complex-language break results.
fn get_current_position_character_len(iterator: &LineBreakIterator<'l, 's, Self>) -> usize;
/// Segments the run of complex-language characters that starts with
/// `left_codepoint` followed by the iterator's current character, and returns
/// the next break position. A `None` result makes the caller fall through to
/// the table-driven rules.
fn handle_complex_language(
iterator: &mut LineBreakIterator<'l, 's, Self>,
left_codepoint: Self::CharType,
) -> Option<usize>;
}
/// Iterator over line break positions of one input, parameterised by the
/// input encoding `Y`.
///
/// `'l` is the lifetime of the segmenter's data and `'s` that of the input.
pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
// Underlying `(index, character)` iterator over the input.
iter: Y::IterAttr,
// Length of the input; reported as the final break position.
len: usize,
// `(index, character)` at the current position, or `None` before the first
// character has been fetched / after the input is exhausted.
current_pos_data: Option<(usize, Y::CharType)>,
// Pending break offsets produced by the complex-language path, relative to
// the position at which the previous break was returned.
result_cache: Vec<usize>,
// Rule data: property table, break state table, property count, EOT property.
data: &'l RuleBreakDataV1<'l>,
// Tailoring options borrowed from the segmenter.
options: &'l LineBreakOptions,
// Dictionary payloads passed to the complex-language segmenter.
dictionary: &'l Dictionary,
// LSTM payloads passed to the complex-language segmenter.
lstm: &'l LstmPayloads,
}
/// Yields break positions as indices taken from the underlying
/// `(index, character)` iterator, finishing with the input length.
impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
type Item = usize;
/// Returns the next break position, or `None` once the input is exhausted.
///
/// Order of evaluation for each adjacent pair of characters: cached
/// complex-language results, word-break tailoring, line-break tailoring,
/// complex-language segmentation, and finally the break state table.
fn next(&mut self) -> Option<Self::Item> {
// Fetches the first character if needed; nothing left means we are done.
if self.check_eof() {
return None;
}
// Break offsets computed by a previous complex-language run are consumed
// first.
if !self.result_cache.is_empty() {
// `i` counts the encoded length consumed since the previous break.
let mut i = 0;
loop {
if i == *self.result_cache.first().unwrap() {
// Drop the break being returned and rebase the remaining offsets so they
// are relative to the new position.
self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
return Some(self.current_pos_data.unwrap().0);
}
i += Y::get_current_position_character_len(self);
self.current_pos_data = self.iter.next();
if self.current_pos_data.is_none() {
// Input ended before the cached offset was reached.
self.result_cache.clear();
return Some(self.len);
}
}
}
loop {
// `left_*` describes the character being left behind; after advancing,
// `current_pos_data` is the right-hand character of the pair.
let mut left_prop = self.get_linebreak_property();
let left_codepoint = self.current_pos_data;
self.current_pos_data = self.iter.next();
if self.current_pos_data.is_none() {
// End of input: the final break is always the input length.
return Some(self.len);
}
let right_prop = self.get_linebreak_property();
// Word-break tailoring.
match self.options.word_break_rule {
WordBreakRule::BreakAll => {
// Treat alphabetic, numeric and SA characters on the left as ID.
left_prop = match left_prop {
AL => ID,
NU => ID,
SA => ID,
_ => left_prop,
};
}
WordBreakRule::KeepAll => {
// Letter-like pairs never break; move on to the next pair.
if is_non_break_by_keepall(left_prop, right_prop) {
continue;
}
}
_ => (),
}
// Line-break tailoring.
match self.options.line_break_rule {
LineBreakRule::Normal => {
if self.is_break_by_normal() {
return Some(self.current_pos_data.unwrap().0);
}
}
LineBreakRule::Loose => {
// `Some(_)` is a final decision for this pair; `None` falls through to
// the regular rules below.
if let Some(breakable) = is_break_utf32_by_loose(
self.current_pos_data.unwrap().1.into(),
left_prop,
right_prop,
self.options.ja_zh,
) {
if breakable {
return Some(self.current_pos_data.unwrap().0);
}
continue;
}
}
LineBreakRule::Anywhere => {
return Some(self.current_pos_data.unwrap().0);
}
_ => (),
};
// When both characters of the pair qualify, delegate to the
// complex-language segmenter (skipped under BreakAll).
if self.options.word_break_rule != WordBreakRule::BreakAll
&& Y::use_complex_breaking(self, left_codepoint.unwrap().1)
&& Y::use_complex_breaking(self, self.current_pos_data.unwrap().1)
{
let result = Y::handle_complex_language(self, left_codepoint.unwrap().1);
if result.is_some() {
return result;
}
}
// A non-negative table entry is not a final decision: it is fed back into
// the table as the left-hand index together with the following character's
// property until a negative (final) entry is found.
let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
if break_state >= 0_i8 {
// Remember the last position reached so it can be restored when the
// look-ahead ends in NOT_MATCH_RULE.
let mut previous_iter = self.iter.clone();
let mut previous_pos_data = self.current_pos_data;
loop {
self.current_pos_data = self.iter.next();
if self.current_pos_data.is_none() {
// Input ended during look-ahead: resolve the pending state against the
// end-of-text property.
let break_state = self
.get_break_state_from_table(break_state as u8, self.data.eot_property);
if break_state == NOT_MATCH_RULE {
self.iter = previous_iter;
self.current_pos_data = previous_pos_data;
return Some(previous_pos_data.unwrap().0);
}
return Some(self.len);
}
let prop = self.get_linebreak_property();
break_state = self.get_break_state_from_table(break_state as u8, prop);
if break_state < 0 {
break;
}
previous_iter = self.iter.clone();
previous_pos_data = self.current_pos_data;
}
if break_state == KEEP_RULE {
continue;
}
if break_state == NOT_MATCH_RULE {
// Rewind to the saved position and break there.
self.iter = previous_iter;
self.current_pos_data = previous_pos_data;
return Some(previous_pos_data.unwrap().0);
}
return Some(self.current_pos_data.unwrap().0);
}
// Simple case: the table entry for the pair is already final.
if self.is_break_from_table(left_prop, right_prop) {
return Some(self.current_pos_data.unwrap().0);
}
}
}
}
impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
    /// Makes sure a current character is loaded, fetching one from the
    /// underlying iterator if necessary. Returns `true` when none is
    /// available.
    #[inline]
    fn check_eof(&mut self) -> bool {
        if self.current_pos_data.is_none() {
            self.current_pos_data = self.iter.next();
        }
        self.current_pos_data.is_none()
    }

    /// Line break property of the current character.
    ///
    /// Panics if there is no current character.
    fn get_linebreak_property(&self) -> u8 {
        let (_, ch) = self.current_pos_data.unwrap();
        Y::get_linebreak_property_with_rule(self, ch)
    }

    /// `LineBreakRule::Normal` check for the current character.
    ///
    /// Panics if there is no current character.
    fn is_break_by_normal(&self) -> bool {
        let (_, ch) = self.current_pos_data.unwrap();
        is_break_utf32_by_normal(ch.into(), self.options.ja_zh)
    }

    /// State table entry for (`left`, `right`) using this iterator's data.
    fn get_break_state_from_table(&self, left: u8, right: u8) -> i8 {
        let data = self.data;
        get_break_state_from_table(&data.break_state_table, data.property_count, left, right)
    }

    /// Whether the state table reports a definite break for (`left`, `right`).
    fn is_break_from_table(&self, left: u8, right: u8) -> bool {
        let data = self.data;
        is_break_from_table(&data.break_state_table, data.property_count, left, right)
    }
}
/// Marker type selecting `str` (UTF-8) input for [`LineBreakIterator`].
pub struct LineBreakTypeUtf8;

impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf8 {
    type IterAttr = CharIndices<'s>;
    type CharType = char;

    /// Property lookup for a `char`, tailored by the iterator's options.
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
        let options = iterator.options;
        get_linebreak_property_with_rule(
            &iterator.data.property_table,
            c,
            options.line_break_rule,
            options.word_break_rule,
        )
    }

    #[inline]
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
        use_complex_breaking_utf32(&iterator.data.property_table, c as u32)
    }

    /// UTF-8 length in bytes of the current character.
    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
        let (_, ch) = iterator.current_pos_data.unwrap();
        ch.len_utf8()
    }

    /// Collects the run of complex-language characters starting with
    /// `left_codepoint`, segments it, caches the resulting offsets and returns
    /// the first break position.
    fn handle_complex_language(
        iter: &mut LineBreakIterator<'l, 's, Self>,
        left_codepoint: char,
    ) -> Option<usize> {
        // Remember where the run starts so the iterator can be rewound.
        let saved_iter = iter.iter.clone();
        let saved_pos = iter.current_pos_data;

        // Gather `left_codepoint`, the current character and every following
        // character that still qualifies for complex breaking.
        let mut run = String::new();
        run.push(left_codepoint);
        loop {
            run.push(iter.current_pos_data.unwrap().1);
            iter.current_pos_data = iter.iter.next();
            let upcoming = iter.current_pos_data;
            let keep_going = match upcoming {
                Some((_, c)) => Self::use_complex_breaking(iter, c),
                None => false,
            };
            if !keep_going {
                break;
            }
        }

        // Rewind to the start of the run and store the segmenter's offsets.
        iter.iter = saved_iter;
        iter.current_pos_data = saved_pos;
        iter.result_cache = complex_language_segment_str(iter.dictionary, iter.lstm, &run);

        // Walk forward until the running byte offset matches the first break.
        let mut offset = iter.current_pos_data.unwrap().1.len_utf8();
        loop {
            if offset == *iter.result_cache.first().unwrap() {
                // Drop the break being returned and rebase the remaining ones.
                let rebased: Vec<usize> = iter
                    .result_cache
                    .iter()
                    .skip(1)
                    .map(|brk| brk - offset)
                    .collect();
                iter.result_cache = rebased;
                return Some(iter.current_pos_data.unwrap().0);
            }
            iter.current_pos_data = iter.iter.next();
            if iter.current_pos_data.is_none() {
                iter.result_cache.clear();
                return Some(iter.len);
            }
            offset += Self::get_current_position_character_len(iter);
        }
    }
}
/// Marker type selecting Latin-1 (`[u8]`) input for [`LineBreakIterator`].
pub struct LineBreakTypeLatin1;

impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeLatin1 {
    type IterAttr = Latin1Indices<'s>;
    type CharType = u8;

    /// Property lookup for a Latin-1 byte; the rule options are not consulted.
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
        let table = &iterator.data.property_table;
        get_linebreak_property_latin1(table, c)
    }

    /// Complex-language breaking never applies to Latin-1 input.
    #[inline]
    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
        false
    }

    /// Not expected to be called: the iterator only needs character lengths
    /// while consuming cached complex-language results, and
    /// `use_complex_breaking` is always `false` for this type.
    fn get_current_position_character_len(_iterator: &LineBreakIterator<Self>) -> usize {
        panic!("not reachable");
    }

    /// Not expected to be called, since `use_complex_breaking` is always
    /// `false` for this type.
    fn handle_complex_language(
        _iterator: &mut LineBreakIterator<Self>,
        _left_codepoint: Self::CharType,
    ) -> Option<usize> {
        panic!("not reachable");
    }
}
/// Marker type selecting UTF-16 (`[u16]`) input for [`LineBreakIterator`].
pub struct LineBreakTypeUtf16;

impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
    type IterAttr = Utf16Indices<'s>;
    type CharType = u32;

    /// Property lookup for a code point, tailored by the iterator's options.
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
        let options = iterator.options;
        get_linebreak_property_utf32_with_rule(
            &iterator.data.property_table,
            c,
            options.line_break_rule,
            options.word_break_rule,
        )
    }

    #[inline]
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
        use_complex_breaking_utf32(&iterator.data.property_table, c)
    }

    /// Number of UTF-16 code units of the current character: two for code
    /// points at or above U+10000, otherwise one.
    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
        let (_, code_point) = iterator.current_pos_data.unwrap();
        match code_point {
            0x10000.. => 2,
            _ => 1,
        }
    }

    /// Collects the run of complex-language characters starting with
    /// `left_codepoint`, segments it, caches the resulting offsets and returns
    /// the first break position.
    fn handle_complex_language(
        iterator: &mut LineBreakIterator<Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        // Remember where the run starts so the iterator can be rewound.
        let saved_iter = iterator.iter.clone();
        let saved_pos = iterator.current_pos_data;

        // Gather `left_codepoint`, the current character and every following
        // character that still qualifies for complex breaking. Each code point
        // is stored as a single `u16` unit.
        let mut run = vec![left_codepoint as u16];
        loop {
            run.push(iterator.current_pos_data.unwrap().1 as u16);
            iterator.current_pos_data = iterator.iter.next();
            let upcoming = iterator.current_pos_data;
            let keep_going = match upcoming {
                Some((_, c)) => Self::use_complex_breaking(iterator, c),
                None => false,
            };
            if !keep_going {
                break;
            }
        }

        // Rewind to the start of the run and store the segmenter's offsets.
        iterator.iter = saved_iter;
        iterator.current_pos_data = saved_pos;
        iterator.result_cache =
            complex_language_segment_utf16(iterator.dictionary, iterator.lstm, &run);

        // Walk forward, one unit per character, until the running offset
        // matches the first break.
        let mut offset = 1;
        loop {
            if offset == *iterator.result_cache.first().unwrap() {
                // Drop the break being returned and rebase the remaining ones.
                let rebased: Vec<usize> = iterator
                    .result_cache
                    .iter()
                    .skip(1)
                    .map(|brk| brk - offset)
                    .collect();
                iterator.result_cache = rebased;
                return Some(iterator.current_pos_data.unwrap().0);
            }
            iterator.current_pos_data = iterator.iter.next();
            if iterator.current_pos_data.is_none() {
                iterator.result_cache.clear();
                return Some(iterator.len);
            }
            offset += 1;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks the property table lookup under the strict/normal rules.
    #[test]
    fn linebreak_propery() {
        let provider = icu_testdata::get_provider();
        let payload: DataPayload<LineBreakDataV1Marker> = provider
            .load(Default::default())
            .expect("Loading should succeed!")
            .take_payload()
            .expect("Data should be present!");
        let lb_data: &RuleBreakDataV1 = payload.get();
        let get_linebreak_property = |codepoint| {
            get_linebreak_property_with_rule(
                &lb_data.property_table,
                codepoint,
                LineBreakRule::Strict,
                WordBreakRule::Normal,
            )
        };
        assert_eq!(get_linebreak_property('\u{0020}'), SP);
        assert_eq!(get_linebreak_property('\u{0022}'), QU);
        assert_eq!(get_linebreak_property('('), OP_OP30);
        assert_eq!(get_linebreak_property('\u{0030}'), NU);
        assert_eq!(get_linebreak_property('['), OP_OP30);
        assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
        assert_eq!(get_linebreak_property('\u{20000}'), ID);
        assert_eq!(get_linebreak_property('\u{e0020}'), CM);
        assert_eq!(get_linebreak_property('\u{3041}'), CJ);
        assert_eq!(get_linebreak_property('\u{0025}'), PO);
        assert_eq!(get_linebreak_property('\u{00A7}'), AI);
        assert_eq!(get_linebreak_property('\u{50005}'), XX);
        assert_eq!(get_linebreak_property('\u{17D6}'), NS);
        assert_eq!(get_linebreak_property('\u{2014}'), B2);
    }

    /// Checks single-pair decisions of the break state table.
    #[test]
    #[allow(clippy::bool_assert_comparison)]
    fn break_rule() {
        let provider = icu_testdata::get_provider();
        let payload: DataPayload<LineBreakDataV1Marker> = provider
            .load(Default::default())
            .expect("Loading should succeed!")
            .take_payload()
            .expect("Data should be present!");
        let lb_data: &RuleBreakDataV1 = payload.get();
        let is_break = |left, right| {
            is_break_from_table(
                &lb_data.break_state_table,
                lb_data.property_count,
                left,
                right,
            )
        };
        assert_eq!(is_break(BK, AL), true);
        assert_eq!(is_break(CR, LF), false);
        assert_eq!(is_break(CR, AL), true);
        assert_eq!(is_break(LF, AL), true);
        assert_eq!(is_break(NL, AL), true);
        assert_eq!(is_break(AL, BK), false);
        assert_eq!(is_break(AL, CR), false);
        assert_eq!(is_break(AL, LF), false);
        assert_eq!(is_break(AL, NL), false);
        assert_eq!(is_break(AL, SP), false);
        assert_eq!(is_break(AL, ZW), false);
        assert_eq!(is_break(ZWJ, AL), false);
        assert_eq!(is_break(AL, ZWJ), false);
        assert_eq!(is_break(AL, CM), false);
        assert_eq!(is_break(ID, ZWJ), false);
        assert_eq!(is_break(ZWJ, SP), false);
        assert_eq!(is_break(SP, CM), true);
        assert_eq!(is_break(AL, WJ), false);
        assert_eq!(is_break(WJ, AL), false);
        assert_eq!(is_break(GL, AL), false);
        assert_eq!(is_break(AL, GL), false);
        assert_eq!(is_break(SP, GL), true);
        assert_eq!(is_break(AL, CL), false);
        assert_eq!(is_break(AL, CP), false);
        assert_eq!(is_break(AL, EX), false);
        assert_eq!(is_break(AL, IS), false);
        assert_eq!(is_break(AL, SY), false);
        assert_eq!(is_break(SP, AL), true);
        assert_eq!(is_break(AL, QU), false);
        assert_eq!(is_break(QU, AL), false);
        assert_eq!(is_break(AL, CB), true);
        assert_eq!(is_break(CB, AL), true);
        assert_eq!(is_break(AL, BA), false);
        assert_eq!(is_break(AL, HY), false);
        assert_eq!(is_break(AL, NS), false);
        assert_eq!(is_break(AL, BA), false);
        assert_eq!(is_break(BB, AL), false);
        assert_eq!(is_break(ID, BA), false);
        assert_eq!(is_break(ID, NS), false);
        assert_eq!(is_break(SY, HL), false);
        assert_eq!(is_break(AL, IN), false);
        assert_eq!(is_break(AL, NU), false);
        assert_eq!(is_break(HL, NU), false);
        assert_eq!(is_break(PR, ID), false);
        assert_eq!(is_break(PR, EB), false);
        assert_eq!(is_break(PR, EM), false);
        assert_eq!(is_break(ID, PO), false);
        assert_eq!(is_break(EB, PO), false);
        assert_eq!(is_break(EM, PO), false);
        assert_eq!(is_break(JL, JL), false);
        assert_eq!(is_break(JL, JV), false);
        assert_eq!(is_break(JL, H2), false);
        assert_eq!(is_break(JL, IN), false);
        assert_eq!(is_break(JL, PO), false);
        assert_eq!(is_break(PR, JL), false);
        assert_eq!(is_break(AL, AL), false);
        assert_eq!(is_break(HL, AL), false);
        assert_eq!(is_break(IS, AL), false);
        assert_eq!(is_break(IS, HL), false);
        assert_eq!(is_break(EB, EM), false);
        assert_eq!(is_break(ID, ID), true);
    }

    /// End-to-end checks of the iterators for `str`, Latin-1 and UTF-16.
    ///
    /// Several inputs contain TWO consecutive spaces on purpose: the expected
    /// offsets and the parallel Latin-1/UTF-16 arrays (`0x20, 0x20`) depend
    /// on it.
    #[test]
    fn linebreak() {
        let provider = icu_testdata::get_provider();
        let segmenter = LineBreakSegmenter::try_new(&provider).expect("Data exists");

        let mut iter = segmenter.segment_str("hello world");
        assert_eq!(Some(6), iter.next());
        assert_eq!(Some(11), iter.next());
        assert_eq!(None, iter.next());

        iter = segmenter.segment_str("$10 $10");
        assert_eq!(Some(4), iter.next());
        assert_eq!(Some(7), iter.next());

        // '[' followed by two spaces (10 bytes in total).
        iter = segmenter.segment_str("[  abc def");
        assert_eq!(Some(7), iter.next());
        assert_eq!(Some(10), iter.next());
        assert_eq!(None, iter.next());

        let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
        let mut iter_u8 = segmenter.segment_latin1(&input);
        assert_eq!(Some(7), iter_u8.next());
        assert_eq!(Some(10), iter_u8.next());
        assert_eq!(None, iter_u8.next());

        let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(7), iter_u16.next());

        // Quotation mark, two spaces, opening parenthesis (10 bytes in total).
        iter = segmenter.segment_str("abc\u{0022}  (def");
        assert_eq!(Some(10), iter.next());

        let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
        let mut iter_u8 = segmenter.segment_latin1(&input);
        assert_eq!(Some(10), iter_u8.next());

        let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(10), iter_u16.next());

        iter = segmenter.segment_str("\u{0029}\u{203C}");
        assert_eq!(Some(4), iter.next());
        // Closing parenthesis, two spaces, U+203C (1 + 2 + 3 = 6 bytes).
        iter = segmenter.segment_str("\u{0029}  \u{203C}");
        assert_eq!(Some(6), iter.next());

        let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(4), iter_u16.next());

        iter = segmenter.segment_str("\u{2014}\u{2014}aa");
        assert_eq!(Some(6), iter.next());
        // Em dash, two spaces, em dash (3 + 2 + 3 = 8 bytes before "aa").
        iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
        assert_eq!(Some(8), iter.next());

        // Two em dashes, two spaces, two em dashes (6 + 2 + 6 = 14 bytes).
        iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
        assert_eq!(Some(14), iter.next());
        assert_eq!(Some(18), iter.next());
        assert_eq!(Some(21), iter.next());

        let mut iter = segmenter.segment_str("(0,1)+(2,3)");
        assert_eq!(Some(11), iter.next());

        let input: [u16; 11] = [
            0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
        ];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(11), iter_u16.next());

        let input: [u16; 13] = [
            0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
        ];
        let mut iter_u16 = segmenter.segment_utf16(&input);
        assert_eq!(Some(6), iter_u16.next());

        // A single space here: 4 + 1 = 5 bytes before the second U+1F3FB.
        iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
        assert_eq!(Some(5), iter.next());
    }
}