use crate::complex::*;
use crate::indices::*;
use crate::provider::*;
use crate::rule_segmenter::*;
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use core::char;
use icu_locale_core::subtags::{language, Language};
use icu_locale_core::LanguageIdentifier;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;
// Line-break property identifiers.
//
// Each constant is the `u8` code that this file compares against values read
// from `RuleBreakData::property_table` (see
// `get_linebreak_property_utf32_with_rule`) and uses as an index into the
// break-state table (see `get_break_state_from_table`).
//
// NOTE(review): the names look like UAX #14 line-breaking classes plus a few
// refinements (`AL_DOTTED_CIRCLE`, `ID_CN`, `OP_EA`/`OP_OP30`, `PO_EAW`,
// `PR_EAW`, `QU_PF`/`QU_PI`), and the numeric values presumably have to stay in
// sync with the generated segmenter data — confirm before renumbering.
//
// Not every constant is referenced in this file, hence `#[allow(dead_code)]`.
#[allow(dead_code)]
const UNKNOWN: u8 = 0;
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AK: u8 = 2;
#[allow(dead_code)]
const AL: u8 = 3;
#[allow(dead_code)]
const AL_DOTTED_CIRCLE: u8 = 4;
#[allow(dead_code)]
const AP: u8 = 5;
#[allow(dead_code)]
const AS: u8 = 6;
#[allow(dead_code)]
const B2: u8 = 7;
#[allow(dead_code)]
const BA: u8 = 8;
#[allow(dead_code)]
const BB: u8 = 9;
#[allow(dead_code)]
const BK: u8 = 10;
#[allow(dead_code)]
const CB: u8 = 11;
#[allow(dead_code)]
const CJ: u8 = 12;
#[allow(dead_code)]
const CL: u8 = 13;
#[allow(dead_code)]
const CM: u8 = 14;
#[allow(dead_code)]
const CP: u8 = 15;
#[allow(dead_code)]
const CR: u8 = 16;
#[allow(dead_code)]
const EB: u8 = 17;
#[allow(dead_code)]
const EM: u8 = 18;
#[allow(dead_code)]
const EX: u8 = 19;
#[allow(dead_code)]
const GL: u8 = 20;
#[allow(dead_code)]
const H2: u8 = 21;
#[allow(dead_code)]
const H3: u8 = 22;
#[allow(dead_code)]
const HL: u8 = 23;
#[allow(dead_code)]
const HY: u8 = 24;
#[allow(dead_code)]
const ID: u8 = 25;
#[allow(dead_code)]
const ID_CN: u8 = 26;
#[allow(dead_code)]
const IN: u8 = 27;
#[allow(dead_code)]
const IS: u8 = 28;
#[allow(dead_code)]
const JL: u8 = 29;
#[allow(dead_code)]
const JT: u8 = 30;
#[allow(dead_code)]
const JV: u8 = 31;
#[allow(dead_code)]
const LF: u8 = 32;
#[allow(dead_code)]
const NL: u8 = 33;
#[allow(dead_code)]
const NS: u8 = 34;
#[allow(dead_code)]
const NU: u8 = 35;
#[allow(dead_code)]
const OP_EA: u8 = 36;
#[allow(dead_code)]
const OP_OP30: u8 = 37;
#[allow(dead_code)]
const PO: u8 = 38;
#[allow(dead_code)]
const PO_EAW: u8 = 39;
#[allow(dead_code)]
const PR: u8 = 40;
#[allow(dead_code)]
const PR_EAW: u8 = 41;
#[allow(dead_code)]
const QU: u8 = 42;
#[allow(dead_code)]
const QU_PF: u8 = 43;
#[allow(dead_code)]
const QU_PI: u8 = 44;
#[allow(dead_code)]
const RI: u8 = 45;
#[allow(dead_code)]
const SA: u8 = 46;
#[allow(dead_code)]
const SP: u8 = 47;
#[allow(dead_code)]
const SY: u8 = 48;
#[allow(dead_code)]
const VF: u8 = 49;
#[allow(dead_code)]
const VI: u8 = 50;
#[allow(dead_code)]
const WJ: u8 = 51;
#[allow(dead_code)]
const XX: u8 = 52;
#[allow(dead_code)]
const ZW: u8 = 53;
#[allow(dead_code)]
const ZWJ: u8 = 54;
/// How strictly line-break opportunities are restricted.
///
/// The default is [`LineBreakStrictness::Strict`], both for the derived
/// `Default` and when `LineBreakOptions::strictness` is `None`.
///
/// NOTE(review): the variants appear to mirror the values of the CSS
/// `line-break` property — confirm against the crate-level documentation.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakStrictness {
/// Least restrictive rule set: `CJ` characters are looked up as `ID`, and the
/// iterator consults `is_break_utf32_by_loose` for additional break
/// opportunities (some of which only apply to `ja`/`zh` content).
Loose,
/// `CJ` characters are looked up as `ID`; for `ja`/`zh` content a break is
/// additionally allowed before U+301C and U+30A0.
Normal,
/// The rule table is applied without the adjustments above.
#[default]
Strict,
/// The iterator reports a break at every position that is not absorbed by
/// the combining-mark handling, without consulting the rule table.
Anywhere,
}
/// How break opportunities inside words are treated.
///
/// The default is [`LineBreakWordOption::Normal`].
///
/// NOTE(review): the variants appear to mirror the values of the CSS
/// `word-break` property — confirm against the crate-level documentation.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakWordOption {
/// No word-level adjustment is made to the rule table result.
#[default]
Normal,
/// A left-hand character classified `AL`, `NU` or `SA` is treated as `ID`,
/// `CJ` is looked up as `ID`, and complex-script (dictionary/LSTM)
/// segmentation is skipped.
BreakAll,
/// No break is reported between two characters that both belong to the
/// letter-like classes listed in the iterator (`AI`, `AL`, `ID`, `NU`, `HY`,
/// `H2`, `H3`, `JL`, `JV`, `JT`, `CJ`).
KeepAll,
}
/// User-facing options for the line segmenter.
///
/// Every field is optional; unset fields are replaced by defaults when the
/// options are resolved (see `LineBreakOptions::resolve`).
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct LineBreakOptions<'a> {
/// Strictness of the rule set; `None` resolves to `LineBreakStrictness::Strict`.
pub strictness: Option<LineBreakStrictness>,
/// Word-level behaviour; `None` resolves to `LineBreakWordOption::Normal`.
pub word_option: Option<LineBreakWordOption>,
/// Locale of the content. Only its language subtag is inspected, to detect
/// Japanese (`ja`) or Chinese (`zh`) content.
pub content_locale: Option<&'a LanguageIdentifier>,
}
impl LineBreakOptions<'_> {
    /// `const` counterpart of the derived `Default`: every option is left
    /// unset so that the resolved defaults apply.
    pub const fn default() -> Self {
        let strictness = None;
        let word_option = None;
        let content_locale = None;
        Self {
            strictness,
            word_option,
            content_locale,
        }
    }
}
/// `LineBreakOptions` with every default filled in and the locale reduced to
/// the single flag the iterator needs.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
// Effective strictness (never optional once resolved).
strictness: LineBreakStrictness,
// Effective word option (never optional once resolved).
word_option: LineBreakWordOption,
// True when the content locale's language is `ja` or `zh`.
ja_zh: bool,
}
impl LineBreakOptions<'_> {
    /// Fills in defaults (`Strict`, `Normal`) for unset options and collapses
    /// the content locale into the `ja_zh` flag.
    const fn resolve(self) -> ResolvedLineBreakOptions {
        const JA: Language = language!("ja");
        const ZH: Language = language!("zh");

        let strictness = if let Some(chosen) = self.strictness {
            chosen
        } else {
            LineBreakStrictness::Strict
        };
        let word_option = if let Some(chosen) = self.word_option {
            chosen
        } else {
            LineBreakWordOption::Normal
        };
        // Only the language subtag matters; script and region are ignored.
        let ja_zh = match self.content_locale {
            Some(locale) => matches!(locale.language, JA | ZH),
            None => false,
        };

        ResolvedLineBreakOptions {
            strictness,
            word_option,
            ja_zh,
        }
    }
}
/// Owning line segmenter: resolved options plus the data payloads needed to
/// compute line-break opportunities.
///
/// Segmentation itself is performed through the borrowed view returned by
/// `LineSegmenter::as_borrowed`.
#[derive(Debug)]
pub struct LineSegmenter {
// Options with defaults already applied.
options: ResolvedLineBreakOptions,
// Rule-based line-break data (property table and break-state table).
payload: DataPayload<SegmenterBreakLineV1>,
// Payloads used for complex-script segmentation (dictionary / LSTM).
complex: ComplexPayloads,
}
/// Cheap, copyable view of a line segmenter that borrows its data for `'data`.
///
/// This is the type that exposes the `segment_*` methods.
#[derive(Clone, Debug, Copy)]
pub struct LineSegmenterBorrowed<'data> {
// Options with defaults already applied.
options: ResolvedLineBreakOptions,
// Borrowed rule-based line-break data.
data: &'data RuleBreakData<'data>,
// Borrowed complex-script payloads.
complex: ComplexPayloadsBorrowed<'data>,
}
// Constructors and data-loading helpers.
//
// Each compiled-data constructor (`new_*`) returns a `'static` borrowed
// segmenter; each `try_new_*_unstable` constructor loads from a caller-supplied
// provider and returns an owned `LineSegmenter`.
impl LineSegmenter {
/// Creates a segmenter from compiled data using the "auto" complex-script
/// model. This currently just delegates to [`Self::new_lstm`].
#[cfg(feature = "auto")]
#[cfg(feature = "compiled_data")]
pub fn new_auto(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
Self::new_lstm(options)
}
// Macro-generated buffer-provider constructor for the "auto" flavour.
// NOTE(review): `new_auto: skip` presumably tells the macro that the
// compiled-data constructor is written by hand above — confirm in the
// `icu_provider` macro documentation.
#[cfg(feature = "auto")]
icu_provider::gen_buffer_data_constructors!(
(options: LineBreakOptions) -> error: DataError,
functions: [
new_auto: skip,
try_new_auto_with_buffer_provider,
try_new_auto_unstable,
Self,
]
);
// Provider-based "auto" constructor; delegates to `try_new_lstm_unstable`.
#[cfg(feature = "auto")]
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
pub fn try_new_auto_unstable<D>(
provider: &D,
options: LineBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakLineV1>
+ DataProvider<SegmenterLstmAutoV1>
+ DataProvider<SegmenterBreakGraphemeClusterV1>
+ ?Sized,
{
Self::try_new_lstm_unstable(provider, options)
}
/// Creates a segmenter from compiled data and loads the LSTM models for
/// complex scripts on top of the rule-based data.
#[cfg(feature = "lstm")]
#[cfg(feature = "compiled_data")]
pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
let mut s = Self::new_for_non_complex_scripts(options);
s.load_lstm();
s
}
// Macro-generated buffer-provider constructor for the LSTM flavour.
// NOTE(review): the skipped entry is spelled `try_new_lstm` while the
// hand-written compiled-data constructor is `new_lstm`; the sibling
// invocations use the `new_*` name. It is skipped either way — confirm
// whether the spelling matters to the macro.
#[cfg(feature = "lstm")]
icu_provider::gen_buffer_data_constructors!(
(options: LineBreakOptions) -> error: DataError,
functions: [
try_new_lstm: skip,
try_new_lstm_with_buffer_provider,
try_new_lstm_unstable,
Self,
]
);
// Provider-based LSTM constructor: builds the rule-only segmenter first,
// then loads the LSTM payloads from the same provider.
#[cfg(feature = "lstm")]
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
pub fn try_new_lstm_unstable<D>(
provider: &D,
options: LineBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakLineV1>
+ DataProvider<SegmenterLstmAutoV1>
+ DataProvider<SegmenterBreakGraphemeClusterV1>
+ ?Sized,
{
let mut s = Self::try_new_for_non_complex_scripts_unstable(provider, options)?;
s.load_lstm_unstable(provider)?;
Ok(s)
}
/// Creates a segmenter from compiled data and loads the dictionaries for
/// complex scripts on top of the rule-based data.
#[cfg(feature = "compiled_data")]
pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
let mut s = Self::new_for_non_complex_scripts(options);
s.load_dictionary();
s
}
// Macro-generated buffer-provider constructor for the dictionary flavour.
icu_provider::gen_buffer_data_constructors!(
(options: LineBreakOptions) -> error: DataError,
functions: [
new_dictionary: skip,
try_new_dictionary_with_buffer_provider,
try_new_dictionary_unstable,
Self,
]
);
// Provider-based dictionary constructor: builds the rule-only segmenter
// first, then loads the dictionary payloads from the same provider.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
pub fn try_new_dictionary_unstable<D>(
provider: &D,
options: LineBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakLineV1>
+ DataProvider<SegmenterDictionaryExtendedV1>
+ DataProvider<SegmenterBreakGraphemeClusterV1>
+ ?Sized,
{
let mut s = Self::try_new_for_non_complex_scripts_unstable(provider, options)?;
s.load_dictionary_unstable(provider)?;
Ok(s)
}
/// Creates a segmenter from compiled data with only the rule-based data
/// and freshly constructed (`ComplexPayloadsBorrowed::new()`) complex
/// payloads; no dictionary or LSTM model is loaded here.
#[cfg(feature = "compiled_data")]
pub const fn new_for_non_complex_scripts(
options: LineBreakOptions,
) -> LineSegmenterBorrowed<'static> {
LineSegmenterBorrowed {
options: options.resolve(),
data: Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
complex: ComplexPayloadsBorrowed::new(),
}
}
// Macro-generated buffer-provider constructor for the rule-only flavour.
icu_provider::gen_buffer_data_constructors!(
(options: LineBreakOptions) -> error: DataError,
functions: [
new_for_non_complex_scripts: skip,
try_new_for_non_complex_scripts_with_buffer_provider,
try_new_for_non_complex_scripts_unstable,
Self,
]
);
// Provider-based rule-only constructor. Returns the provider's `DataError`
// if either the line-break payload or the complex payloads fail to load.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
pub fn try_new_for_non_complex_scripts_unstable<D>(
provider: &D,
options: LineBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakLineV1>
+ DataProvider<SegmenterBreakGraphemeClusterV1>
+ ?Sized,
{
Ok(Self {
options: options.resolve(),
payload: provider.load(Default::default())?.payload,
complex: ComplexPayloads::try_new(provider)?,
})
}
/// Loads the LSTM models into this segmenter from `provider`.
///
/// # Errors
///
/// Propagates the `DataError` reported while loading.
#[cfg(feature = "lstm")]
pub fn load_lstm_unstable<D>(&mut self, provider: &D) -> Result<(), DataError>
where
D: DataProvider<SegmenterLstmAutoV1> + ?Sized,
{
self.complex.with_southeast_asian_lstms(provider)?;
Ok(())
}
/// Buffer-provider variant of [`Self::load_lstm_unstable`]; the provider
/// is wrapped with `as_deserializing()` first.
#[cfg(feature = "serde")]
#[cfg(feature = "lstm")]
pub fn load_lstm_with_buffer_provider(
&mut self,
provider: &(impl BufferProvider + ?Sized),
) -> Result<(), DataError> {
self.load_lstm_unstable(&provider.as_deserializing())
}
/// Loads the dictionaries into this segmenter from `provider`.
///
/// # Errors
///
/// Propagates the `DataError` reported while loading.
pub fn load_dictionary_unstable<D>(&mut self, provider: &D) -> Result<(), DataError>
where
D: DataProvider<SegmenterDictionaryExtendedV1> + ?Sized,
{
self.complex.with_southeast_asian_dictionaries(provider)?;
Ok(())
}
/// Buffer-provider variant of [`Self::load_dictionary_unstable`]; the
/// provider is wrapped with `as_deserializing()` first.
#[cfg(feature = "serde")]
pub fn load_dictionary_with_buffer_provider(
&mut self,
provider: &(impl BufferProvider + ?Sized),
) -> Result<(), DataError> {
self.load_dictionary_unstable(&provider.as_deserializing())
}
/// Returns a borrowed, copyable view of this segmenter; use it to call the
/// `segment_*` methods.
pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_> {
LineSegmenterBorrowed {
options: self.options,
data: self.payload.get(),
complex: self.complex.as_borrowed(),
}
}
}
impl<'data> LineSegmenterBorrowed<'data> {
    /// Returns a copy of this segmenter that uses `options` (resolved with the
    /// usual defaults) instead of the options it was created with.
    #[doc(hidden)]
    pub fn with_options(self, options: LineBreakOptions) -> Self {
        let mut updated = self;
        updated.options = options.resolve();
        updated
    }

    /// Creates a line-break iterator over a `str`.
    ///
    /// Reported positions are byte offsets into `input`.
    pub fn segment_str<'s>(self, input: &'s str) -> LineBreakIterator<'data, 's, Utf8> {
        let Self {
            options,
            data,
            complex,
        } = self;
        LineBreakIterator {
            iter: input.char_indices(),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            options,
            complex,
        }
    }

    /// Creates a line-break iterator over bytes that are expected to be UTF-8
    /// but may be ill-formed; decoding is delegated to `Utf8CharIndices`.
    ///
    /// Reported positions are byte offsets into `input`.
    pub fn segment_utf8<'s>(
        self,
        input: &'s [u8],
    ) -> LineBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
        let Self {
            options,
            data,
            complex,
        } = self;
        LineBreakIterator {
            iter: Utf8CharIndices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            options,
            complex,
        }
    }

    /// Creates a line-break iterator over Latin-1 text (one byte per
    /// character).
    ///
    /// Reported positions are byte offsets into `input`.
    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> LineBreakIterator<'data, 's, Latin1> {
        let Self {
            options,
            data,
            complex,
        } = self;
        LineBreakIterator {
            iter: Latin1Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            options,
            complex,
        }
    }

    /// Creates a line-break iterator over UTF-16 code units.
    ///
    /// Reported positions are code-unit offsets into `input`.
    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> LineBreakIterator<'data, 's, Utf16> {
        let Self {
            options,
            data,
            complex,
        } = self;
        LineBreakIterator {
            iter: Utf16Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            options,
            complex,
        }
    }
}
impl LineSegmenterBorrowed<'static> {
    /// Loads the compiled-data LSTM models into this segmenter.
    #[cfg(feature = "lstm")]
    #[cfg(feature = "compiled_data")]
    pub fn load_lstm(&mut self) {
        self.complex.with_southeast_asian_lstms();
    }

    /// Loads the compiled-data dictionaries into this segmenter.
    #[cfg(feature = "compiled_data")]
    pub fn load_dictionary(&mut self) {
        self.complex.with_southeast_asian_dictionaries();
    }

    /// Converts this `'static` view into an owned [`LineSegmenter`] that wraps
    /// the same static data.
    pub fn static_to_owned(self) -> LineSegmenter {
        let Self {
            options,
            data,
            complex,
        } = self;
        LineSegmenter {
            options,
            payload: DataPayload::from_static_ref(data),
            complex: complex.static_to_owned(),
        }
    }
}
impl RuleBreakData<'_> {
fn get_linebreak_property_utf32_with_rule(
&self,
codepoint: u32,
strictness: LineBreakStrictness,
word_option: LineBreakWordOption,
) -> u8 {
let prop = self.property_table.get32(codepoint);
if word_option == LineBreakWordOption::BreakAll
|| strictness == LineBreakStrictness::Loose
|| strictness == LineBreakStrictness::Normal
{
return match prop {
CJ => ID, _ => prop,
};
}
prop
}
#[inline]
fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
let idx = (left as usize) * (self.property_count as usize) + (right as usize);
self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
}
#[inline]
fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
let line_break_property = self.get_linebreak_property_utf32_with_rule(
codepoint,
LineBreakStrictness::Strict,
LineBreakWordOption::Normal,
);
line_break_property == SA
}
}
/// Extra break decisions that apply under `LineBreakStrictness::Loose`.
///
/// Returns `Some(true)` to force a break before `right_codepoint`,
/// `Some(false)` to forbid it, and `None` when the loose rules have no opinion
/// and the regular rule table should decide. Results that are `Some(ja_zh)`
/// permit the break only for Japanese/Chinese content.
#[inline]
fn is_break_utf32_by_loose(
    right_codepoint: u32,
    left_prop: u8,
    right_prop: u8,
    ja_zh: bool,
) -> Option<bool> {
    // Decisions keyed on the class (and sometimes the identity) of the
    // right-hand character.
    let by_right = match right_prop {
        // Hyphens U+2010 / U+2013 following an ideograph.
        BA if left_prop == ID && matches!(right_codepoint, 0x2010 | 0x2013) => Some(true),
        NS => match right_codepoint {
            // U+301C and U+30A0.
            0x301C | 0x30A0 => Some(ja_zh),
            // U+3005, U+303B, U+309D, U+309E, U+30FD, U+30FE.
            0x3005 | 0x303B | 0x309D | 0x309E | 0x30FD | 0x30FE => Some(true),
            // U+30FB, U+FF1A, U+FF1B, U+FF65, U+203C, U+2047..=U+2049.
            0x30FB | 0xFF1A | 0xFF1B | 0xFF65 | 0x203C | 0x2047..=0x2049 => Some(ja_zh),
            _ => None,
        },
        IN => Some(true),
        // Fullwidth exclamation / question marks.
        EX if matches!(right_codepoint, 0xFF01 | 0xFF1F) => Some(ja_zh),
        PO_EAW => Some(ja_zh),
        _ => None,
    };
    if by_right.is_some() {
        return by_right;
    }
    // Otherwise the only remaining loose rule depends on the left-hand class.
    (left_prop == PR_EAW).then_some(ja_zh)
}
/// Encoding-specific hooks used by [`LineBreakIterator`].
///
/// The trait is sealed (`crate::private::Sealed`), so it can only be
/// implemented inside this crate; this file implements it for `Utf8`,
/// `PotentiallyIllFormedUtf8`, `Latin1` and `Utf16`.
pub trait LineBreakType: crate::private::Sealed + Sized + RuleBreakType {
/// Whether `c` must be handled by the complex-script segmenter rather than
/// by the rule table.
#[doc(hidden)]
fn use_complex_breaking(iterator: &LineBreakIterator<'_, '_, Self>, c: Self::CharType) -> bool;
/// Line-break property of `c`, taking the iterator's options into account.
#[doc(hidden)]
fn get_linebreak_property_with_rule(
iterator: &LineBreakIterator<'_, '_, Self>,
c: Self::CharType,
) -> u8;
/// Segments the complex-script run that starts at `left_codepoint` and
/// returns the first break position, if any. The iterator is expected to be
/// positioned on the character following `left_codepoint`.
#[doc(hidden)]
fn line_handle_complex_language(
iterator: &mut LineBreakIterator<'_, '_, Self>,
left_codepoint: Self::CharType,
) -> Option<usize>;
}
/// Iterator over line-break opportunities, yielding positions in the units of
/// the underlying input (bytes or UTF-16 code units).
///
/// `'data` is the lifetime of the segmenter data and `'s` that of the input.
#[derive(Debug)]
pub struct LineBreakIterator<'data, 's, Y: LineBreakType> {
// Encoding-specific `(position, character)` iterator over the input.
iter: Y::IterAttr<'s>,
// Input length; returned as the final boundary. It is also overwritten with
// 1 by `check_eof` to mark that the boundary of an empty input was emitted.
len: usize,
// Item most recently pulled from `iter`; `None` before the first advance
// and after the input is exhausted.
current_pos_data: Option<(usize, Y::CharType)>,
// Pending break offsets produced by the complex-script segmenter, stored
// relative to the current position.
result_cache: Vec<usize>,
// Rule-based line-break data.
data: &'data RuleBreakData<'data>,
// Options with defaults applied.
options: ResolvedLineBreakOptions,
// Complex-script (dictionary / LSTM) payloads.
complex: ComplexPayloadsBorrowed<'data>,
}
impl<Y: LineBreakType> Iterator for LineBreakIterator<'_, '_, Y> {
type Item = usize;
/// Returns the next line-break opportunity.
///
/// The first call yields `0`; the last yielded value is the input length;
/// afterwards `None` is returned.
fn next(&mut self) -> Option<Self::Item> {
// Handle the very first call (boundary at 0) and exhaustion.
match self.check_eof() {
StringBoundaryPosType::Start => return Some(0),
StringBoundaryPosType::End => return None,
_ => (),
}
// If an earlier complex-script run left break offsets behind, replay the
// first one before doing any rule-based work.
if let Some(&first_pos) = self.result_cache.first() {
// `i` counts the units consumed since the cache was last rebased.
let mut i = 0;
loop {
if i == first_pos {
// Drop the offset we are returning and rebase the rest onto the
// current position.
self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
return self.get_current_position();
}
i += self.get_current_codepoint().map_or(0, Y::char_len);
self.advance_iter();
if self.is_eof() {
self.result_cache.clear();
return Some(self.len);
}
}
}
// Property of the base character while a run of CM/ZWJ is being absorbed
// into it (the run is treated as having the class of its base).
let mut lb9_left: Option<u8> = None;
// Whether the most recently absorbed character was a ZWJ, in which case a
// break at the following position must be suppressed.
let mut lb8a_after_lb9 = false;
'a: loop {
debug_assert!(!self.is_eof());
let left_codepoint = self.get_current_codepoint()?;
let mut left_prop =
lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
self.advance_iter();
// Running off the end of the input: the final boundary is the length.
let Some(right_codepoint) = self.get_current_codepoint() else {
return Some(self.len);
};
let right_prop = self.get_linebreak_property(right_codepoint);
// Absorb a combining mark (or ZWJ, except in `Anywhere` mode) into the
// character on its left, unless that character is a hard break, space
// or ZW.
if (right_prop == CM
|| (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
&& left_prop != BK
&& left_prop != CR
&& left_prop != LF
&& left_prop != NL
&& left_prop != SP
&& left_prop != ZW
{
lb9_left = Some(left_prop);
lb8a_after_lb9 = right_prop == ZWJ;
continue;
} else {
lb9_left = None;
lb8a_after_lb9 = false;
}
// Word-option handling.
match (self.options.word_option, left_prop, right_prop) {
// BreakAll: letters, digits and SA characters on the left behave
// like ideographs.
(LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
left_prop = ID;
}
// KeepAll: never break between two letter-like characters.
(
LineBreakWordOption::KeepAll,
AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
) => {
continue;
}
_ => (),
}
// Strictness handling.
match self.options.strictness {
LineBreakStrictness::Normal => {
if self.is_break_by_normal(right_codepoint) && !after_zwj {
return self.get_current_position();
}
}
LineBreakStrictness::Loose => {
// `Some(_)` means the loose rules decided; `None` falls through
// to the rule table below.
if let Some(breakable) = is_break_utf32_by_loose(
right_codepoint.into(),
left_prop,
right_prop,
self.options.ja_zh,
) {
if breakable && !after_zwj {
return self.get_current_position();
}
continue;
}
}
LineBreakStrictness::Anywhere => {
return self.get_current_position();
}
_ => (),
};
// Two adjacent SA characters are handed to the complex-script
// segmenter (skipped in BreakAll mode).
if self.options.word_option != LineBreakWordOption::BreakAll
&& Y::use_complex_breaking(self, left_codepoint)
&& Y::use_complex_breaking(self, right_codepoint)
{
let result = Y::line_handle_complex_language(self, left_codepoint);
if result.is_some() {
return result;
}
}
// Rule table lookup for the (left, right) pair.
match self.data.get_break_state_from_table(left_prop, right_prop) {
BreakState::Break | BreakState::NoMatch => {
if after_zwj {
continue;
} else {
return self.get_current_position();
}
}
BreakState::Keep => continue,
// The pair starts a multi-character rule: `index` is the state to
// continue from, and more input is needed to decide.
BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
// Snapshot to roll back to if the longer rule does not match.
let mut previous_iter = self.iter.clone();
let mut previous_pos_data = self.current_pos_data;
let mut previous_is_after_zwj = after_zwj;
// Class of the previous character in this lookahead, used for the
// CM/ZWJ absorption check; `index` itself carries the rule state.
let mut left_prop_pre_lb9 = right_prop;
// Whether the state we start from is an intermediate rule state
// rather than a plain code point property (states above
// `last_codepoint_property`), or follows an absorbed ZWJ.
let is_intermediate_rule_no_match = if lb8a_after_lb9 {
true
} else {
index > self.data.last_codepoint_property
};
loop {
self.advance_iter();
let after_zwj = left_prop_pre_lb9 == ZWJ;
let previous_break_state_is_cp_prop =
index <= self.data.last_codepoint_property;
let Some(prop) = self.get_current_linebreak_property() else {
// End of input while inside a multi-character rule: ask the
// table what happens at end-of-text.
let break_state = self
.data
.get_break_state_from_table(index, self.data.eot_property);
if break_state == BreakState::NoMatch {
// The rule did not complete; roll back to the snapshot.
self.iter = previous_iter;
self.current_pos_data = previous_pos_data;
if previous_is_after_zwj {
continue 'a;
} else {
return self.get_current_position();
}
}
return Some(self.len);
};
// Same CM/ZWJ absorption as in the outer loop; `index` is left
// untouched so the absorbed character does not change the state.
if (prop == CM || prop == ZWJ)
&& left_prop_pre_lb9 != BK
&& left_prop_pre_lb9 != CR
&& left_prop_pre_lb9 != LF
&& left_prop_pre_lb9 != NL
&& left_prop_pre_lb9 != SP
&& left_prop_pre_lb9 != ZW
{
left_prop_pre_lb9 = prop;
continue;
}
match self.data.get_break_state_from_table(index, prop) {
BreakState::Keep => continue 'a,
BreakState::NoMatch => {
// Roll back to the snapshot, then decide whether that
// position is a break, honouring ZWJ suppression.
self.iter = previous_iter;
self.current_pos_data = previous_pos_data;
if after_zwj {
if is_intermediate_rule_no_match && !previous_is_after_zwj {
return self.get_current_position();
}
continue 'a;
} else if previous_is_after_zwj {
continue 'a;
} else {
return self.get_current_position();
}
}
BreakState::Break => {
if after_zwj {
continue 'a;
} else {
return self.get_current_position();
}
}
BreakState::Intermediate(i) => {
// Move to the next state and refresh the snapshot.
index = i;
previous_iter = self.iter.clone();
previous_pos_data = self.current_pos_data;
previous_is_after_zwj = after_zwj;
}
BreakState::Index(i) => {
index = i;
// Only refresh the snapshot when the state we came from was
// a plain code point property.
if previous_break_state_is_cp_prop {
previous_iter = self.iter.clone();
previous_pos_data = self.current_pos_data;
previous_is_after_zwj = after_zwj;
}
}
}
left_prop_pre_lb9 = prop;
}
}
}
}
}
}
// Where the iterator stands relative to the input, as reported by
// `LineBreakIterator::check_eof`.
enum StringBoundaryPosType {
// First call: the boundary at offset 0 must be emitted.
Start,
// A current character is available; normal processing applies.
Middle,
// The input is exhausted and every boundary has been emitted.
End,
}
impl<Y: LineBreakType> LineBreakIterator<'_, '_, Y> {
    /// Pulls the next `(position, character)` pair from the input.
    fn advance_iter(&mut self) {
        self.current_pos_data = self.iter.next();
    }

    /// True when there is no current character (before the first advance or
    /// after the input has been exhausted).
    fn is_eof(&self) -> bool {
        matches!(self.current_pos_data, None)
    }

    /// Classifies the iterator position at the start of `next()`.
    ///
    /// When there is no current character this advances once, which
    /// distinguishes the very first call (`Start`) from exhaustion (`End`).
    #[inline]
    fn check_eof(&mut self) -> StringBoundaryPosType {
        if !self.is_eof() {
            return StringBoundaryPosType::Middle;
        }
        self.advance_iter();
        if !self.is_eof() {
            return StringBoundaryPosType::Start;
        }
        if self.len != 0 {
            return StringBoundaryPosType::End;
        }
        // Empty input: `len` is not needed for anything else, so it is reused
        // as the "boundary 0 already emitted" marker.
        self.len = 1;
        StringBoundaryPosType::Start
    }

    /// Position of the current character, if any.
    fn get_current_position(&self) -> Option<usize> {
        let (position, _) = self.current_pos_data?;
        Some(position)
    }

    /// The current character, if any.
    fn get_current_codepoint(&self) -> Option<Y::CharType> {
        let (_, character) = self.current_pos_data?;
        Some(character)
    }

    /// Line-break property of `codepoint` under the iterator's options.
    fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
        Y::get_linebreak_property_with_rule(self, codepoint)
    }

    /// Line-break property of the current character, if any.
    fn get_current_linebreak_property(&self) -> Option<u8> {
        let current = self.get_current_codepoint()?;
        Some(self.get_linebreak_property(current))
    }

    /// Extra break rule for `LineBreakStrictness::Normal`: a break is allowed
    /// before U+301C and U+30A0, but only for Japanese/Chinese content.
    fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
        let scalar: u32 = codepoint.into();
        if matches!(scalar, 0x301C | 0x30A0) {
            self.options.ja_zh
        } else {
            false
        }
    }
}
impl LineBreakType for Utf8 {
fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
iterator.data.get_linebreak_property_utf32_with_rule(
c as u32,
iterator.options.strictness,
iterator.options.word_option,
)
}
#[inline]
fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
iterator.data.use_complex_breaking_utf32(c as u32)
}
fn line_handle_complex_language(
iter: &mut LineBreakIterator<'_, '_, Self>,
left_codepoint: char,
) -> Option<usize> {
line_handle_complex_language_utf8(iter, left_codepoint)
}
}
impl LineBreakType for PotentiallyIllFormedUtf8 {
fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
iterator.data.get_linebreak_property_utf32_with_rule(
c as u32,
iterator.options.strictness,
iterator.options.word_option,
)
}
#[inline]
fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
iterator.data.use_complex_breaking_utf32(c as u32)
}
fn line_handle_complex_language(
iter: &mut LineBreakIterator<'_, '_, Self>,
left_codepoint: char,
) -> Option<usize> {
line_handle_complex_language_utf8(iter, left_codepoint)
}
}
/// Complex-script handling shared by the `char`-based iterators.
///
/// Collects the run of complex-script characters that starts at
/// `left_codepoint` (the iterator is positioned on the character after it),
/// asks the complex segmenter for break offsets within that run, stores them
/// in `iter.result_cache`, and then walks the iterator forward to the first
/// offset.
///
/// Returns the position of the first break, `Some(iter.len)` if the input ends
/// before it is reached, or `None` if the segmenter produced no breaks.
fn line_handle_complex_language_utf8<T>(
    iter: &mut LineBreakIterator<'_, '_, T>,
    left_codepoint: char,
) -> Option<usize>
where
    T: LineBreakType<CharType = char>,
{
    // Remember where we are so the iterator can be rewound after the scan.
    let saved_iter = iter.iter.clone();
    let saved_pos = iter.current_pos_data;

    // Gather the whole run, starting with the character already consumed.
    let mut run = String::from(left_codepoint);
    loop {
        debug_assert!(!iter.is_eof());
        run.push(iter.get_current_codepoint()?);
        iter.advance_iter();
        match iter.get_current_codepoint() {
            Some(next) if T::use_complex_breaking(iter, next) => {}
            _ => break,
        }
    }

    // Rewind to the character right after `left_codepoint`.
    iter.iter = saved_iter;
    iter.current_pos_data = saved_pos;

    iter.result_cache = iter.complex.complex_language_segment_str(&run);
    let first_pos = *iter.result_cache.first()?;

    // Offsets are relative to the start of `run`, which includes
    // `left_codepoint`, so that character's bytes are already consumed.
    let mut consumed = left_codepoint.len_utf8();
    while consumed != first_pos {
        debug_assert!(
            consumed < first_pos,
            "we should always arrive at first_pos: near index {:?}",
            iter.get_current_position()
        );
        consumed += iter.get_current_codepoint().map_or(0, T::char_len);
        iter.advance_iter();
        if iter.is_eof() {
            iter.result_cache.clear();
            return Some(iter.len);
        }
    }

    // Drop the offset being returned and rebase the remaining ones onto the
    // current position.
    iter.result_cache = iter
        .result_cache
        .iter()
        .skip(1)
        .map(|r| r - consumed)
        .collect();
    iter.get_current_position()
}
impl LineBreakType for Latin1 {
    /// Plain property-table lookup; the option-dependent `CJ` remapping is not
    /// applied for Latin-1 input.
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
        let codepoint = u32::from(c);
        iterator.data.property_table.get32(codepoint)
    }

    /// Latin-1 input is never routed to the complex-script segmenter.
    #[inline]
    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
        false
    }

    /// Never called, because `use_complex_breaking` is always `false`.
    fn line_handle_complex_language(
        _: &mut LineBreakIterator<Self>,
        _: Self::CharType,
    ) -> Option<usize> {
        unreachable!()
    }
}
impl LineBreakType for Utf16 {
    /// Property lookup for a decoded code point, honouring strictness and word
    /// option.
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
        iterator.data.get_linebreak_property_utf32_with_rule(
            c,
            iterator.options.strictness,
            iterator.options.word_option,
        )
    }

    /// A code point needs complex-script handling when its property is `SA`.
    #[inline]
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
        iterator.data.use_complex_breaking_utf32(c)
    }

    /// Collects the run of complex-script code points that starts at
    /// `left_codepoint` (the iterator is positioned on the code point after
    /// it), asks the complex segmenter for break offsets within that run,
    /// stores them in `iterator.result_cache`, and walks the iterator forward
    /// to the first offset.
    ///
    /// Returns the position of the first break, `Some(iterator.len)` if the
    /// input ends before it is reached, or `None` if the segmenter produced no
    /// breaks.
    ///
    /// Offsets are in UTF-16 code units. Supplementary-plane code points are
    /// re-encoded as surrogate pairs and counted as two units, so the cached
    /// offsets agree with how `next()` replays them via `char_len`.
    fn line_handle_complex_language(
        iterator: &mut LineBreakIterator<Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        // Appends the UTF-16 encoding of `codepoint`. Values below U+10000
        // (including any lone surrogate the decoder passed through) are a
        // single unit; everything else becomes a surrogate pair.
        fn push_code_units(buf: &mut Vec<u16>, codepoint: u32) {
            if codepoint >= 0x1_0000 {
                let offset = codepoint - 0x1_0000;
                buf.push(0xD800 | ((offset >> 10) & 0x3FF) as u16);
                buf.push(0xDC00 | (offset & 0x3FF) as u16);
            } else {
                buf.push(codepoint as u16);
            }
        }

        // Remember where we are so the iterator can be rewound after the scan.
        let start_iter = iterator.iter.clone();
        let start_point = iterator.current_pos_data;

        // Gather the whole run, starting with the code point already consumed.
        let mut s = Vec::new();
        push_code_units(&mut s, left_codepoint);
        loop {
            debug_assert!(!iterator.is_eof());
            push_code_units(&mut s, iterator.get_current_codepoint()?);
            iterator.advance_iter();
            if let Some(current_codepoint) = iterator.get_current_codepoint() {
                if !Self::use_complex_breaking(iterator, current_codepoint) {
                    break;
                }
            } else {
                break;
            }
        }

        // Rewind to the code point right after `left_codepoint`.
        iterator.iter = start_iter;
        iterator.current_pos_data = start_point;

        let breaks = iterator.complex.complex_language_segment_utf16(&s);
        iterator.result_cache = breaks;
        let first_pos = *iterator.result_cache.first()?;

        // Offsets are relative to the start of `s`, which includes
        // `left_codepoint`, so its code units are already consumed.
        let mut i = Self::char_len(left_codepoint);
        loop {
            if i == first_pos {
                // Drop the offset being returned and rebase the remaining ones
                // onto the current position.
                iterator.result_cache = iterator
                    .result_cache
                    .iter()
                    .skip(1)
                    .map(|r| r - i)
                    .collect();
                return iterator.get_current_position();
            }
            debug_assert!(
                i < first_pos,
                "we should always arrive at first_pos: near index {:?}",
                iterator.get_current_position()
            );
            i += iterator.get_current_codepoint().map_or(0, Self::char_len);
            iterator.advance_iter();
            if iterator.is_eof() {
                iterator.result_cache.clear();
                return Some(iterator.len);
            }
        }
    }
}
#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
use super::*;
use crate::LineSegmenter;
// Spot-checks the property table: each assertion looks up one code point with
// the `Strict`/`Normal` options (so no `CJ` remapping) and compares the result
// with the expected property constant.
#[test]
fn linebreak_property() {
let payload = DataProvider::<SegmenterBreakLineV1>::load(&Baked, Default::default())
.expect("Loading should succeed!")
.payload;
let get_linebreak_property = |codepoint| {
payload.get().get_linebreak_property_utf32_with_rule(
codepoint as u32,
LineBreakStrictness::Strict,
LineBreakWordOption::Normal,
)
};
assert_eq!(get_linebreak_property('\u{0020}'), SP);
assert_eq!(get_linebreak_property('\u{0022}'), QU);
assert_eq!(get_linebreak_property('('), OP_OP30);
assert_eq!(get_linebreak_property('\u{0030}'), NU);
assert_eq!(get_linebreak_property('['), OP_OP30);
assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
assert_eq!(get_linebreak_property('\u{20000}'), ID);
assert_eq!(get_linebreak_property('\u{e0020}'), CM);
assert_eq!(get_linebreak_property('\u{3041}'), CJ);
assert_eq!(get_linebreak_property('\u{0025}'), PO);
assert_eq!(get_linebreak_property('\u{00A7}'), AI);
assert_eq!(get_linebreak_property('\u{50005}'), XX);
assert_eq!(get_linebreak_property('\u{17D6}'), NS);
assert_eq!(get_linebreak_property('\u{2014}'), B2);
}
// Checks individual cells of the pair table. `is_break(left, right)` is true
// when the table entry is `Break` or `NoMatch`; entries that keep the pair
// together or that start a multi-character rule count as "no break" here.
#[test]
#[expect(clippy::bool_assert_comparison)] fn break_rule() {
let payload = DataProvider::<SegmenterBreakLineV1>::load(&Baked, Default::default())
.expect("Loading should succeed!")
.payload;
let lb_data: &RuleBreakData = payload.get();
let is_break = |left, right| {
matches!(
lb_data.get_break_state_from_table(left, right),
BreakState::Break | BreakState::NoMatch
)
};
// Hard line-break classes on the left (CR LF stays together).
assert_eq!(is_break(BK, AL), true);
assert_eq!(is_break(CR, LF), false);
assert_eq!(is_break(CR, AL), true);
assert_eq!(is_break(LF, AL), true);
assert_eq!(is_break(NL, AL), true);
// No break before hard line-break classes, spaces or ZW.
assert_eq!(is_break(AL, BK), false);
assert_eq!(is_break(AL, CR), false);
assert_eq!(is_break(AL, LF), false);
assert_eq!(is_break(AL, NL), false);
assert_eq!(is_break(AL, SP), false);
assert_eq!(is_break(AL, ZW), false);
assert_eq!(is_break(ZWJ, SP), false);
assert_eq!(is_break(SP, CM), true);
// Word joiner and glue.
assert_eq!(is_break(AL, WJ), false);
assert_eq!(is_break(WJ, AL), false);
assert_eq!(is_break(GL, AL), false);
assert_eq!(is_break(AL, GL), false);
assert_eq!(is_break(SP, GL), true);
// Closing punctuation and separators on the right.
assert_eq!(is_break(AL, CL), false);
assert_eq!(is_break(AL, CP), false);
assert_eq!(is_break(AL, EX), false);
assert_eq!(is_break(AL, IS), false);
assert_eq!(is_break(AL, SY), false);
// Break after spaces.
assert_eq!(is_break(SP, AL), true);
// Quotation marks.
assert_eq!(is_break(AL, QU), false);
assert_eq!(is_break(QU, AL), false);
// Contingent break (CB) on either side.
assert_eq!(is_break(AL, CB), true);
assert_eq!(is_break(CB, AL), true);
// Break-after / hyphen / non-starter on the right, break-before on the left.
assert_eq!(is_break(AL, BA), false);
assert_eq!(is_break(AL, HY), false);
assert_eq!(is_break(AL, NS), false);
assert_eq!(is_break(AL, BA), false);
assert_eq!(is_break(BB, AL), false);
assert_eq!(is_break(ID, BA), false);
assert_eq!(is_break(ID, NS), false);
assert_eq!(is_break(SY, HL), false);
// Inseparable, numeric, prefix and postfix contexts.
assert_eq!(is_break(AL, IN), false);
assert_eq!(is_break(AL, NU), false);
assert_eq!(is_break(HL, NU), false);
assert_eq!(is_break(PR, ID), false);
assert_eq!(is_break(PR, EB), false);
assert_eq!(is_break(PR, EM), false);
assert_eq!(is_break(ID, PO), false);
assert_eq!(is_break(EB, PO), false);
assert_eq!(is_break(EM, PO), false);
// Hangul jamo sequences.
assert_eq!(is_break(JL, JL), false);
assert_eq!(is_break(JL, JV), false);
assert_eq!(is_break(JL, H2), false);
assert_eq!(is_break(JL, IN), false);
assert_eq!(is_break(JL, PO), false);
assert_eq!(is_break(PR, JL), false);
// Alphabetic runs and the remaining pairs.
assert_eq!(is_break(AL, AL), false);
assert_eq!(is_break(HL, AL), false);
assert_eq!(is_break(IS, AL), false);
assert_eq!(is_break(IS, HL), false);
assert_eq!(is_break(EB, EM), false);
// Default: break between ideographs.
assert_eq!(is_break(ID, ID), true);
}
// End-to-end checks of the iterator over `str`, Latin-1 and UTF-16 input.
//
// Expected values are byte offsets for `segment_str` / `segment_latin1` and
// code-unit offsets for `segment_utf16`; every sequence starts with 0 and ends
// with the input length.
//
// Several inputs contain TWO consecutive spaces on purpose. The expected
// offsets and the parallel Latin-1/UTF-16 arrays only add up with both spaces
// present, so do not let a formatter or editor collapse them.
#[test]
fn linebreak() {
    let segmenter = LineSegmenter::try_new_dictionary_unstable(&Baked, Default::default())
        .expect("Data exists");
    let segmenter = segmenter.as_borrowed();

    let mut iter = segmenter.segment_str("hello world");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(6), iter.next());
    assert_eq!(Some(11), iter.next());
    assert_eq!(None, iter.next());

    iter = segmenter.segment_str("$10 $10");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(4), iter.next());
    assert_eq!(Some(7), iter.next());
    assert_eq!(None, iter.next());

    // Opening bracket followed by two spaces (10 bytes in total).
    iter = segmenter.segment_str("[  abc def");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(7), iter.next());
    assert_eq!(Some(10), iter.next());
    assert_eq!(None, iter.next());

    let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    let mut iter_u8 = segmenter.segment_latin1(&input);
    assert_eq!(Some(0), iter_u8.next());
    assert_eq!(Some(7), iter_u8.next());
    assert_eq!(Some(10), iter_u8.next());
    assert_eq!(None, iter_u8.next());

    let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(7), iter_u16.next());
    assert_eq!(Some(10), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // Closing quotation mark followed by two spaces and an opening bracket.
    iter = segmenter.segment_str("abc\u{0022}  (def");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(6), iter.next());
    assert_eq!(Some(10), iter.next());
    assert_eq!(None, iter.next());

    let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    let mut iter_u8 = segmenter.segment_latin1(&input);
    assert_eq!(Some(0), iter_u8.next());
    assert_eq!(Some(6), iter_u8.next());
    assert_eq!(Some(10), iter_u8.next());
    assert_eq!(None, iter_u8.next());

    let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(6), iter_u16.next());
    assert_eq!(Some(10), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // Guillemets with inner spaces: no break inside the quotation.
    iter = segmenter.segment_str("« miaou »");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(11), iter.next());
    assert_eq!(None, iter.next());

    let input: Vec<u8> = "« miaou »"
        .chars()
        .map(|c| u8::try_from(u32::from(c)).unwrap())
        .collect();
    let mut iter_u8 = segmenter.segment_latin1(&input);
    assert_eq!(Some(0), iter_u8.next());
    assert_eq!(Some(9), iter_u8.next());
    assert_eq!(None, iter_u8.next());

    let input: Vec<u16> = "« miaou »".encode_utf16().collect();
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(9), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // Guillemets used the other way round, without inner spaces.
    iter = segmenter.segment_str("Die Katze hat »miau« gesagt.");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(4), iter.next());
    assert_eq!(Some(10), iter.next());
    assert_eq!(Some(14), iter.next());
    assert_eq!(Some(23), iter.next());
    assert_eq!(Some(30), iter.next());
    assert_eq!(None, iter.next());

    let input: Vec<u8> = "Die Katze hat »miau« gesagt."
        .chars()
        .map(|c| u8::try_from(u32::from(c)).unwrap())
        .collect();
    let mut iter_u8 = segmenter.segment_latin1(&input);
    assert_eq!(Some(0), iter_u8.next());
    assert_eq!(Some(4), iter_u8.next());
    assert_eq!(Some(10), iter_u8.next());
    assert_eq!(Some(14), iter_u8.next());
    assert_eq!(Some(21), iter_u8.next());
    assert_eq!(Some(28), iter_u8.next());
    assert_eq!(None, iter_u8.next());

    let input: Vec<u16> = "Die Katze hat »miau« gesagt.".encode_utf16().collect();
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(4), iter_u16.next());
    assert_eq!(Some(10), iter_u16.next());
    assert_eq!(Some(14), iter_u16.next());
    assert_eq!(Some(21), iter_u16.next());
    assert_eq!(Some(28), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // Closing parenthesis followed by U+203C, directly and across two spaces.
    iter = segmenter.segment_str("\u{0029}\u{203C}");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(4), iter.next());
    assert_eq!(None, iter.next());

    // 1 + 2 + 3 = 6 bytes.
    iter = segmenter.segment_str("\u{0029}  \u{203C}");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(6), iter.next());
    assert_eq!(None, iter.next());

    let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(4), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // Em dashes (B2), adjacent and separated by two spaces.
    iter = segmenter.segment_str("\u{2014}\u{2014}aa");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(6), iter.next());
    assert_eq!(Some(8), iter.next());
    assert_eq!(None, iter.next());

    // 3 + 2 + 3 = 8 bytes before "aa".
    iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(8), iter.next());
    assert_eq!(Some(10), iter.next());
    assert_eq!(None, iter.next());

    // 6 + 2 + 6 = 14 bytes before "123".
    iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(14), iter.next());
    assert_eq!(Some(18), iter.next());
    assert_eq!(Some(21), iter.next());
    assert_eq!(None, iter.next());

    // Numeric expression: no break anywhere inside.
    let mut iter = segmenter.segment_str("(0,1)+(2,3)");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(11), iter.next());
    assert_eq!(None, iter.next());

    let input: [u16; 11] = [
        0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
    ];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(11), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    let input: [u16; 13] = [
        0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
    ];
    let mut iter_u16 = segmenter.segment_utf16(&input);
    assert_eq!(Some(0), iter_u16.next());
    assert_eq!(Some(6), iter_u16.next());
    assert_eq!(Some(10), iter_u16.next());
    assert_eq!(Some(13), iter_u16.next());
    assert_eq!(None, iter_u16.next());

    // Emoji modifiers separated by a single space (4 + 1 + 4 bytes).
    iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
    assert_eq!(Some(0), iter.next());
    assert_eq!(Some(5), iter.next());
    assert_eq!(Some(9), iter.next());
    assert_eq!(None, iter.next());
}
// Thai text segmented with the LSTM model. The same text is checked as `str`
// (byte offsets) and as UTF-16 (code-unit offsets), followed by a short
// four-unit UTF-16 input that yields only the start and end boundaries.
#[test]
#[cfg(feature = "lstm")]
fn thai_line_break() {
const TEST_STR: &str = "ภาษาไทยภาษาไทย";
let segmenter = LineSegmenter::new_lstm(Default::default());
let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");
let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");
let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
assert_eq!(breaks, [0, 4], "Thai test");
}
// Burmese text segmented with the LSTM model, checked as `str` (byte offsets)
// and as UTF-16 (code-unit offsets).
#[test]
#[cfg(feature = "lstm")]
fn burmese_line_break() {
const TEST_STR: &str = "မြန်မာဘာသာစကား";
let segmenter = LineSegmenter::new_lstm(Default::default());
let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test");
let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test");
}
// Khmer text segmented with the LSTM model, checked as `str` (byte offsets)
// and as UTF-16 (code-unit offsets).
#[test]
#[cfg(feature = "lstm")]
fn khmer_line_break() {
const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";
let segmenter = LineSegmenter::new_lstm(Default::default());
let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test");
let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
assert_eq!(
breaks,
[0, 13, 16, 18, 24, utf16.len()],
"Khmer utf-16 test"
);
}
// Lao text segmented with the LSTM model, checked as `str` (byte offsets) and
// as UTF-16 (code-unit offsets).
#[test]
#[cfg(feature = "lstm")]
fn lao_line_break() {
const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";
let segmenter = LineSegmenter::new_lstm(Default::default());
let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test");
let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test");
}
// An empty input yields exactly one boundary, at offset 0.
// NOTE(review): `new_auto` is only defined with the `auto` and `compiled_data`
// features, while this test is gated only by the module-level `serde` cfg —
// confirm the test is always built with those features enabled.
#[test]
fn empty_string() {
let segmenter = LineSegmenter::new_auto(Default::default());
let breaks: Vec<usize> = segmenter.segment_str("").collect();
assert_eq!(breaks, [0]);
}
}