use crate::complex::*;
use crate::indices::{Latin1Indices, Utf16Indices};
use crate::iterator_helpers::derive_usize_iterator_with_type;
use crate::provider::*;
use crate::rule_segmenter::*;
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;
use icu_locale_core::LanguageIdentifier;
use icu_provider::prelude::*;
use utf8_iter::Utf8CharIndices;
/// Options for configuring word segmentation.
///
/// The struct is `#[non_exhaustive]`, so code outside this crate cannot build it
/// with a struct literal; start from a default value and assign the fields.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakOptions<'a> {
/// Optional locale of the content to segment. When this is `Some`, the
/// data-provider constructors of `WordSegmenter` additionally request
/// locale-specific override data for it; when `None`, no override is requested.
pub content_locale: Option<&'a LanguageIdentifier>,
/// Options that do not depend on the content locale.
pub invariant_options: WordBreakInvariantOptions,
}
impl WordBreakOptions<'_> {
    /// Returns the default options: no content locale and default invariant options.
    ///
    /// This inherent function is `const`, unlike the derived `Default::default`.
    pub const fn default() -> Self {
        let invariant_options = WordBreakInvariantOptions::default();
        Self {
            content_locale: None,
            invariant_options,
        }
    }
}
/// Locale-independent options for word segmentation.
///
/// The struct currently has no fields. It is `#[non_exhaustive]`, so code outside
/// this crate must obtain a value through `default()` rather than a struct literal.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakInvariantOptions {}

impl WordBreakInvariantOptions {
    /// Returns the default (empty) set of invariant options.
    ///
    /// This inherent function is `const`, unlike the derived `Default::default`.
    pub const fn default() -> Self {
        WordBreakInvariantOptions {}
    }
}
/// Iterator over word boundary positions of an input text.
///
/// This is a newtype around [`RuleBreakIterator`]; the `Y` parameter selects the
/// input encoding (see the `segment_*` methods of `WordSegmenterBorrowed`).
#[derive(Debug)]
pub struct WordBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);
// Derives the iterator implementation (items are `usize` positions) for the newtype.
derive_usize_iterator_with_type!(WordBreakIterator, 'data);
// NOTE(review): the enum lives in its own module, presumably to isolate the code
// generated by `#[zerovec::make_ule]` — confirm before flattening.
pub(crate) mod inner {
/// Word type tag reported by `WordBreakIterator::word_type`.
///
/// The discriminants are explicit because the enum is `#[repr(u8)]` and is
/// given a ULE representation (`WordTypeULE`) by `#[zerovec::make_ule]`.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Debug)]
#[repr(u8)]
#[zerovec::make_ule(WordTypeULE)]
pub enum WordType {
/// No category tag.
None = 0,
/// Number category tag.
Number = 1,
/// Letter category tag.
Letter = 2,
}
}
// Re-export so the type is reachable as `WordType` from this module.
pub use inner::WordType;
impl WordType {
    /// Returns `true` for every word type except [`WordType::None`].
    pub fn is_word_like(&self) -> bool {
        !matches!(self, WordType::None)
    }
}
impl<'data, 's, Y: RuleBreakType> WordBreakIterator<'data, 's, Y> {
#[inline]
pub fn word_type(&self) -> WordType {
self.0.word_type()
}
pub fn iter_with_word_type(self) -> WordBreakIteratorWithWordType<'data, 's, Y> {
WordBreakIteratorWithWordType(self)
}
#[inline]
pub fn is_word_like(&self) -> bool {
self.word_type().is_word_like()
}
}
/// Iterator adapter around [`WordBreakIterator`] that yields each boundary
/// position together with its [`WordType`].
///
/// Obtained from `WordBreakIterator::iter_with_word_type`.
#[derive(Debug)]
pub struct WordBreakIteratorWithWordType<'data, 's, Y: RuleBreakType>(
WordBreakIterator<'data, 's, Y>,
);
impl<Y: RuleBreakType> Iterator for WordBreakIteratorWithWordType<'_, '_, Y> {
    type Item = (usize, WordType);

    /// Advances the wrapped iterator and pairs the boundary it produced with the
    /// word type reported immediately afterwards. Returns `None` once the wrapped
    /// iterator is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let Self(word_iter) = self;
        let boundary = word_iter.next()?;
        Some((boundary, word_iter.word_type()))
    }
}
/// Owned word segmenter: holds the data payloads needed to segment text.
///
/// Use [`WordSegmenter::as_borrowed`] to obtain a [`WordSegmenterBorrowed`],
/// which provides the `segment_*` methods.
#[derive(Debug)]
pub struct WordSegmenter {
/// Word-break rule data.
payload: DataPayload<SegmenterBreakWordV1>,
/// Payloads used for complex scripts (LSTM and/or dictionary data, depending on
/// which constructor or `load_*` method was used).
complex: ComplexPayloads,
/// Locale-specific override data; `None` when no content locale was given or
/// when the provider had no override for it.
payload_locale_override: Option<DataPayload<SegmenterBreakWordOverrideV1>>,
}
/// Borrowed, `Copy` view of the data used for word segmentation.
///
/// Produced by [`WordSegmenter::as_borrowed`] or directly by the compiled-data
/// constructors on `WordSegmenter`; the `segment_*` methods live on this type.
#[derive(Clone, Debug, Copy)]
pub struct WordSegmenterBorrowed<'data> {
/// Word-break rule data.
data: &'data RuleBreakData<'data>,
/// Borrowed payloads used for complex scripts.
complex: ComplexPayloadsBorrowed<'data>,
/// Optional locale-specific override data.
locale_override: Option<&'data RuleBreakDataOverride<'data>>,
}
impl WordSegmenter {
/// Builds a borrowed segmenter from compiled data.
///
/// For complex scripts the segmenter is set up by calling
/// `with_southeast_asian_lstms` and `with_japanese_dictionary` on the complex
/// payloads. No locale override is used, and `_options` is currently ignored.
#[cfg(feature = "compiled_data")]
#[cfg(feature = "auto")]
pub fn new_auto(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
    let complex = {
        let mut payloads = ComplexPayloadsBorrowed::new();
        payloads.with_southeast_asian_lstms();
        payloads.with_japanese_dictionary();
        payloads
    };
    WordSegmenterBorrowed {
        data: Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
        locale_override: None,
        complex,
    }
}
// Expands to the `try_new_auto` and `try_new_auto_with_buffer_provider`
// constructors, which forward to `try_new_auto_unstable` below.
#[cfg(feature = "auto")]
icu_provider::gen_buffer_data_constructors!(
    (options: WordBreakOptions) -> error: DataError,
    functions: [
        try_new_auto,
        try_new_auto_with_buffer_provider,
        try_new_auto_unstable,
        Self
    ]
);

// Data-provider counterpart of `new_auto`.
//
// Loads, in this order: the complex payloads (base data, Southeast Asian LSTMs,
// Japanese dictionary), the word-break rule payload, and - only when
// `options.content_locale` is set - the locale override. A missing override for
// the requested locale is not an error; it simply leaves the override unset.
#[cfg(feature = "auto")]
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
pub fn try_new_auto_unstable<D>(
    provider: &D,
    options: WordBreakOptions,
) -> Result<Self, DataError>
where
    D: DataProvider<SegmenterBreakWordV1>
        + DataProvider<SegmenterBreakWordOverrideV1>
        + DataProvider<SegmenterDictionaryAutoV1>
        + DataProvider<SegmenterLstmAutoV1>
        + DataProvider<SegmenterBreakGraphemeClusterV1>
        + ?Sized,
{
    let mut complex = ComplexPayloads::try_new(provider)?;
    complex.with_southeast_asian_lstms(provider)?;
    complex.with_japanese_dictionary(provider)?;

    let payload: DataPayload<SegmenterBreakWordV1> = provider.load(Default::default())?.payload;

    let payload_locale_override: Option<DataPayload<SegmenterBreakWordOverrideV1>> =
        match options.content_locale {
            None => None,
            Some(language) => {
                let locale = DataLocale::from(language);
                // Mark the request as silent: a missing override is expected and is
                // handled below rather than treated as a failure.
                let mut metadata = DataRequestMetadata::default();
                metadata.silent = true;
                let req = DataRequest {
                    id: DataIdentifierBorrowed::for_locale(&locale),
                    metadata,
                };
                provider
                    .load(req)
                    .allow_identifier_not_found()?
                    .map(|response| response.payload)
            }
        };

    Ok(Self {
        payload,
        complex,
        payload_locale_override,
    })
}
/// Builds a borrowed segmenter from compiled data, starting from
/// [`Self::new_for_non_complex_scripts`] and then calling `load_lstm` on it.
#[cfg(feature = "compiled_data")]
#[cfg(feature = "lstm")]
pub fn new_lstm(options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
    let mut segmenter = Self::new_for_non_complex_scripts(options);
    segmenter.load_lstm();
    segmenter
}
// Expands to the `try_new_lstm` and `try_new_lstm_with_buffer_provider`
// constructors, which forward to `try_new_lstm_unstable` below.
#[cfg(feature = "lstm")]
icu_provider::gen_buffer_data_constructors!(
    (options: WordBreakOptions) -> error: DataError,
    functions: [
        try_new_lstm,
        try_new_lstm_with_buffer_provider,
        try_new_lstm_unstable,
        Self
    ]
);

// Data-provider counterpart of `new_lstm`: builds the non-complex-script
// segmenter first, then loads the LSTM data into it from the same provider.
#[cfg(feature = "lstm")]
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
pub fn try_new_lstm_unstable<D>(
    provider: &D,
    options: WordBreakOptions,
) -> Result<Self, DataError>
where
    D: DataProvider<SegmenterBreakWordV1>
        + DataProvider<SegmenterBreakWordOverrideV1>
        + DataProvider<SegmenterLstmAutoV1>
        + DataProvider<SegmenterBreakGraphemeClusterV1>
        + ?Sized,
{
    let mut segmenter = Self::try_new_for_non_complex_scripts_unstable(provider, options)?;
    segmenter.load_lstm_unstable(provider)?;
    Ok(segmenter)
}
/// Builds a borrowed segmenter from compiled data, starting from
/// [`Self::new_for_non_complex_scripts`] and then calling `load_dictionary` on it.
#[cfg(feature = "compiled_data")]
pub fn new_dictionary(options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
    let mut segmenter = Self::new_for_non_complex_scripts(options);
    segmenter.load_dictionary();
    segmenter
}
// Expands to the `try_new_dictionary` and
// `try_new_dictionary_with_buffer_provider` constructors, which forward to
// `try_new_dictionary_unstable` below.
icu_provider::gen_buffer_data_constructors!(
    (options: WordBreakOptions) -> error: DataError,
    functions: [
        try_new_dictionary,
        try_new_dictionary_with_buffer_provider,
        try_new_dictionary_unstable,
        Self
    ]
);

// Data-provider counterpart of `new_dictionary`: builds the non-complex-script
// segmenter first, then loads the dictionary data into it from the same provider.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
pub fn try_new_dictionary_unstable<D>(
    provider: &D,
    options: WordBreakOptions,
) -> Result<Self, DataError>
where
    D: DataProvider<SegmenterBreakWordV1>
        + DataProvider<SegmenterBreakWordOverrideV1>
        + DataProvider<SegmenterDictionaryAutoV1>
        + DataProvider<SegmenterDictionaryExtendedV1>
        + DataProvider<SegmenterBreakGraphemeClusterV1>
        + ?Sized,
{
    let mut segmenter = Self::try_new_for_non_complex_scripts_unstable(provider, options)?;
    segmenter.load_dictionary_unstable(provider)?;
    Ok(segmenter)
}
/// Builds a borrowed segmenter from compiled data with freshly created
/// (`ComplexPayloadsBorrowed::new()`) complex payloads and no locale override.
///
/// `_options` is currently ignored. Complex-script data can be added afterwards
/// with `load_lstm` / `load_dictionary` on the returned value.
#[cfg(feature = "compiled_data")]
pub const fn new_for_non_complex_scripts(
    _options: WordBreakInvariantOptions,
) -> WordSegmenterBorrowed<'static> {
    let complex = ComplexPayloadsBorrowed::new();
    WordSegmenterBorrowed {
        data: Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
        locale_override: None,
        complex,
    }
}
icu_provider::gen_buffer_data_constructors!(
(options: WordBreakOptions) -> error: DataError,
functions: [
try_new_for_non_complex_scripts,
try_new_for_non_complex_scripts_with_buffer_provider,
try_new_for_non_complex_scripts_unstable,
Self
]
);
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
pub fn try_new_for_non_complex_scripts_unstable<D>(
provider: &D,
options: WordBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SegmenterBreakWordV1>
+ DataProvider<SegmenterBreakWordOverrideV1>
+ DataProvider<SegmenterBreakGraphemeClusterV1>
+ ?Sized,
{
Ok(Self {
payload: provider.load(Default::default())?.payload,
complex: ComplexPayloads::try_new(provider)?,
payload_locale_override: if let Some(locale) = options.content_locale {
let locale = DataLocale::from(locale);
let req = DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale),
metadata: {
let mut metadata = DataRequestMetadata::default();
metadata.silent = true;
metadata
},
};
provider
.load(req)
.allow_identifier_not_found()?
.map(|r| r.payload)
} else {
None
},
})
}
/// Loads the Southeast Asian LSTM data from `provider` into this segmenter's
/// complex payloads (via `with_southeast_asian_lstms`).
///
/// # Errors
///
/// Propagates the error returned while loading the LSTM data.
#[cfg(feature = "lstm")]
pub fn load_lstm_unstable<D>(&mut self, provider: &D) -> Result<(), DataError>
where
D: DataProvider<SegmenterLstmAutoV1> + ?Sized,
{
self.complex.with_southeast_asian_lstms(provider)?;
Ok(())
}
/// Buffer-provider variant of [`Self::load_lstm_unstable`]: wraps `provider` with
/// `as_deserializing()` and delegates.
#[cfg(feature = "serde")]
#[cfg(feature = "lstm")]
pub fn load_lstm_with_buffer_provider(
    &mut self,
    provider: &(impl BufferProvider + ?Sized),
) -> Result<(), DataError> {
    let deserializing = provider.as_deserializing();
    self.load_lstm_unstable(&deserializing)
}
/// Loads dictionary data from `provider` into this segmenter's complex payloads:
/// first the Southeast Asian dictionaries, then the Japanese dictionary.
///
/// # Errors
///
/// Returns the first error encountered; if the Southeast Asian dictionaries fail
/// to load, the Japanese dictionary is not attempted.
pub fn load_dictionary_unstable<D>(&mut self, provider: &D) -> Result<(), DataError>
where
D: DataProvider<SegmenterDictionaryAutoV1>
+ DataProvider<SegmenterDictionaryExtendedV1>
+ ?Sized,
{
self.complex.with_southeast_asian_dictionaries(provider)?;
self.complex.with_japanese_dictionary(provider)?;
Ok(())
}
/// Buffer-provider variant of [`Self::load_dictionary_unstable`]: wraps `provider`
/// with `as_deserializing()` and delegates.
#[cfg(feature = "serde")]
pub fn load_dictionary_with_buffer_provider(
    &mut self,
    provider: &(impl BufferProvider + ?Sized),
) -> Result<(), DataError> {
    let deserializing = provider.as_deserializing();
    self.load_dictionary_unstable(&deserializing)
}
pub fn as_borrowed(&self) -> WordSegmenterBorrowed<'_> {
WordSegmenterBorrowed {
data: self.payload.get(),
complex: self.complex.as_borrowed(),
locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
}
}
}
impl<'data> WordSegmenterBorrowed<'data> {
    /// Creates a word break iterator over a `str`.
    ///
    /// The iterator starts with an empty result cache and a zero boundary
    /// property, and reports `input.len()` as the text length.
    pub fn segment_str<'s>(self, input: &'s str) -> WordBreakIterator<'data, 's, Utf8> {
        let Self {
            data,
            complex,
            locale_override,
        } = self;
        WordBreakIterator(RuleBreakIterator {
            iter: input.char_indices(),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            complex: Some(complex),
            boundary_property: 0,
            locale_override,
            handle_complex_language: Utf8::word_handle_complex_language,
        })
    }

    /// Creates a word break iterator over a byte slice that is decoded with
    /// `Utf8CharIndices` and is not required to be well-formed UTF-8.
    pub fn segment_utf8<'s>(
        self,
        input: &'s [u8],
    ) -> WordBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
        let Self {
            data,
            complex,
            locale_override,
        } = self;
        WordBreakIterator(RuleBreakIterator {
            iter: Utf8CharIndices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            complex: Some(complex),
            boundary_property: 0,
            locale_override,
            handle_complex_language: PotentiallyIllFormedUtf8::word_handle_complex_language,
        })
    }

    /// Creates a word break iterator over a Latin-1 (one byte per character)
    /// byte slice.
    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> WordBreakIterator<'data, 's, Latin1> {
        let Self {
            data,
            complex,
            locale_override,
        } = self;
        WordBreakIterator(RuleBreakIterator {
            iter: Latin1Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            complex: Some(complex),
            boundary_property: 0,
            locale_override,
            handle_complex_language: Latin1::word_handle_complex_language,
        })
    }

    /// Creates a word break iterator over a UTF-16 slice; the reported text
    /// length is the number of `u16` units.
    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> WordBreakIterator<'data, 's, Utf16> {
        let Self {
            data,
            complex,
            locale_override,
        } = self;
        WordBreakIterator(RuleBreakIterator {
            iter: Utf16Indices::new(input),
            len: input.len(),
            current_pos_data: None,
            result_cache: Vec::new(),
            data,
            complex: Some(complex),
            boundary_property: 0,
            locale_override,
            handle_complex_language: Utf16::word_handle_complex_language,
        })
    }
}
impl WordSegmenterBorrowed<'static> {
    /// Adds the compiled Southeast Asian LSTM data to this segmenter's complex
    /// payloads.
    #[cfg(feature = "lstm")]
    #[cfg(feature = "compiled_data")]
    pub fn load_lstm(&mut self) {
        let complex = &mut self.complex;
        complex.with_southeast_asian_lstms();
    }

    /// Adds the compiled dictionary data to this segmenter's complex payloads:
    /// first the Southeast Asian dictionaries, then the Japanese dictionary.
    #[cfg(feature = "compiled_data")]
    pub fn load_dictionary(&mut self) {
        let complex = &mut self.complex;
        complex.with_southeast_asian_dictionaries();
        complex.with_japanese_dictionary();
    }

    /// Converts this `'static` borrowed segmenter into an owned [`WordSegmenter`]
    /// by wrapping each static reference in a payload.
    pub fn static_to_owned(self) -> WordSegmenter {
        let Self {
            data,
            complex,
            locale_override,
        } = self;
        let payload_locale_override = locale_override.map(DataPayload::from_static_ref);
        WordSegmenter {
            payload: DataPayload::from_static_ref(data),
            complex: complex.static_to_owned(),
            payload_locale_override,
        }
    }
}
/// Sealed trait implemented by the input-encoding marker types (`Utf8`,
/// `PotentiallyIllFormedUtf8`, `Latin1`, `Utf16`) that the word segmenter supports.
///
/// Because of the `crate::private::Sealed` supertrait it is not meant to be
/// implemented outside this crate.
pub trait WordBreakType: crate::private::Sealed + Sized + RuleBreakType {
/// Encoding-specific handler for a run of complex-script text.
///
/// `left_codepoint` is the code point that precedes the iterator's current
/// position. Returns the next break position, or `None` when none can be
/// determined.
#[doc(hidden)]
fn word_handle_complex_language(
iterator: &mut RuleBreakIterator<'_, '_, Self>,
left_codepoint: Self::CharType,
) -> Option<usize>;
}
impl WordBreakType for Utf8 {
    /// Delegates to the shared `char`-based handler.
    fn word_handle_complex_language(
        iter: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8::<Self>(iter, left_codepoint)
    }
}
impl WordBreakType for PotentiallyIllFormedUtf8 {
    /// Delegates to the shared `char`-based handler.
    fn word_handle_complex_language(
        iter: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8::<Self>(iter, left_codepoint)
    }
}
impl WordBreakType for Latin1 {
/// Latin-1 input is not expected to reach the complex-language path: this
/// asserts in debug builds and returns `None` otherwise.
fn word_handle_complex_language(
_iter: &mut RuleBreakIterator<'_, '_, Self>,
_left_codepoint: Self::CharType,
) -> Option<usize> {
debug_assert!(
false,
"latin-1 text should never need complex language handling"
);
None
}
}
/// Shared complex-language handler for the `char`-based encodings.
///
/// Collects the run of code points that starts with `left_codepoint` and
/// continues while the iterator's current break property equals
/// `iter.data.complex_property`, segments that run with the complex payloads,
/// caches the resulting breaks in `iter.result_cache`, and moves the iterator to
/// the first of them.
///
/// Returns the iterator's position at the first break, `Some(iter.len)` if the
/// end of input is reached first, or `None` if no current code point is available
/// while collecting or the complex segmenter produced no breaks.
fn handle_complex_language_utf8<T>(
iter: &mut RuleBreakIterator<'_, '_, T>,
left_codepoint: T::CharType,
) -> Option<usize>
where
T: RuleBreakType<CharType = char>,
{
// Remember where we are so the iterator can be rewound after the look-ahead.
let start_iter = iter.iter.clone();
let start_point = iter.current_pos_data;
// Text of the complex run, beginning with the code point to the left.
let mut s = String::new();
s.push(left_codepoint);
// Look ahead: keep appending code points while they carry the complex property.
loop {
debug_assert!(!iter.is_eof());
s.push(iter.get_current_codepoint()?);
iter.advance_iter();
if let Some(current_break_property) = iter.get_current_break_property() {
if current_break_property != iter.data.complex_property {
break;
}
} else {
// No break property available: stop collecting.
break;
}
}
// Rewind to the position saved before the look-ahead.
iter.iter = start_iter;
iter.current_pos_data = start_point;
// `unwrap`: the `segment_*` constructors in this file always set `complex` to `Some`.
#[expect(clippy::unwrap_used)] let breaks = iter.complex.unwrap().complex_language_segment_str(&s);
iter.result_cache = breaks;
// No breaks reported for the run: nothing to return.
let first_pos = *iter.result_cache.first()?;
// `i` is the offset into `s` matching the iterator's current position; `s`
// starts with `left_codepoint`, so the offset starts at its UTF-8 length.
let mut i = left_codepoint.len_utf8();
loop {
if i == first_pos {
// Drop the break being returned now and rebase the remaining ones so they
// are relative to it.
iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
return iter.get_current_position();
}
debug_assert!(
i < first_pos,
"we should always arrive at first_pos: near index {:?}",
iter.get_current_position()
);
// Step over one code point, tracking the offset within `s`.
i += iter.get_current_codepoint().map_or(0, T::char_len);
iter.advance_iter();
if iter.is_eof() {
// Reached the end of the input before the first break: discard the cached
// breaks and report the end of the text.
iter.result_cache.clear();
return Some(iter.len);
}
}
}
impl WordBreakType for Utf16 {
/// UTF-16 complex-language handler.
///
/// Follows the same steps as `handle_complex_language_utf8`, but collects the
/// run as `u16` units, segments it with `complex_language_segment_utf16`, and
/// counts offsets one unit per iterator step.
///
/// Returns the iterator's position at the first break, `Some(iter.len)` if the
/// end of input is reached first, or `None` if no current code point is
/// available while collecting or the complex segmenter produced no breaks.
fn word_handle_complex_language(
iter: &mut RuleBreakIterator<Self>,
left_codepoint: Self::CharType,
) -> Option<usize> {
// Remember where we are so the iterator can be rewound after the look-ahead.
let start_iter = iter.iter.clone();
let start_point = iter.current_pos_data;
// NOTE(review): `as u16` here and below would truncate a value above 0xFFFF, and
// the walk further down advances the offset by exactly 1 per step. Confirm that
// supplementary-plane code points can never be part of a complex run here.
let mut s = vec![left_codepoint as u16];
// Look ahead: keep appending units while they carry the complex property.
loop {
debug_assert!(!iter.is_eof());
s.push(iter.get_current_codepoint()? as u16);
iter.advance_iter();
if let Some(current_break_property) = iter.get_current_break_property() {
if current_break_property != iter.data.complex_property {
break;
}
} else {
// No break property available: stop collecting.
break;
}
}
// Rewind to the position saved before the look-ahead.
iter.iter = start_iter;
iter.current_pos_data = start_point;
// `unwrap`: the `segment_*` constructors in this file always set `complex` to `Some`.
#[expect(clippy::unwrap_used)] let breaks = iter.complex.unwrap().complex_language_segment_utf16(&s);
iter.result_cache = breaks;
// No breaks reported for the run: nothing to return.
let first_pos = *iter.result_cache.first()?;
// `i` is the offset into `s` matching the iterator's current position; `s`
// starts with the single unit pushed for `left_codepoint`.
let mut i = 1;
loop {
if i == first_pos {
// Drop the break being returned now and rebase the remaining ones so they
// are relative to it.
iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
return iter.get_current_position();
}
debug_assert!(
i < first_pos,
"we should always arrive at first_pos: near index {:?}",
iter.get_current_position()
);
i += 1;
iter.advance_iter();
if iter.is_eof() {
// Reached the end of the input before the first break: discard the cached
// breaks and report the end of the text.
iter.result_cache.clear();
return Some(iter.len);
}
}
}
}
// Segmenting the empty string must yield exactly one boundary, at index 0.
//
// `WordSegmenter::new_auto` only exists when both `compiled_data` and `auto` are
// enabled, so the test is gated on those features. It was previously gated on
// `serde`, which the test does not use and which does not guarantee that
// `new_auto` is available.
#[cfg(all(test, feature = "compiled_data", feature = "auto"))]
#[test]
fn empty_string() {
    let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}