use alloc::vec::Vec;
use icu_locale_core::LanguageIdentifier;
use icu_provider::prelude::*;
use crate::indices::{Latin1Indices, Utf16Indices};
use crate::iterator_helpers::derive_usize_iterator_with_type;
use crate::provider::*;
use crate::rule_segmenter::*;
use utf8_iter::Utf8CharIndices;
/// Options accepted by the data-loading constructors of [`SentenceSegmenter`].
///
/// The type is `#[non_exhaustive]`, so code outside this crate has to start from
/// [`Default::default()`] and then assign the fields it cares about.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct SentenceBreakOptions<'a> {
    /// Locale of the content that will be segmented.
    ///
    /// When this is `Some`, [`SentenceSegmenter::try_new_unstable`] additionally tries to load
    /// locale-specific override data for it; when it is `None`, no override data is requested.
    pub content_locale: Option<&'a LanguageIdentifier>,
    /// The locale-independent part of the options.
    pub invariant_options: SentenceBreakInvariantOptions,
}
/// Locale-independent options for sentence segmentation.
///
/// There are currently no fields. The type is `#[non_exhaustive]`, so code outside this crate
/// obtains a value through [`Default::default()`].
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct SentenceBreakInvariantOptions {}
/// Iterator over sentence break positions, as returned by the `segment_*` methods of
/// [`SentenceSegmenterBorrowed`].
///
/// This is a thin newtype around [`RuleBreakIterator`].
///
/// - `'data` is the lifetime of the segmentation data borrowed from the segmenter.
/// - `'s` is the lifetime of the input being segmented.
/// - `Y` selects the input representation; this file instantiates it with `Utf8`,
///   `PotentiallyIllFormedUtf8`, `Latin1` and `Utf16`.
#[derive(Debug)]
pub struct SentenceBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);
// NOTE(review): presumably implements `Iterator<Item = usize>` for the newtype by delegating to
// the wrapped iterator (callers collect it into `Vec<usize>`) — confirm against the macro
// definition in `crate::iterator_helpers`.
derive_usize_iterator_with_type!(SentenceBreakIterator, 'data);
/// Owned sentence segmenter.
///
/// It holds the data payloads needed for segmentation. To actually segment text, obtain a
/// [`SentenceSegmenterBorrowed`] with [`SentenceSegmenter::as_borrowed`] and call one of its
/// `segment_*` methods.
#[derive(Debug)]
pub struct SentenceSegmenter {
    /// Base sentence break rule data.
    payload: DataPayload<SegmenterBreakSentenceV1>,
    /// Locale-specific override data; `None` when no content locale was given or when the
    /// provider had no override data for it.
    payload_locale_override: Option<DataPayload<SegmenterBreakSentenceOverrideV1>>,
}
/// Borrowed, `Copy` view of the data used for sentence segmentation.
///
/// Values of this type are produced by `SentenceSegmenter::new` (compiled data) and by
/// [`SentenceSegmenter::as_borrowed`]. The `segment_*` methods take `self` by value, which is
/// cheap because the struct only contains references.
#[derive(Clone, Copy, Debug)]
pub struct SentenceSegmenterBorrowed<'data> {
    /// Base sentence break rule data.
    data: &'data RuleBreakData<'data>,
    /// Optional locale-specific override data.
    locale_override: Option<&'data RuleBreakDataOverride<'data>>,
}
impl SentenceSegmenter {
    /// Constructs a sentence segmenter backed by compiled (baked) data, without any
    /// locale-specific override.
    ///
    /// The options argument is currently unused. Note that this returns a
    /// [`SentenceSegmenterBorrowed<'static>`] rather than `Self`; call
    /// [`SentenceSegmenterBorrowed::static_to_owned`] on the result if an owned
    /// [`SentenceSegmenter`] is required.
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub const fn new(
        _options: SentenceBreakInvariantOptions,
    ) -> SentenceSegmenterBorrowed<'static> {
        let data = Baked::SINGLETON_SEGMENTER_BREAK_SENTENCE_V1;
        SentenceSegmenterBorrowed {
            data,
            locale_override: None,
        }
    }

    // Generates the `try_new` and `try_new_with_buffer_provider` constructors named in the
    // list below; per the macro arguments they are layered on top of `try_new_unstable`.
    icu_provider::gen_buffer_data_constructors!(
        (options: SentenceBreakOptions) -> error: DataError,
        functions: [
            try_new,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self
        ]
    );

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
    pub fn try_new_unstable<D>(
        provider: &D,
        options: SentenceBreakOptions,
    ) -> Result<Self, DataError>
    where
        D: DataProvider<SegmenterBreakSentenceV1>
            + DataProvider<SegmenterBreakSentenceOverrideV1>
            + ?Sized,
    {
        // Base rule data: always required, so any load error is propagated to the caller.
        // Which marker is loaded is inferred from the type of the `payload` field.
        let payload = provider.load(Default::default())?.payload;

        // Override data is only requested when the caller supplied a content locale.
        let payload_locale_override = match options.content_locale {
            Some(content_locale) => {
                let data_locale = DataLocale::from(content_locale);
                // Request metadata starts from its defaults and is marked `silent`.
                // NOTE(review): `silent` presumably keeps the provider from logging the
                // (expected) miss — confirm against the `icu_provider` documentation.
                let mut metadata = DataRequestMetadata::default();
                metadata.silent = true;
                let request = DataRequest {
                    id: DataIdentifierBorrowed::for_locale(&data_locale),
                    metadata,
                };
                // An identifier-not-found result is tolerated and becomes `None`;
                // every other error is still propagated.
                provider
                    .load(request)
                    .allow_identifier_not_found()?
                    .map(|response| response.payload)
            }
            None => None,
        };

        Ok(Self {
            payload,
            payload_locale_override,
        })
    }

    /// Returns a cheap, `Copy` view that borrows this segmenter's data and offers the
    /// `segment_*` methods.
    pub fn as_borrowed(&self) -> SentenceSegmenterBorrowed<'_> {
        let locale_override = self
            .payload_locale_override
            .as_ref()
            .map(|override_payload| override_payload.get());
        SentenceSegmenterBorrowed {
            data: self.payload.get(),
            locale_override,
        }
    }
}
impl<'data> SentenceSegmenterBorrowed<'data> {
    /// Creates a sentence break iterator over a `str` (well-formed UTF-8).
    ///
    /// The iterator yields break positions as `usize` values. Sentence segmentation installs
    /// no complex-language handling: `complex` is `None` and the handler is
    /// `empty_handle_complex_language`.
    pub fn segment_str<'s>(self, input: &'s str) -> SentenceBreakIterator<'data, 's, Utf8> {
        let Self {
            data,
            locale_override,
        } = self;
        SentenceBreakIterator(RuleBreakIterator {
            iter: input.char_indices(),
            len: input.len(),
            data,
            locale_override,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
            handle_complex_language: empty_handle_complex_language,
        })
    }

    /// Creates a sentence break iterator over a byte slice holding potentially ill-formed
    /// UTF-8.
    ///
    /// The bytes are decoded with [`Utf8CharIndices`]; apart from that, the iterator is set up
    /// exactly as in [`Self::segment_str`].
    pub fn segment_utf8<'s>(
        self,
        input: &'s [u8],
    ) -> SentenceBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
        let Self {
            data,
            locale_override,
        } = self;
        SentenceBreakIterator(RuleBreakIterator {
            iter: Utf8CharIndices::new(input),
            len: input.len(),
            data,
            locale_override,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
            handle_complex_language: empty_handle_complex_language,
        })
    }

    /// Creates a sentence break iterator over a Latin-1 (8-bit) byte slice.
    ///
    /// The input is walked with [`Latin1Indices`]; apart from that, the iterator is set up
    /// exactly as in [`Self::segment_str`].
    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> SentenceBreakIterator<'data, 's, Latin1> {
        let Self {
            data,
            locale_override,
        } = self;
        SentenceBreakIterator(RuleBreakIterator {
            iter: Latin1Indices::new(input),
            len: input.len(),
            data,
            locale_override,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
            handle_complex_language: empty_handle_complex_language,
        })
    }

    /// Creates a sentence break iterator over a slice of UTF-16 code units.
    ///
    /// The input is walked with [`Utf16Indices`], and `len` is the number of `u16` units in
    /// the slice; apart from that, the iterator is set up exactly as in [`Self::segment_str`].
    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> SentenceBreakIterator<'data, 's, Utf16> {
        let Self {
            data,
            locale_override,
        } = self;
        SentenceBreakIterator(RuleBreakIterator {
            iter: Utf16Indices::new(input),
            len: input.len(),
            data,
            locale_override,
            current_pos_data: None,
            result_cache: Vec::new(),
            complex: None,
            boundary_property: 0,
            handle_complex_language: empty_handle_complex_language,
        })
    }
}
impl SentenceSegmenterBorrowed<'static> {
    /// Converts a [`SentenceSegmenterBorrowed<'static>`] into an owned [`SentenceSegmenter`]
    /// by wrapping its `'static` references in data payloads.
    ///
    /// The locale override, when present, is carried over as well.
    pub const fn static_to_owned(self) -> SentenceSegmenter {
        // Spelled out as a `match` instead of `Option::map`: this is a `const fn`, where a
        // closure cannot be called.
        let payload_locale_override = match self.locale_override {
            Some(override_data) => Some(DataPayload::from_static_ref(override_data)),
            None => None,
        };
        SentenceSegmenter {
            payload: DataPayload::from_static_ref(self.data),
            payload_locale_override,
        }
    }
}
/// Segmenting the empty string must yield exactly one boundary, at offset 0.
// Gated on `compiled_data` rather than `serde`: `SentenceSegmenter::new` only exists when
// `compiled_data` is enabled, and nothing in this test needs `serde`. With the previous gate the
// test failed to compile under `serde` without `compiled_data`, and was silently skipped under
// `compiled_data` without `serde`.
#[cfg(all(test, feature = "compiled_data"))]
#[test]
fn empty_string() {
    let segmenter = SentenceSegmenter::new(Default::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}