use alloc::vec::Vec;
use icu_provider::prelude::*;
use crate::indices::{Latin1Indices, Utf16Indices};
use crate::iterator_helpers::derive_usize_iterator_with_type;
use crate::provider::*;
use crate::rule_segmenter::*;
use utf8_iter::Utf8CharIndices;
/// Iterator over the grapheme cluster break positions of an input text.
///
/// This is a thin newtype around [`RuleBreakIterator`]. `'data` is the lifetime
/// of the borrowed segmentation rule data, `'s` is the lifetime of the input
/// being segmented, and `Y` is the [`RuleBreakType`] marker selecting the input
/// encoding (the constructors in this file use `Utf8`,
/// `PotentiallyIllFormedUtf8`, `Latin1` and `Utf16`).
#[derive(Debug)]
pub struct GraphemeClusterBreakIterator<'data, 's, Y: RuleBreakType>(
RuleBreakIterator<'data, 's, Y>,
);
// NOTE(review): presumably generates the `Iterator<Item = usize>` impl that
// forwards to the wrapped `RuleBreakIterator` (the tests in this file collect
// the iterator into `Vec<usize>`) — confirm against the macro definition.
derive_usize_iterator_with_type!(GraphemeClusterBreakIterator, 'data);
/// Owned grapheme cluster segmenter.
///
/// Holds the grapheme cluster break data as a [`DataPayload`]; call
/// `as_borrowed` to obtain a [`GraphemeClusterSegmenterBorrowed`], which
/// provides the actual `segment_*` methods.
#[derive(Debug)]
pub struct GraphemeClusterSegmenter {
// Rule data loaded from a data provider (or wrapped from static data).
payload: DataPayload<SegmenterBreakGraphemeClusterV1>,
}
/// Borrowed, `Copy` view of a grapheme cluster segmenter.
///
/// Wraps a reference to the rule break data and exposes the `segment_*`
/// constructors that produce [`GraphemeClusterBreakIterator`]s.
#[derive(Clone, Debug, Copy)]
pub struct GraphemeClusterSegmenterBorrowed<'data> {
// Borrowed rule data; handed to every iterator created from this segmenter.
data: &'data RuleBreakData<'data>,
}
impl GraphemeClusterSegmenter {
    /// Creates a borrowed segmenter backed by the baked singleton data.
    ///
    /// Only available when the `compiled_data` Cargo feature is enabled.
    /// Note that this returns a [`GraphemeClusterSegmenterBorrowed`] with a
    /// `'static` lifetime rather than `Self` (hence the lint expectation).
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub const fn new() -> GraphemeClusterSegmenterBorrowed<'static> {
        let data = Baked::SINGLETON_SEGMENTER_BREAK_GRAPHEME_CLUSTER_V1;
        GraphemeClusterSegmenterBorrowed { data }
    }

    // Generates the provider-based constructors listed below; `new` is
    // skipped because it is written by hand above.
    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
        functions: [
            new: skip,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]);

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,
    {
        // Load with a default request; any provider error is propagated.
        let response = provider.load(Default::default())?;
        Ok(Self {
            payload: response.payload,
        })
    }

    /// Returns a cheap borrowed view over the data owned by this segmenter.
    ///
    /// The returned value borrows from `self` and offers the `segment_*`
    /// methods.
    pub fn as_borrowed(&self) -> GraphemeClusterSegmenterBorrowed<'_> {
        let data = self.payload.get();
        GraphemeClusterSegmenterBorrowed { data }
    }
}
impl<'data> GraphemeClusterSegmenterBorrowed<'data> {
pub fn segment_str<'s>(self, input: &'s str) -> GraphemeClusterBreakIterator<'data, 's, Utf8> {
GraphemeClusterBreakIterator(RuleBreakIterator {
iter: input.char_indices(),
len: input.len(),
current_pos_data: None,
result_cache: Vec::new(),
data: self.data,
complex: None,
boundary_property: 0,
locale_override: None,
handle_complex_language: empty_handle_complex_language,
})
}
pub fn segment_utf8<'s>(
self,
input: &'s [u8],
) -> GraphemeClusterBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
GraphemeClusterBreakIterator(RuleBreakIterator {
iter: Utf8CharIndices::new(input),
len: input.len(),
current_pos_data: None,
result_cache: Vec::new(),
data: self.data,
complex: None,
boundary_property: 0,
locale_override: None,
handle_complex_language: empty_handle_complex_language,
})
}
pub fn segment_latin1<'s>(
self,
input: &'s [u8],
) -> GraphemeClusterBreakIterator<'data, 's, Latin1> {
GraphemeClusterBreakIterator(RuleBreakIterator {
iter: Latin1Indices::new(input),
len: input.len(),
current_pos_data: None,
result_cache: Vec::new(),
data: self.data,
complex: None,
boundary_property: 0,
locale_override: None,
handle_complex_language: empty_handle_complex_language,
})
}
pub fn segment_utf16<'s>(
self,
input: &'s [u16],
) -> GraphemeClusterBreakIterator<'data, 's, Utf16> {
GraphemeClusterBreakIterator(RuleBreakIterator {
iter: Utf16Indices::new(input),
len: input.len(),
current_pos_data: None,
result_cache: Vec::new(),
data: self.data,
complex: None,
boundary_property: 0,
locale_override: None,
handle_complex_language: empty_handle_complex_language,
})
}
}
impl GraphemeClusterSegmenterBorrowed<'static> {
/// Converts a `'static` borrowed segmenter into an owned
/// [`GraphemeClusterSegmenter`].
///
/// The owned value is built by wrapping the static data reference with
/// `DataPayload::from_static_ref`. This is a `const fn`, so it can be used
/// in constant contexts.
pub const fn static_to_owned(self) -> GraphemeClusterSegmenter {
GraphemeClusterSegmenter {
payload: DataPayload::from_static_ref(self.data),
}
}
}
/// Segmenting the empty string yields a single boundary at offset 0.
// Gated on `compiled_data` because `GraphemeClusterSegmenter::new` only
// exists with that feature; without the gate the test build fails to compile.
#[cfg(feature = "compiled_data")]
#[test]
fn empty_string() {
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}
/// Flag emoji must each be kept together as one grapheme cluster.
///
/// The input is two flags:
/// * a regional-indicator pair, U+1F1FA U+1F1F8 (2 × 4 = 8 UTF-8 bytes);
/// * an emoji tag sequence, U+1F3F4 followed by the tags U+E0067 U+E0062
///   U+E0065 U+E006E U+E0067 and the cancel tag U+E007F (7 × 4 = 28 bytes).
///
/// Hence the expected boundaries are byte offsets 0, 8 and 36.
// Gated on `compiled_data` because `GraphemeClusterSegmenter::new` only
// exists with that feature; without the gate the test build fails to compile.
#[cfg(feature = "compiled_data")]
#[test]
fn emoji_flags() {
    // The tag characters are invisible when rendered, so they are written as
    // escapes to keep the literal unambiguous and safe from tools that strip
    // invisible code points.
    let flags = "\u{1F1FA}\u{1F1F8}\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}";
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str(flags).collect();
    assert_eq!(breaks, [0, 8, 36]);
}