use std::sync::LazyLock;
use auto_enums::auto_enum;
use icu_segmenter::{
options::{SentenceBreakInvariantOptions, WordBreakInvariantOptions},
GraphemeClusterSegmenter, GraphemeClusterSegmenterBorrowed, SentenceSegmenter,
SentenceSegmenterBorrowed, WordSegmenter, WordSegmenterBorrowed,
};
use itertools::Itertools;
use strum::EnumIter;
/// Shared grapheme-cluster segmenter.
///
/// Declared as a `const`, so `GraphemeClusterSegmenter::new()` is evaluated at compile
/// time. Used by [`FallbackLevel::sections`] for the `GraphemeCluster` level.
pub const GRAPHEME_SEGMENTER: GraphemeClusterSegmenterBorrowed<'static> =
GraphemeClusterSegmenter::new();
/// Shared word segmenter, built on first access and reused afterwards.
///
/// Constructed with `WordSegmenter::new_dictionary` and the default
/// `WordBreakInvariantOptions`. Used by [`FallbackLevel::sections`] for the `Word` level.
// NOTE(review): wrapped in `LazyLock` presumably because this constructor cannot be
// evaluated in a `const`/`static` initializer — confirm against the icu_segmenter docs.
static WORD_SEGMENTER: LazyLock<WordSegmenterBorrowed<'static>> =
LazyLock::new(|| WordSegmenter::new_dictionary(WordBreakInvariantOptions::default()));
/// Shared sentence segmenter, built on first access and reused afterwards.
///
/// Constructed with `SentenceSegmenter::new` and the default
/// `SentenceBreakInvariantOptions`. Used by [`FallbackLevel::sections`] for the
/// `Sentence` level.
static SENTENCE_SEGMENTER: LazyLock<SentenceSegmenterBorrowed<'static>> =
LazyLock::new(|| SentenceSegmenter::new(SentenceBreakInvariantOptions::default()));
/// Granularity at which [`FallbackLevel::sections`] splits a piece of text.
///
/// The derived `Ord`/`PartialOrd` follow declaration order, so
/// `Char < GraphemeCluster < Word < Sentence`. Do not reorder the variants without
/// checking code that compares or iterates (`EnumIter`) over levels.
// NOTE(review): the name suggests callers step between levels when one granularity is
// unsuitable — confirm against the call sites, which are not visible here.
#[derive(Clone, Copy, Debug, EnumIter, Eq, PartialEq, Ord, PartialOrd)]
pub enum FallbackLevel {
/// One section per `char` of the text.
Char,
/// Sections delimited by the breakpoints of `GRAPHEME_SEGMENTER`.
GraphemeCluster,
/// Sections delimited by the breakpoints of `WORD_SEGMENTER`.
Word,
/// Sections delimited by the breakpoints of `SENTENCE_SEGMENTER`.
Sentence,
}
impl FallbackLevel {
    /// Splits `text` into consecutive sections at this level's granularity.
    ///
    /// Every item is `(offset, section)`, where `offset` is the byte index in `text`
    /// at which `section` starts and `section` is the corresponding sub-slice.
    ///
    /// * `Char` yields one section per `char`.
    /// * `GraphemeCluster`, `Word` and `Sentence` ask the matching segmenter for its
    ///   breakpoints and yield the slice between each pair of adjacent breakpoints.
    ///
    /// The iterator is lazy and borrows from `text`.
    ///
    /// # Panics
    ///
    /// Panics with `"char should be valid"` if a `char`'s byte range cannot be sliced
    /// out of `text`, or (for the segmenter-backed levels) if a reported breakpoint
    /// pair is not a valid slice range of `text`.
    #[auto_enum(Iterator)]
    pub fn sections(self, text: &str) -> impl Iterator<Item = (usize, &str)> {
        // Turns a pair of adjacent breakpoints into `(start, slice)`. The closure only
        // captures a `&str`, which makes it `Copy`, so each arm can take its own copy.
        let between = move |(start, end): (usize, usize)| (start, &text[start..end]);

        match self {
            Self::Char => text.char_indices().map(move |(start, ch)| {
                let end = start + ch.len_utf8();
                let section = text.get(start..end).expect("char should be valid");
                (start, section)
            }),
            Self::GraphemeCluster => GRAPHEME_SEGMENTER
                .segment_str(text)
                .tuple_windows()
                .map(between),
            Self::Word => WORD_SEGMENTER
                .segment_str(text)
                .tuple_windows()
                .map(between),
            Self::Sentence => SENTENCE_SEGMENTER
                .segment_str(text)
                .tuple_windows()
                .map(between),
        }
    }
}