#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
mod lstm;
pub use lstm::*;
use crate::options::WordType;
use icu_collections::codepointtrie::CodePointTrie;
use icu_provider::prelude::*;
use zerovec::ZeroVec;
/// Unit struct that the compiled-data provider implementations are attached to.
///
/// Only exists when the `compiled_data` feature is enabled. The `icu_segmenter_data`
/// macros invoked in this module all receive `Baked` as their argument.
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
// Wires the compiled data from `icu_segmenter_data` onto `Baked`.
//
// Everything lives in an anonymous `const` block, so the glob import and the helper
// `icu` module declared here are scoped to the block rather than to the provider module.
#[cfg(feature = "compiled_data")]
// The glob import and re-exports may be partly unused depending on enabled features.
#[allow(unused_imports)]
const _: () = {
use icu_segmenter_data::*;
// Re-exports this crate and two dependencies under `icu::{segmenter, collections, locale}`.
// NOTE(review): presumably the macro-generated code refers to these `icu::…` paths —
// confirm against `icu_segmenter_data`.
pub mod icu {
pub use crate as segmenter;
pub use icu_collections as collections;
pub use icu_locale as locale;
}
make_provider!(Baked);
// One `impl_*!` invocation per data marker declared in this module.
impl_segmenter_break_sentence_v1!(Baked);
impl_segmenter_dictionary_auto_v1!(Baked);
impl_segmenter_break_grapheme_cluster_v1!(Baked);
impl_segmenter_dictionary_extended_v1!(Baked);
impl_segmenter_break_line_v1!(Baked);
// LSTM data is only attached when the `lstm` feature is enabled.
#[cfg(feature = "lstm")]
impl_segmenter_lstm_auto_v1!(Baked);
impl_segmenter_break_word_v1!(Baked);
impl_segmenter_break_word_override_v1!(Baked);
impl_segmenter_break_sentence_override_v1!(Baked);
};
icu_provider::data_marker!(
/// Data marker for LSTM segmentation data; its data struct is `LstmData<'static>`.
///
/// Declared with the name `"segmenter/lstm/auto/v1"`. The `attributes_domain` of
/// `"segmenter"` is only set when the `datagen` feature is enabled.
SegmenterLstmAutoV1,
"segmenter/lstm/auto/v1",
LstmData<'static>,
#[cfg(feature = "datagen")]
attributes_domain = "segmenter"
);
icu_provider::data_marker!(
/// Data marker for dictionary segmentation data (the "auto" set); its data struct is
/// `UCharDictionaryBreakData<'static>`.
///
/// Declared with the name `"segmenter/dictionary/auto/v1"`. The `attributes_domain` of
/// `"segmenter"` is only set when the `datagen` feature is enabled.
SegmenterDictionaryAutoV1,
"segmenter/dictionary/auto/v1",
UCharDictionaryBreakData<'static>,
#[cfg(feature = "datagen")]
attributes_domain = "segmenter"
);
icu_provider::data_marker!(
/// Data marker for dictionary segmentation data (the "extended" set); its data struct
/// is `UCharDictionaryBreakData<'static>`, the same struct used by
/// `SegmenterDictionaryAutoV1`.
///
/// Declared with the name `"segmenter/dictionary/extended/v1"`. The `attributes_domain`
/// of `"segmenter"` is only set when the `datagen` feature is enabled.
SegmenterDictionaryExtendedV1,
"segmenter/dictionary/extended/v1",
UCharDictionaryBreakData<'static>,
#[cfg(feature = "datagen")]
attributes_domain = "segmenter"
);
icu_provider::data_marker!(
/// Data marker for sentence-break override data; its data struct is
/// `RuleBreakDataOverride<'static>`.
///
/// Declared with the name `"segmenter/break/sentence/override/v1"`. Unlike the base
/// rule-break markers in this module, it is not declared with `is_singleton`.
SegmenterBreakSentenceOverrideV1,
"segmenter/break/sentence/override/v1",
RuleBreakDataOverride<'static>,
);
icu_provider::data_marker!(
/// Data marker for word-break override data; its data struct is
/// `RuleBreakDataOverride<'static>`.
///
/// Declared with the name `"segmenter/break/word/override/v1"`. Unlike the base
/// rule-break markers in this module, it is not declared with `is_singleton`.
SegmenterBreakWordOverrideV1,
"segmenter/break/word/override/v1",
RuleBreakDataOverride<'static>,
);
icu_provider::data_marker!(
/// Data marker for line-break rule data; its data struct is `RuleBreakData<'static>`.
///
/// Declared with the name `"segmenter/break/line/v1"` and `is_singleton = true`.
SegmenterBreakLineV1,
"segmenter/break/line/v1",
RuleBreakData<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Data marker for word-break rule data; its data struct is `RuleBreakData<'static>`.
///
/// Declared with the name `"segmenter/break/word/v1"` and `is_singleton = true`.
SegmenterBreakWordV1,
"segmenter/break/word/v1",
RuleBreakData<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Data marker for grapheme-cluster-break rule data; its data struct is
/// `RuleBreakData<'static>`.
///
/// Declared with the name `"segmenter/break/grapheme/cluster/v1"` and
/// `is_singleton = true`.
SegmenterBreakGraphemeClusterV1,
"segmenter/break/grapheme/cluster/v1",
RuleBreakData<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Data marker for sentence-break rule data; its data struct is
/// `RuleBreakData<'static>`.
///
/// Declared with the name `"segmenter/break/sentence/v1"` and `is_singleton = true`.
SegmenterBreakSentenceV1,
"segmenter/break/sentence/v1",
RuleBreakData<'static>,
is_singleton = true
);
pub use crate::word::inner::WordTypeULE;
/// The `INFO` of every data marker declared in this module, listed alphabetically by
/// marker name.
///
/// Only available with the `datagen` feature. `SegmenterLstmAutoV1` is listed
/// unconditionally, independent of the `lstm` feature.
#[cfg(feature = "datagen")]
pub const MARKERS: &[DataMarkerInfo] = &[
SegmenterBreakGraphemeClusterV1::INFO,
SegmenterBreakLineV1::INFO,
SegmenterBreakSentenceOverrideV1::INFO,
SegmenterBreakSentenceV1::INFO,
SegmenterBreakWordOverrideV1::INFO,
SegmenterBreakWordV1::INFO,
SegmenterDictionaryAutoV1::INFO,
SegmenterDictionaryExtendedV1::INFO,
SegmenterLstmAutoV1::INFO,
];
/// Data struct behind the rule-based break markers in this module
/// (`SegmenterBreakLineV1`, `SegmenterBreakWordV1`, `SegmenterBreakGraphemeClusterV1`
/// and `SegmenterBreakSentenceV1`).
///
/// Serialization (`serde::Serialize`, `databake::Bake`) is derived only with the
/// `datagen` feature; deserialization is derived with the `serde` feature. The three
/// table fields borrow from the input when deserialized (`serde(borrow)`).
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct RuleBreakData<'data> {
/// Code point trie that yields a `u8` property value per code point.
#[cfg_attr(feature = "serde", serde(borrow))]
pub property_table: CodePointTrie<'data, u8>,
/// Table of [`BreakState`] entries.
// NOTE(review): presumably indexed by a pair of property values, which would make its
// length `property_count` squared — confirm against the segmenter implementation.
#[cfg_attr(feature = "serde", serde(borrow))]
pub break_state_table: ZeroVec<'data, BreakState>,
/// Table of [`WordType`] values. Serialized under the field name `rule_status_table`
/// rather than `word_type_table`.
// NOTE(review): presumably only meaningful for word segmentation — confirm.
#[cfg_attr(feature = "serde", serde(borrow, rename = "rule_status_table"))]
pub word_type_table: ZeroVec<'data, WordType>,
/// Number of properties.
// NOTE(review): meaning inferred from the field name; confirm the relationship to
// `break_state_table`.
pub property_count: u8,
/// Property value of the last code point property.
// NOTE(review): presumably marks the end of the properties that map directly from
// code points (as opposed to derived states) — confirm.
pub last_codepoint_property: u8,
/// Property value used for the start of text (presumably "SOT" — confirm).
pub sot_property: u8,
/// Property value used for the end of text (presumably "EOT" — confirm).
pub eot_property: u8,
/// Property value used for "complex" text.
// NOTE(review): presumably the property of scripts that need dictionary/LSTM handling;
// any sentinel value for "not handled" is not visible here — confirm.
pub complex_property: u8,
}
// Passes `RuleBreakData` to `icu_provider::data_struct!`. The trailing
// `#[cfg(feature = "datagen")]` attribute is forwarded to the macro (presumably to gate
// datagen-only generated items — see the macro documentation).
icu_provider::data_struct!(
RuleBreakData<'_>,
#[cfg(feature = "datagen")]
);
/// Data struct behind the dictionary markers in this module
/// (`SegmenterDictionaryAutoV1` and `SegmenterDictionaryExtendedV1`).
///
/// Serialization (`serde::Serialize`, `databake::Bake`) is derived only with the
/// `datagen` feature; deserialization is derived with the `serde` feature.
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct UCharDictionaryBreakData<'data> {
/// The dictionary trie as a sequence of `u16` units; borrowed from the input when
/// deserialized.
// NOTE(review): the trie layout is not visible in this file; presumably ICU's
// `UCharsTrie` serialization, going by the struct name — confirm.
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie_data: ZeroVec<'data, u16>,
}
// Passes `UCharDictionaryBreakData` to `icu_provider::data_struct!`. The trailing
// `#[cfg(feature = "datagen")]` attribute is forwarded to the macro (presumably to gate
// datagen-only generated items — see the macro documentation).
icu_provider::data_struct!(
UCharDictionaryBreakData<'_>,
#[cfg(feature = "datagen")]
);
// Crate-internal dynamic data marker whose data struct is
// `UCharDictionaryBreakData<'static>`, the same struct used by both public dictionary
// markers.
// NOTE(review): presumably lets payloads from `SegmenterDictionaryAutoV1` and
// `SegmenterDictionaryExtendedV1` be handled through one marker type — confirm against
// the call sites.
pub(crate) struct UCharDictionaryBreakDataV1;
impl DynamicDataMarker for UCharDictionaryBreakDataV1 {
type DataStruct = UCharDictionaryBreakData<'static>;
}
/// Data struct behind the override markers in this module
/// (`SegmenterBreakSentenceOverrideV1` and `SegmenterBreakWordOverrideV1`).
///
/// Serialization (`serde::Serialize`, `databake::Bake`) is derived only with the
/// `datagen` feature; deserialization is derived with the `serde` feature.
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize,databake::Bake),
databake(path = icu_segmenter::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct RuleBreakDataOverride<'data> {
/// Code point trie that yields a `u8` property value per code point; borrowed from
/// the input when deserialized.
// NOTE(review): presumably consulted in preference to `RuleBreakData::property_table`
// for the code points it overrides — confirm against the segmenter implementation.
#[cfg_attr(feature = "serde", serde(borrow))]
pub property_table_override: CodePointTrie<'data, u8>,
}
// Passes `RuleBreakDataOverride` to `icu_provider::data_struct!`. The trailing
// `#[cfg(feature = "datagen")]` attribute is forwarded to the macro (presumably to gate
// datagen-only generated items — see the macro documentation).
icu_provider::data_struct!(
RuleBreakDataOverride<'_>,
#[cfg(feature = "datagen")]
);
/// One entry of `RuleBreakData::break_state_table`.
///
/// Each value is stored as a single byte through the `zerovec::ule::AsULE`
/// implementation in this module; the byte assigned to each variant is noted below.
#[derive(Clone, Copy, PartialEq, Debug)]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
pub enum BreakState {
/// Stored as byte `253`. Presumably "a break occurs here" — confirm.
Break,
/// Stored as byte `255`. Presumably "no break here" — confirm.
Keep,
/// Stored as byte `254`. Presumably "no rule matched" — confirm.
NoMatch,
/// Stored as the payload plus `120`, i.e. bytes `120..=252` for payloads `0..=132`.
// NOTE(review): presumably a look-ahead state whose payload is an index — confirm.
Intermediate(u8),
/// Stored as the payload itself; bytes `0..=119` decode back to this variant.
// NOTE(review): presumably the index of a state/property — confirm.
Index(u8),
}
#[cfg(feature = "datagen")]
impl serde::Serialize for BreakState {
    /// Serializes the single-byte `AsULE` encoding of this state.
    ///
    /// Human-readable formats receive the byte reinterpreted as an `i8`, so the encodings
    /// from `128` upward are written as negative numbers (e.g. `253` becomes `-3`). All
    /// other formats receive the byte as a `u8`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let encoded: u8 = zerovec::ule::AsULE::to_unaligned(*self);
        if serializer.is_human_readable() {
            // Same bits, signed view.
            let signed = i8::from_le_bytes([encoded]);
            signed.serialize(serializer)
        } else {
            encoded.serialize(serializer)
        }
    }
}
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for BreakState {
    /// Reads one byte and decodes it with `AsULE::from_unaligned`.
    ///
    /// Human-readable formats are expected to hold an `i8`, whose bits are reinterpreted
    /// as a `u8`; all other formats are expected to hold a `u8` directly. Every byte
    /// value decodes to some variant, so the only errors are those of the deserializer.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let encoded: u8 = if deserializer.is_human_readable() {
            // Same bits, unsigned view.
            let [bits] = i8::deserialize(deserializer)?.to_le_bytes();
            bits
        } else {
            u8::deserialize(deserializer)?
        };
        Ok(zerovec::ule::AsULE::from_unaligned(encoded))
    }
}
/// Single-byte encoding of [`BreakState`]:
///
/// | byte        | variant                      |
/// |-------------|------------------------------|
/// | `0..=119`   | `Index(byte)`                |
/// | `120..=252` | `Intermediate(byte - 120)`   |
/// | `253`       | `Break`                      |
/// | `254`       | `NoMatch`                    |
/// | `255`       | `Keep`                       |
///
/// Only `Index` payloads up to `119` and `Intermediate` payloads up to `132` round-trip;
/// larger payloads encode to a byte that decodes as a different variant.
impl zerovec::ule::AsULE for BreakState {
    type ULE = u8;

    /// Encodes this state as one byte (see the table on the impl).
    ///
    /// The `Intermediate` payload is added to `120` with ordinary `u8` arithmetic, so a
    /// payload above `135` overflows (a panic when overflow checks are enabled).
    fn to_unaligned(self) -> Self::ULE {
        match self {
            Self::Index(index) => index,
            Self::Intermediate(offset) => offset + 120,
            Self::Break => 253,
            Self::NoMatch => 254,
            Self::Keep => 255,
        }
    }

    /// Decodes one byte; every `u8` value maps to exactly one variant.
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        match unaligned {
            0..=119 => Self::Index(unaligned),
            120..=252 => Self::Intermediate(unaligned - 120),
            253 => Self::Break,
            254 => Self::NoMatch,
            255 => Self::Keep,
        }
    }
}
#[cfg(feature = "datagen")]
impl serde::Serialize for WordType {
    /// Serializes the value as its `u8` discriminant in human-readable formats.
    ///
    /// # Panics
    ///
    /// Panics with "only used as ULE" when the serializer is not human-readable; that
    /// path is treated as unreachable by this impl.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if !serializer.is_human_readable() {
            unreachable!("only used as ULE");
        }
        let discriminant = *self as u8;
        discriminant.serialize(serializer)
    }
}
#[cfg(feature = "datagen")]
impl databake::Bake for WordType {
    /// Exists only so that `WordType` satisfies the `Bake` bound; it is never expected
    /// to be called.
    ///
    /// # Panics
    ///
    /// Always panics with "only used as ULE".
    fn bake(&self, _: &databake::CrateEnv) -> databake::TokenStream {
        unreachable!("only used as ULE")
    }
}
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for WordType {
    /// Reads a `u8` from a human-readable format and maps `0`, `1`, `2` to
    /// `WordType::None`, `WordType::Number`, `WordType::Letter` respectively.
    ///
    /// # Errors
    ///
    /// Returns the deserializer's own error if no `u8` can be read, or a custom
    /// "invalid value" error for any other number.
    ///
    /// # Panics
    ///
    /// Panics with "only used as ULE" when the deserializer is not human-readable; that
    /// path is treated as unreachable by this impl.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error;

        if !deserializer.is_human_readable() {
            unreachable!("only used as ULE");
        }
        match u8::deserialize(deserializer)? {
            0 => Ok(Self::None),
            1 => Ok(Self::Number),
            2 => Ok(Self::Letter),
            _ => Err(D::Error::custom("invalid value")),
        }
    }
}