#![warn(missing_docs)]
use cldr_cache::CldrCache;
use elsa::sync::FrozenMap;
use icu::calendar::{Date, Iso};
use icu::time::zone::UtcOffset;
use icu::time::Time;
use icu_provider::prelude::*;
use source::{AbstractFs, SerdeCache, TzdbCache, UnihanCache};
use std::collections::{BTreeSet, HashSet};
use std::fmt::Debug;
use std::path::Path;
use std::sync::{Arc, OnceLock};
mod calendar;
mod characters;
mod cldr_serde;
mod collator;
#[cfg(feature = "unstable")]
mod currency;
mod datetime;
mod debug_provider;
mod decimal;
#[cfg(feature = "unstable")]
mod displaynames;
mod duration;
mod list;
mod locale;
mod normalizer;
#[cfg(feature = "unstable")]
mod percent;
#[cfg(feature = "unstable")]
mod personnames;
mod plurals;
mod properties;
#[cfg(feature = "unstable")]
mod relativetime;
mod segmenter;
mod time_zones;
#[cfg(feature = "unstable")]
mod transforms;
mod ucase;
#[cfg(feature = "unstable")]
mod units;
mod cldr_cache;
mod source;
#[cfg(test)]
mod tests;
/// Provider state: one optional handle per kind of source data, plus a few
/// generation options.
///
/// Every source handle starts out as `None` (see `new_custom`) and is filled in
/// by the matching `with_*` builder. The handles and the request cache sit
/// behind `Arc`, so clones of a provider share them.
// NOTE(review): every field is private or `pub(crate)`, which is presumably why
// the `exhaustive_structs` lint is silenced here — confirm.
#[allow(clippy::exhaustive_structs)] #[derive(Debug, Clone)]
pub struct SourceDataProvider {
/// CLDR source; set by `with_cldr` / `with_cldr_for_tag`.
cldr_paths: Option<Arc<CldrCache>>,
/// ICU export source; set by `with_icuexport` / `with_icuexport_for_tag`.
icuexport_paths: Option<Arc<SerdeCache>>,
/// Segmenter LSTM source; set by `with_segmenter_lstm` / `with_segmenter_lstm_for_tag`.
segmenter_lstm_paths: Option<Arc<SerdeCache>>,
/// tzdb source; set by `with_tzdb` / `with_tzdb_for_tag`.
tzdb_paths: Option<Arc<TzdbCache>>,
/// Unihan source; set by `with_unihan` / `with_unihan_for_tag`.
unihan_paths: Option<Arc<UnihanCache>>,
/// UCD source; set by `with_ucd` / `with_ucd_for_tag`.
ucd_paths: Option<Arc<AbstractFs>>,
/// Trie flavour; `with_fast_tries` switches it to `TrieType::Fast`.
trie_type: TrieType,
/// Han ordering option for the collation root; set by `with_collation_root_han`.
collation_root_han: CollationRootHan,
/// Time zone horizon; `new_custom` initialises it to `2015-01-01T00:00:00Z`
/// and `with_timezone_horizon` overrides it.
pub(crate) timezone_horizon: time_zones::Timestamp,
/// Per-marker cache of the result of `iter_ids_cached`, filled lazily by
/// `populate_requests_cache`. The whole `Result` is stored, so an error is
/// cached as well.
#[expect(clippy::type_complexity)] requests_cache: Arc<
FrozenMap<
DataMarkerInfo,
Box<OnceLock<Result<HashSet<DataIdentifierCow<'static>>, DataError>>>,
>,
>,
}
// Callback macro handed to `icu_provider_registry::registry!`.
//
// It accepts a non-empty list of `marker_type: identifier,` pairs, the literal
// token `#[unstable]`, and then a second non-empty list of the same shape. It
// expands to one `make_exportable_provider!` invocation for
// `SourceDataProvider` that lists every marker type from the first group
// unconditionally and every marker type from the second group behind
// `cfg(feature = "unstable")`. The identifiers are matched but not used.
macro_rules! cb {
($($marker_ty:ty:$marker:ident,)+ #[unstable] $($emarker_ty:ty:$emarker:ident,)+) => {
icu_provider::export::make_exportable_provider!(SourceDataProvider, [
$($marker_ty,)+
$(#[cfg(feature = "unstable")] $emarker_ty,)+
]);
}
}
// NOTE(review): presumably needed because the macro expansions below refer to
// `alloc::` paths — confirm before removing.
extern crate alloc;
// Invokes the registry macro with `cb` as the callback; presumably this feeds
// `cb` the full list of stable and unstable markers — confirm against
// `icu_provider_registry`.
icu_provider_registry::registry!(cb);
// NOTE(review): judging by the name, implements the data-provider trait for the
// "never" marker — confirm against `icu_provider::marker`.
icu_provider::marker::impl_data_provider_never_marker!(SourceDataProvider);
impl SourceDataProvider {
/// CLDR tag that `new` passes to `with_cldr_for_tag`.
pub const TESTED_CLDR_TAG: &'static str = "48.2.0";
/// ICU export tag that `new` passes to `with_icuexport_for_tag`.
pub const TESTED_ICUEXPORT_TAG: &'static str = "release-78.1rc";
/// Segmenter LSTM tag that `new` passes to `with_segmenter_lstm_for_tag`.
pub const TESTED_SEGMENTER_LSTM_TAG: &'static str = "v0.1.0";
/// UCD tag; `new` passes it to `with_unihan_for_tag`.
pub const TESTED_UCD_TAG: &'static str = "17.0.0";
/// tzdb tag that `new` passes to `with_tzdb_for_tag`.
pub const TESTED_TZDB_TAG: &'static str = "2026a";
/// Returns a provider configured with the `TESTED_*` tags for the CLDR, ICU
/// export, segmenter LSTM, tzdb and Unihan sources.
///
/// The provider is built once per process and cloned on every call, so all
/// providers returned from here share the same `Arc`-held sources and request
/// cache. No UCD source is configured by this constructor.
#[cfg(feature = "networking")]
#[expect(clippy::new_without_default)]
pub fn new() -> Self {
    static SHARED: OnceLock<SourceDataProvider> = OnceLock::new();
    let build = || {
        Self::new_custom()
            .with_cldr_for_tag(Self::TESTED_CLDR_TAG)
            .with_icuexport_for_tag(Self::TESTED_ICUEXPORT_TAG)
            .with_segmenter_lstm_for_tag(Self::TESTED_SEGMENTER_LSTM_TAG)
            .with_tzdb_for_tag(Self::TESTED_TZDB_TAG)
            .with_unihan_for_tag(Self::TESTED_UCD_TAG)
    };
    SHARED.get_or_init(build).clone()
}
/// Creates a provider with no data sources configured.
///
/// The trie type, the collation root Han option and the request cache take
/// their `Default` values; the time zone horizon is `2015-01-01T00:00:00Z`.
/// Sources are added afterwards with the `with_*` builders.
pub fn new_custom() -> Self {
    // Parsing a fixed literal; the `unwrap` only fires if that literal is invalid.
    let default_horizon = time_zones::Timestamp::try_offset_only_from_str(
        "2015-01-01T00:00:00Z",
        Default::default(),
    )
    .unwrap();
    Self {
        cldr_paths: None,
        icuexport_paths: None,
        segmenter_lstm_paths: None,
        tzdb_paths: None,
        unihan_paths: None,
        ucd_paths: None,
        trie_type: Default::default(),
        collation_root_han: Default::default(),
        timezone_horizon: default_horizon,
        requests_cache: Default::default(),
    }
}
/// Sets the CLDR source to the data found at `root`.
///
/// # Errors
///
/// Returns the error from `AbstractFs::new` if `root` cannot be opened.
pub fn with_cldr(self, root: &Path) -> Result<Self, DataError> {
    let fs = AbstractFs::new(root)?;
    let cache = CldrCache::from_serde_cache(SerdeCache::new(fs));
    Ok(Self {
        cldr_paths: Some(Arc::new(cache)),
        ..self
    })
}
/// Sets the ICU export source to the data found at `root`.
///
/// # Errors
///
/// Returns the error from `AbstractFs::new` if `root` cannot be opened.
pub fn with_icuexport(self, root: &Path) -> Result<Self, DataError> {
    let fs = AbstractFs::new(root)?;
    let cache = SerdeCache::new(fs);
    Ok(Self {
        icuexport_paths: Some(Arc::new(cache)),
        ..self
    })
}
/// Sets the segmenter LSTM source to the data found at `root`.
///
/// # Errors
///
/// Returns the error from `AbstractFs::new` if `root` cannot be opened.
pub fn with_segmenter_lstm(self, root: &Path) -> Result<Self, DataError> {
    let fs = AbstractFs::new(root)?;
    let cache = SerdeCache::new(fs);
    Ok(Self {
        segmenter_lstm_paths: Some(Arc::new(cache)),
        ..self
    })
}
/// Sets the Unihan source to the data found at `root`, with an empty IRG cache.
///
/// # Errors
///
/// Returns the error from `AbstractFs::new` if `root` cannot be opened.
pub fn with_unihan(self, root: &Path) -> Result<Self, DataError> {
    let root = AbstractFs::new(root)?;
    let cache = UnihanCache {
        root,
        irg_cache: Default::default(),
    };
    Ok(Self {
        unihan_paths: Some(Arc::new(cache)),
        ..self
    })
}
/// Sets the UCD source to the data found at `root`.
///
/// # Errors
///
/// Returns the error from `AbstractFs::new` if `root` cannot be opened.
pub fn with_ucd(self, root: &Path) -> Result<Self, DataError> {
    let fs = AbstractFs::new(root)?;
    Ok(Self {
        ucd_paths: Some(Arc::new(fs)),
        ..self
    })
}
/// Sets the tzdb source to the data found at `root`, with an empty
/// transitions cache.
///
/// # Errors
///
/// Returns the error from `AbstractFs::new` if `root` cannot be opened.
pub fn with_tzdb(self, root: &Path) -> Result<Self, DataError> {
    let root = AbstractFs::new(root)?;
    let cache = TzdbCache {
        root,
        transitions: Default::default(),
    };
    Ok(Self {
        tzdb_paths: Some(Arc::new(cache)),
        ..self
    })
}
/// Sets the CLDR source to the `cldr-{tag}-json-full.zip` asset of the
/// `cldr-json` GitHub release named `tag`.
#[cfg(feature = "networking")]
pub fn with_cldr_for_tag(self, tag: &str) -> Self {
    let url = format!(
        "https://github.com/unicode-org/cldr-json/releases/download/{tag}/cldr-{tag}-json-full.zip"
    );
    let cache = CldrCache::from_serde_cache(SerdeCache::new(AbstractFs::new_from_url(url)));
    Self {
        cldr_paths: Some(Arc::new(cache)),
        ..self
    }
}
/// Sets the ICU export source to the export archive of the `unicode-org/icu`
/// GitHub release named `tag`.
///
/// Two asset naming schemes are supported:
/// - tags that compare `>= "release-78.1"` or start with `icu4x-` use
///   `icu4x-icuexportdata-{version}.zip`, where `version` is the tag with
///   `release-` and `icu4x-` removed;
/// - all other tags use `icuexportdata_{tag}.zip`, with `/` in the tag
///   replaced by `-`.
#[cfg(feature = "networking")]
pub fn with_icuexport_for_tag(self, tag: &str) -> Self {
    // NOTE(review): `>=` on `&str` is a lexicographic comparison, not a
    // version comparison (e.g. "release-100.1" sorts before "release-78.1").
    let uses_icu4x_asset_name = tag >= "release-78.1" || tag.starts_with("icu4x-");
    let url = if uses_icu4x_asset_name {
        format!(
            "https://github.com/unicode-org/icu/releases/download/{tag}/icu4x-icuexportdata-{}.zip",
            tag.replace("release-", "").replace("icu4x-", "")
        )
    } else {
        format!(
            "https://github.com/unicode-org/icu/releases/download/{tag}/icuexportdata_{}.zip",
            tag.replace('/', "-")
        )
    };
    let cache = SerdeCache::new(AbstractFs::new_from_url(url));
    Self {
        icuexport_paths: Some(Arc::new(cache)),
        ..self
    }
}
/// Sets the segmenter LSTM source to the `models.zip` asset of the
/// `lstm_word_segmentation` GitHub release named `tag`.
#[cfg(feature = "networking")]
pub fn with_segmenter_lstm_for_tag(self, tag: &str) -> Self {
    let url = format!(
        "https://github.com/unicode-org/lstm_word_segmentation/releases/download/{tag}/models.zip"
    );
    let cache = SerdeCache::new(AbstractFs::new_from_url(url));
    Self {
        segmenter_lstm_paths: Some(Arc::new(cache)),
        ..self
    }
}
/// Sets the Unihan source to `Unihan.zip` under `Public/{tag}/ucd/` on
/// unicode.org, with an empty IRG cache.
#[cfg(feature = "networking")]
pub fn with_unihan_for_tag(self, tag: &str) -> Self {
    let url = format!("https://www.unicode.org/Public/{tag}/ucd/Unihan.zip");
    let cache = UnihanCache {
        root: AbstractFs::new_from_url(url),
        irg_cache: Default::default(),
    };
    Self {
        unihan_paths: Some(Arc::new(cache)),
        ..self
    }
}
/// Sets the UCD source to the `Public/{tag}/` location on unicode.org.
#[cfg(feature = "networking")]
pub fn with_ucd_for_tag(self, tag: &str) -> Self {
    let url = format!("https://www.unicode.org/Public/{tag}/");
    let fs = AbstractFs::new_from_url(url);
    Self {
        ucd_paths: Some(Arc::new(fs)),
        ..self
    }
}
/// Sets the tzdb source to the `tzdata{tag}.tar.gz` release archive on
/// iana.org, with an empty transitions cache.
#[cfg(feature = "networking")]
pub fn with_tzdb_for_tag(self, tag: &str) -> Self {
    let url = format!("https://www.iana.org/time-zones/repository/releases/tzdata{tag}.tar.gz");
    let cache = TzdbCache {
        root: AbstractFs::new_from_url(url),
        transitions: Default::default(),
    };
    Self {
        tzdb_paths: Some(Arc::new(cache)),
        ..self
    }
}
/// Error returned by `cldr()` when no CLDR source has been configured.
const MISSING_CLDR_ERROR: DataError =
    DataError::custom("Missing CLDR data. Use `.with_cldr[_for_tag]` to set CLDR data.");
/// Error returned by `icuexport()` when no ICU export source has been configured.
const MISSING_ICUEXPORT_ERROR: DataError =
    DataError::custom("Missing ICU data. Use `.with_icuexport[_for_tag]` to set ICU data.");
/// Error returned by `segmenter_lstm()` when no segmenter LSTM source has been configured.
const MISSING_SEGMENTER_LSTM_ERROR: DataError = DataError::custom(
    "Missing segmenter data. Use `.with_segmenter_lstm[_for_tag]` to set segmenter data.",
);
/// Error returned by `unihan()` when no Unihan source has been configured.
const MISSING_UNIHAN_ERROR: DataError =
    DataError::custom("Missing Unihan data. Use `.with_unihan[_for_tag]` to set Unihan data.");
/// Error returned by `ucd()` when no UCD source has been configured.
// `with_ucd_for_tag` exists next to `with_ucd`, so the hint names both forms,
// in the same `[_for_tag]` notation the other messages use.
const MISSING_UCD_ERROR: DataError =
    DataError::custom("Missing UCD data. Use `.with_ucd[_for_tag]` to set UCD data.");
/// Error returned by `tzdb()` when no tzdb source has been configured.
const MISSING_TZDB_ERROR: DataError =
    DataError::custom("Missing tzdb data. Use `.with_tzdb[_for_tag]` to set tzdb data.");
/// Returns `true` if `e` is the "missing CLDR data" error, regardless of
/// which marker is attached to it.
pub fn is_missing_cldr_error(e: DataError) -> bool {
    // The reference error carries no marker, so clear it before comparing.
    let mut unmarked = e;
    unmarked.marker = None;
    Self::MISSING_CLDR_ERROR == unmarked
}
/// Returns `true` if `e` is the "missing ICU data" error, regardless of
/// which marker is attached to it.
pub fn is_missing_icuexport_error(e: DataError) -> bool {
    // The reference error carries no marker, so clear it before comparing.
    let mut unmarked = e;
    unmarked.marker = None;
    Self::MISSING_ICUEXPORT_ERROR == unmarked
}
/// Returns `true` if `e` is the "missing segmenter data" error, regardless of
/// which marker is attached to it.
pub fn is_missing_segmenter_lstm_error(e: DataError) -> bool {
    // The reference error carries no marker, so clear it before comparing.
    let mut unmarked = e;
    unmarked.marker = None;
    Self::MISSING_SEGMENTER_LSTM_ERROR == unmarked
}
/// Returns `true` if `e` is the "missing tzdb data" error, regardless of
/// which marker is attached to it.
pub fn is_missing_tzdb_error(e: DataError) -> bool {
    // The reference error carries no marker, so clear it before comparing.
    let mut unmarked = e;
    unmarked.marker = None;
    Self::MISSING_TZDB_ERROR == unmarked
}
/// Returns `true` if `e` is the "missing Unihan data" error, regardless of
/// which marker is attached to it.
pub fn is_missing_unihan_error(e: DataError) -> bool {
    // The reference error carries no marker, so clear it before comparing.
    let mut unmarked = e;
    unmarked.marker = None;
    Self::MISSING_UNIHAN_ERROR == unmarked
}
/// Returns `true` if `e` is the "missing UCD data" error, regardless of
/// which marker is attached to it.
pub fn is_missing_ucd_error(e: DataError) -> bool {
    // The reference error carries no marker, so clear it before comparing.
    let mut unmarked = e;
    unmarked.marker = None;
    Self::MISSING_UCD_ERROR == unmarked
}
/// Borrows the CLDR source, or fails with `MISSING_CLDR_ERROR` if none is set.
fn cldr(&self) -> Result<&CldrCache, DataError> {
    let Some(cache) = self.cldr_paths.as_deref() else {
        return Err(Self::MISSING_CLDR_ERROR);
    };
    Ok(cache)
}
/// Borrows the ICU export source, or fails with `MISSING_ICUEXPORT_ERROR` if
/// none is set.
fn icuexport(&self) -> Result<&SerdeCache, DataError> {
    let Some(cache) = self.icuexport_paths.as_deref() else {
        return Err(Self::MISSING_ICUEXPORT_ERROR);
    };
    Ok(cache)
}
/// Borrows the segmenter LSTM source, or fails with
/// `MISSING_SEGMENTER_LSTM_ERROR` if none is set.
fn segmenter_lstm(&self) -> Result<&SerdeCache, DataError> {
    let Some(cache) = self.segmenter_lstm_paths.as_deref() else {
        return Err(Self::MISSING_SEGMENTER_LSTM_ERROR);
    };
    Ok(cache)
}
/// Borrows the Unihan source, or fails with `MISSING_UNIHAN_ERROR` if none is
/// set.
// NOTE(review): presumably only called from feature-gated modules, hence the
// `dead_code` allowance — confirm.
#[allow(dead_code)]
fn unihan(&self) -> Result<&UnihanCache, DataError> {
    let Some(cache) = self.unihan_paths.as_deref() else {
        return Err(Self::MISSING_UNIHAN_ERROR);
    };
    Ok(cache)
}
/// Borrows the UCD source, or fails with `MISSING_UCD_ERROR` if none is set.
// NOTE(review): presumably only called from feature-gated modules, hence the
// `dead_code` allowance — confirm.
#[allow(dead_code)]
fn ucd(&self) -> Result<&AbstractFs, DataError> {
    let Some(fs) = self.ucd_paths.as_deref() else {
        return Err(Self::MISSING_UCD_ERROR);
    };
    Ok(fs)
}
/// Borrows the tzdb source, or fails with `MISSING_TZDB_ERROR` if none is set.
fn tzdb(&self) -> Result<&TzdbCache, DataError> {
    let Some(cache) = self.tzdb_paths.as_deref() else {
        return Err(Self::MISSING_TZDB_ERROR);
    };
    Ok(cache)
}
/// Switches the trie type to `TrieType::Fast` (the default is
/// `TrieType::Small`).
pub fn with_fast_tries(self) -> Self {
    let mut updated = self;
    updated.trie_type = TrieType::Fast;
    updated
}
/// Sets the Han ordering option used for the collation root.
pub fn with_collation_root_han(self, collation_root_han: CollationRootHan) -> Self {
    let mut updated = self;
    updated.collation_root_han = collation_root_han;
    updated
}
/// Sets the time zone horizon to the start of day of `date` at a zero UTC
/// offset.
pub fn with_timezone_horizon(self, date: Date<Iso>) -> Self {
    let horizon = time_zones::Timestamp {
        date,
        time: Time::start_of_day(),
        zone: UtcOffset::zero(),
    };
    Self {
        timezone_horizon: horizon,
        ..self
    }
}
/// Returns the configured trie type.
fn trie_type(&self) -> TrieType {
self.trie_type
}
/// Returns the configured Han ordering option for the collation root.
fn collation_root_han(&self) -> CollationRootHan {
self.collation_root_han
}
/// Asks the CLDR source for the locales belonging to the given coverage
/// `levels`.
///
/// # Errors
///
/// Fails with the "missing CLDR data" error if no CLDR source is configured;
/// otherwise returns whatever the CLDR source's `locales` lookup returns.
pub fn locales_for_coverage_levels(
    &self,
    levels: impl IntoIterator<Item = CoverageLevel>,
) -> Result<impl IntoIterator<Item = DataLocale>, DataError> {
    let cldr = self.cldr()?;
    cldr.locales(levels)
}
}
impl SourceDataProvider {
    /// Validates `req` for marker `M` before any data is loaded.
    ///
    /// - Singleton markers accept only the unknown locale; anything else is
    ///   rejected with `DataErrorKind::InvalidRequest`.
    /// - Other markers require the requested identifier to be in the cached
    ///   identifier set; otherwise `DataErrorKind::IdentifierNotFound`.
    ///
    /// Both rejection kinds are returned with the marker and request attached.
    /// An error from populating the cache is propagated unchanged.
    fn check_req<M: DataMarker>(&self, req: DataRequest) -> Result<(), DataError>
    where
        SourceDataProvider: IterableDataProviderCached<M>,
    {
        let info = <M as DataMarker>::INFO;
        let rejection = if info.is_singleton {
            (!req.id.locale.is_unknown()).then_some(DataErrorKind::InvalidRequest)
        } else {
            let known_ids = self.populate_requests_cache::<M>()?;
            (!known_ids.contains(&req.id.as_cow())).then_some(DataErrorKind::IdentifierNotFound)
        };
        match rejection {
            Some(kind) => Err(kind.with_req(info, req)),
            None => Ok(()),
        }
    }
}
/// `check_req` accepts a locale that `HelloWorldProvider` lists ("fi") and
/// rejects one that it does not ("arc").
#[test]
fn test_check_req() {
    use icu::locale::langid;
    use icu_provider::hello_world::*;

    // Test-only impls that route `HelloWorldV1` through `HelloWorldProvider`.
    #[allow(non_local_definitions)]
    impl DataProvider<HelloWorldV1> for SourceDataProvider {
        fn load(&self, req: DataRequest) -> Result<DataResponse<HelloWorldV1>, DataError> {
            HelloWorldProvider.load(req)
        }
    }
    #[allow(non_local_definitions)]
    impl IterableDataProviderCached<HelloWorldV1> for SourceDataProvider {
        fn iter_ids_cached(&self) -> Result<HashSet<DataIdentifierCow<'static>>, DataError> {
            Ok(HelloWorldProvider.iter_ids()?.into_iter().collect())
        }
    }

    let provider = SourceDataProvider::new_testing();

    let listed: DataLocale = langid!("fi").into();
    let listed_result = provider.check_req::<HelloWorldV1>(DataRequest {
        id: DataIdentifierBorrowed::for_locale(&listed),
        ..Default::default()
    });
    assert!(listed_result.is_ok());

    let unlisted: DataLocale = langid!("arc").into();
    let unlisted_result = provider.check_req::<HelloWorldV1>(DataRequest {
        id: DataIdentifierBorrowed::for_locale(&unlisted),
        ..Default::default()
    });
    assert!(unlisted_result.is_err());
}
/// Internal counterpart of `IterableDataProvider`: implementors list every
/// identifier available for marker `M` as an owned set.
///
/// `SourceDataProvider::populate_requests_cache` calls this and stores the
/// result in the provider's request cache.
trait IterableDataProviderCached<M: DataMarker>: DataProvider<M> {
/// Computes the full set of identifiers for `M`.
fn iter_ids_cached(&self) -> Result<HashSet<DataIdentifierCow<'static>>, DataError>;
}
impl SourceDataProvider {
    /// Returns the cached identifier set for marker `M`, computing it through
    /// `iter_ids_cached` if the marker's cache slot is still empty.
    ///
    /// The stored value is the whole `Result`, so a failure is cached too and
    /// returned (as a copy) on every later call.
    fn populate_requests_cache<M: DataMarker>(
        &self,
    ) -> Result<&HashSet<DataIdentifierCow<'_>>, DataError>
    where
        SourceDataProvider: IterableDataProviderCached<M>,
    {
        // One `OnceLock` per marker; it is created empty on first access.
        let slot = self
            .requests_cache
            .insert_with(M::INFO, || Box::new(OnceLock::new()));
        match slot.get_or_init(|| self.iter_ids_cached()) {
            Ok(ids) => Ok(ids),
            Err(err) => Err(*err),
        }
    }
}
impl<M: DataMarker> IterableDataProvider<M> for SourceDataProvider
where
    SourceDataProvider: IterableDataProviderCached<M>,
{
    /// Lists the identifiers available for marker `M`.
    ///
    /// Singleton markers always yield exactly the default identifier; other
    /// markers yield borrowed views of the cached identifier set.
    fn iter_ids(&self) -> Result<BTreeSet<DataIdentifierCow<'_>>, DataError> {
        if <M as DataMarker>::INFO.is_singleton {
            return Ok(std::iter::once(Default::default()).collect());
        }
        let cached = self.populate_requests_cache::<M>()?;
        let ids = cached.iter().map(|id| id.as_borrowed().as_cow()).collect();
        Ok(ids)
    }
}
/// Han ordering option for the collation root.
///
/// Serialized as the lowercase variant name (`"implicit"` / `"unihan"`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CollationRootHan {
    /// The default option; displayed as `implicithan`.
    #[default]
    Implicit,
    /// The Unihan option; displayed as `unihan`.
    Unihan,
}
impl std::fmt::Display for CollationRootHan {
    /// Writes `implicithan` for `Implicit` and `unihan` for `Unihan`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        let label = match self {
            CollationRootHan::Implicit => "implicithan",
            CollationRootHan::Unihan => "unihan",
        };
        f.write_str(label)
    }
}
/// Coverage level accepted by `SourceDataProvider::locales_for_coverage_levels`.
///
/// Variant names are serialized in camelCase (`"modern"`, `"moderate"`, `"basic"`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)]
#[non_exhaustive]
#[serde(rename_all = "camelCase")]
pub enum CoverageLevel {
/// Serialized as `"modern"`.
Modern,
/// Serialized as `"moderate"`.
Moderate,
/// Serialized as `"basic"`.
Basic,
}
/// Trie flavour selected on the provider.
///
/// Serialized as the lowercase variant name (`"fast"` / `"small"`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
enum TrieType {
    /// Selected by `SourceDataProvider::with_fast_tries`.
    Fast,
    /// The default.
    #[default]
    Small,
}
impl std::fmt::Display for TrieType {
    /// Writes `fast` for `Fast` and `small` for `Small`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        let label = match self {
            TrieType::Fast => "fast",
            TrieType::Small => "small",
        };
        f.write_str(label)
    }
}