pub mod coverage;
#[cfg(feature = "varna")]
pub mod detect;
pub mod entry;
pub mod format;
pub mod g2p;
pub mod heteronym;
#[cfg(feature = "mmap")]
pub mod lazy;
pub mod morphology;
pub mod static_dict;
pub mod stream;
pub mod syllable;
pub mod trie;
#[cfg(feature = "varna")]
pub mod validate;
use alloc::{collections::BTreeMap, string::String, vec::Vec};
use hashbrown::HashMap;
use serde::{Deserialize, Serialize};
use svara::phoneme::Phoneme;
use entry::{DictEntry, Pronunciation};
include!(concat!(env!("OUT_DIR"), "/generated_dict.rs"));
/// A pronunciation dictionary mapping words to their dictionary entries.
///
/// The dictionary has two layers: the base `entries` and a `user_entries`
/// overlay. `lookup_entry` consults the overlay first and only falls back to
/// the base entries when the overlay has no match.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PronunciationDict {
// Base entries keyed by word. The `insert*` methods store keys lowercased.
// Deserialization goes through a compatibility shim that accepts either a
// full `DictEntry` or a bare phoneme list per word.
#[serde(deserialize_with = "deserialize_entries_compat")]
entries: HashMap<String, DictEntry>,
// User overlay; takes priority over `entries` during lookup. Omitted from
// serialized output when empty and defaults to empty when absent on input.
#[serde(
default,
skip_serializing_if = "BTreeMap::is_empty",
deserialize_with = "deserialize_user_entries_compat"
)]
user_entries: BTreeMap<String, DictEntry>,
// Optional language code for this dictionary (e.g. "en"). Omitted from
// serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
language: Option<String>,
}
impl PronunciationDict {
#[must_use]
pub fn new() -> Self {
Self {
entries: HashMap::new(),
user_entries: BTreeMap::new(),
language: None,
}
}
#[must_use]
pub fn english() -> Self {
Self {
entries: generated_english_entries(),
user_entries: BTreeMap::new(),
language: Some(alloc::string::ToString::to_string("en")),
}
}
#[must_use]
pub fn english_minimal() -> Self {
let mut dict = Self::new();
dict.language = Some(alloc::string::ToString::to_string("en"));
dict.insert("the", &[Phoneme::FricativeDh, Phoneme::VowelSchwa]);
dict.insert("a", &[Phoneme::VowelSchwa]);
dict.insert("an", &[Phoneme::VowelSchwa, Phoneme::NasalN]);
dict.insert("i", &[Phoneme::DiphthongAI]);
dict.insert("is", &[Phoneme::VowelNearI, Phoneme::FricativeZ]);
dict.insert(
"was",
&[
Phoneme::ApproximantW,
Phoneme::VowelOpenO,
Phoneme::FricativeZ,
],
);
dict.insert("are", &[Phoneme::VowelOpenA, Phoneme::ApproximantR]);
dict.insert("to", &[Phoneme::PlosiveT, Phoneme::VowelU]);
dict.insert("of", &[Phoneme::VowelOpenO, Phoneme::FricativeV]);
dict.insert("in", &[Phoneme::VowelNearI, Phoneme::NasalN]);
dict.insert("it", &[Phoneme::VowelNearI, Phoneme::PlosiveT]);
dict.insert(
"and",
&[Phoneme::VowelAsh, Phoneme::NasalN, Phoneme::PlosiveD],
);
dict.insert(
"that",
&[Phoneme::FricativeDh, Phoneme::VowelAsh, Phoneme::PlosiveT],
);
dict.insert(
"for",
&[
Phoneme::FricativeF,
Phoneme::VowelOpenO,
Phoneme::ApproximantR,
],
);
dict.insert("you", &[Phoneme::ApproximantJ, Phoneme::VowelU]);
dict.insert("he", &[Phoneme::FricativeH, Phoneme::VowelE]);
dict.insert("she", &[Phoneme::FricativeSh, Phoneme::VowelE]);
dict.insert("we", &[Phoneme::ApproximantW, Phoneme::VowelE]);
dict.insert("they", &[Phoneme::FricativeDh, Phoneme::DiphthongEI]);
dict.insert(
"this",
&[
Phoneme::FricativeDh,
Phoneme::VowelNearI,
Phoneme::FricativeS,
],
);
dict.insert(
"with",
&[
Phoneme::ApproximantW,
Phoneme::VowelNearI,
Phoneme::FricativeTh,
],
);
dict.insert(
"not",
&[Phoneme::NasalN, Phoneme::VowelOpenO, Phoneme::PlosiveT],
);
dict.insert(
"but",
&[Phoneme::PlosiveB, Phoneme::VowelCupV, Phoneme::PlosiveT],
);
dict.insert(
"have",
&[Phoneme::FricativeH, Phoneme::VowelAsh, Phoneme::FricativeV],
);
dict.insert(
"one",
&[Phoneme::ApproximantW, Phoneme::VowelCupV, Phoneme::NasalN],
);
dict.insert(
"hello",
&[
Phoneme::FricativeH,
Phoneme::VowelOpenE,
Phoneme::LateralL,
Phoneme::DiphthongOU,
],
);
dict.insert(
"world",
&[
Phoneme::ApproximantW,
Phoneme::VowelBird,
Phoneme::LateralL,
Phoneme::PlosiveD,
],
);
dict.insert(
"yes",
&[
Phoneme::ApproximantJ,
Phoneme::VowelOpenE,
Phoneme::FricativeS,
],
);
dict.insert("no", &[Phoneme::NasalN, Phoneme::DiphthongOU]);
dict
}
#[must_use]
pub fn from_entries(entries: HashMap<String, DictEntry>) -> Self {
Self {
entries,
user_entries: BTreeMap::new(),
language: None,
}
}
#[must_use]
pub fn from_simple_entries(entries: HashMap<String, Vec<Phoneme>>) -> Self {
let entries = entries
.into_iter()
.map(|(word, phonemes)| (word, DictEntry::from_phonemes(&phonemes)))
.collect();
Self {
entries,
user_entries: BTreeMap::new(),
language: None,
}
}
pub fn insert(&mut self, word: &str, phonemes: &[Phoneme]) {
self.entries.insert(
alloc::string::ToString::to_string(&word.to_lowercase()),
DictEntry::from_phonemes(phonemes),
);
}
pub fn insert_entry(&mut self, word: &str, entry: DictEntry) {
self.entries.insert(
alloc::string::ToString::to_string(&word.to_lowercase()),
entry,
);
}
pub fn insert_user(&mut self, word: &str, phonemes: &[Phoneme]) {
self.user_entries.insert(
alloc::string::ToString::to_string(&word.to_lowercase()),
DictEntry::from_phonemes(phonemes),
);
}
pub fn insert_user_entry(&mut self, word: &str, entry: DictEntry) {
self.user_entries.insert(
alloc::string::ToString::to_string(&word.to_lowercase()),
entry,
);
}
pub fn remove_user(&mut self, word: &str) -> bool {
self.user_entries
.remove(&alloc::string::ToString::to_string(&word.to_lowercase()))
.is_some()
}
#[must_use]
pub fn language(&self) -> Option<&str> {
self.language.as_deref()
}
pub fn set_language(&mut self, code: &str) {
self.language = Some(alloc::string::ToString::to_string(code));
}
#[must_use]
pub fn with_language(mut self, code: &str) -> Self {
self.set_language(code);
self
}
#[must_use]
pub fn user_entries(&self) -> &BTreeMap<String, DictEntry> {
&self.user_entries
}
#[must_use]
pub fn user_len(&self) -> usize {
self.user_entries.len()
}
#[must_use]
pub fn lookup(&self, word: &str) -> Option<&[Phoneme]> {
self.lookup_entry(word)
.map(|entry| entry.primary_phonemes())
}
#[must_use]
pub fn lookup_entry(&self, word: &str) -> Option<&DictEntry> {
if word.bytes().all(|b| !b.is_ascii_uppercase()) {
self.user_entries
.get(word)
.or_else(|| self.entries.get(word))
} else {
let key = alloc::string::ToString::to_string(&word.to_lowercase());
self.user_entries
.get(&key)
.or_else(|| self.entries.get(&key))
}
}
#[must_use]
pub fn lookup_all(&self, word: &str) -> Option<&[Pronunciation]> {
self.lookup_entry(word).map(|entry| entry.all())
}
#[must_use]
pub fn len(&self) -> usize {
self.entries.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
#[must_use]
pub fn entries(&self) -> &HashMap<String, DictEntry> {
&self.entries
}
pub fn merge(&mut self, other: &PronunciationDict) {
for (word, entry) in other.entries() {
self.entries.insert(word.clone(), entry.clone());
}
for (word, entry) in other.user_entries() {
self.user_entries.insert(word.clone(), entry.clone());
}
}
pub fn merge_conservative(&mut self, other: &PronunciationDict) {
for (word, entry) in other.entries() {
if !self.entries.contains_key(word) {
self.entries.insert(word.clone(), entry.clone());
}
}
for (word, entry) in other.user_entries() {
if !self.user_entries.contains_key(word) {
self.user_entries.insert(word.clone(), entry.clone());
}
}
}
#[must_use]
pub fn with_fallback<M: g2p::G2PModel>(self, model: M) -> g2p::FallbackDict<M> {
g2p::FallbackDict::new(self, model)
}
#[must_use]
pub fn prefix_search(&self, prefix: &str) -> Vec<alloc::string::String> {
trie::PrefixTrie::from_dict(self).search_prefix(prefix)
}
#[cfg(feature = "varna")]
#[must_use]
pub fn validate(&self) -> Option<validate::ValidationReport> {
let code = self.language.as_deref()?;
let inventory = varna::registry::phonemes(code)?;
Some(validate::validate_inventory(self, &inventory))
}
#[cfg(feature = "varna")]
#[must_use]
pub fn validate_phonotactics(&self) -> Option<validate::PhonotacticReport> {
let code = self.language.as_deref()?;
let phonotactics = match code {
"en" => Some(varna::phoneme::syllable::english_phonotactics()),
"sa" => Some(varna::phoneme::syllable::sanskrit_phonotactics()),
"ja" => Some(varna::phoneme::syllable::japanese_phonotactics()),
_ => None,
}?;
Some(validate::validate_phonotactics(self, &phonotactics))
}
#[cfg(feature = "varna")]
#[must_use]
pub fn from_lexicon(lexicon: &varna::lexicon::Lexicon) -> Self {
let mut dict = Self::new();
dict.language = Some(alloc::string::ToString::to_string(
lexicon.language_code.as_ref(),
));
for entry in &lexicon.entries {
let phonemes = crate::ipa::parse_ipa_word(entry.ipa.as_ref());
if phonemes.is_empty() {
continue;
}
let frequency = entry.frequency_rank.map(|rank| {
1.0 / (1.0 + rank as f32)
});
let mut pronunciation = Pronunciation::new(phonemes);
if let Some(freq) = frequency {
pronunciation = pronunciation.with_frequency(freq);
}
let key = alloc::string::ToString::to_string(&entry.word.to_lowercase());
dict.entries
.insert(key, entry::DictEntry::new(pronunciation));
}
dict
}
#[cfg(feature = "varna")]
#[must_use]
pub fn spanish() -> Self {
Self::from_lexicon(&varna::lexicon::swadesh::by_code("es").unwrap_or_else(|| {
varna::lexicon::Lexicon {
language_code: "es".into(),
entries: alloc::vec![],
}
}))
}
#[cfg(feature = "varna")]
#[must_use]
pub fn hindi() -> Self {
Self::from_lexicon(&varna::lexicon::swadesh::by_code("hi").unwrap_or_else(|| {
varna::lexicon::Lexicon {
language_code: "hi".into(),
entries: alloc::vec![],
}
}))
}
#[cfg(feature = "varna")]
#[must_use]
pub fn german() -> Self {
Self::from_lexicon(&varna::lexicon::swadesh::by_code("de").unwrap_or_else(|| {
varna::lexicon::Lexicon {
language_code: "de".into(),
entries: alloc::vec![],
}
}))
}
#[cfg(feature = "varna")]
#[must_use]
pub fn sanskrit() -> Self {
Self::new().with_language("sa")
}
}
/// The word-level difference between two dictionaries, as produced by `diff`.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct DictDiff {
/// Words that resolve only in the right-hand dictionary.
pub added: Vec<String>,
/// Words that resolve only in the left-hand dictionary.
pub removed: Vec<String>,
/// Words that resolve in both dictionaries but to unequal entries.
pub changed: Vec<String>,
}
impl DictDiff {
    /// Returns `true` when no word was added, removed or changed.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        [&self.added, &self.removed, &self.changed]
            .iter()
            .all(|words| words.is_empty())
    }

    /// Returns the total number of added, removed and changed words.
    #[must_use]
    pub fn len(&self) -> usize {
        [&self.added, &self.removed, &self.changed]
            .iter()
            .map(|words| words.len())
            .sum()
    }
}
/// Compares two dictionaries word by word.
///
/// Every key from the base and user entries of both dictionaries is resolved
/// through `lookup_entry` on each side. A word that resolves only on the right
/// is reported as added, only on the left as removed, and on both sides with
/// unequal entries as changed. Words are visited in sorted order.
#[must_use]
pub fn diff(left: &PronunciationDict, right: &PronunciationDict) -> DictDiff {
    // Sorted, de-duplicated union of every key on either side.
    let mut words: alloc::collections::BTreeSet<&str> = alloc::collections::BTreeSet::new();
    words.extend(left.entries().keys().map(|w| w.as_str()));
    words.extend(left.user_entries().keys().map(|w| w.as_str()));
    words.extend(right.entries().keys().map(|w| w.as_str()));
    words.extend(right.user_entries().keys().map(|w| w.as_str()));

    let mut report = DictDiff::default();
    for word in words {
        // Pick the list this word belongs to; identical entries go nowhere.
        let bucket = match (left.lookup_entry(word), right.lookup_entry(word)) {
            (None, Some(_)) => &mut report.added,
            (Some(_), None) => &mut report.removed,
            (Some(before), Some(after)) if before != after => &mut report.changed,
            _ => continue,
        };
        bucket.push(String::from(word));
    }
    report
}
impl Default for PronunciationDict {
fn default() -> Self {
Self::new()
}
}
/// Deserialization shim that accepts either representation of a dictionary
/// value. Being `untagged`, serde tries the variants in declaration order.
#[derive(Deserialize)]
#[serde(untagged)]
enum EntryCompat {
// A full `DictEntry`.
New(DictEntry),
// A bare phoneme list, converted via `DictEntry::from_phonemes`.
Old(Vec<Phoneme>),
}
impl EntryCompat {
    /// Converts either representation into a `DictEntry`: a bare phoneme list
    /// is wrapped with `DictEntry::from_phonemes`, a full entry is returned
    /// unchanged.
    fn into_entry(self) -> DictEntry {
        match self {
            Self::Old(legacy) => DictEntry::from_phonemes(legacy.as_slice()),
            Self::New(current) => current,
        }
    }
}
fn deserialize_entries_compat<'de, D>(
deserializer: D,
) -> core::result::Result<HashMap<String, DictEntry>, D::Error>
where
D: serde::Deserializer<'de>,
{
let raw: BTreeMap<String, EntryCompat> = BTreeMap::deserialize(deserializer)?;
Ok(raw.into_iter().map(|(k, v)| (k, v.into_entry())).collect())
}
/// Serde `deserialize_with` helper for the user overlay.
///
/// Reads a map whose values may be either full entries or bare phoneme lists
/// (see `EntryCompat`) and converts every value to a `DictEntry`.
fn deserialize_user_entries_compat<'de, D>(
    deserializer: D,
) -> core::result::Result<BTreeMap<String, DictEntry>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    let parsed = BTreeMap::<String, EntryCompat>::deserialize(deserializer)?;
    let mut converted = BTreeMap::new();
    for (word, compat) in parsed {
        converted.insert(word, compat.into_entry());
    }
    Ok(converted)
}