use std::path::Path;
use crate::Error;
use super::{
N_HASH_DICT, N_LETTER_GROUPS, N_RULE_GROUP2,
RULE_GROUP_START, RULE_GROUP_END, RULE_REPLACEMENTS, RULE_LETTERGP2,
};
use super::transpose::TransposeConfig;
/// Index tables mapping the leading byte(s) of a word to pronunciation-rule
/// chains inside the dictionary's raw byte buffer.
///
/// All stored offsets are absolute byte offsets into `Dictionary::data`
/// (they already include `rules_offset`), so lookups slice `data` directly.
#[derive(Clone)]
pub struct Groups {
    /// Rule chain per single-byte group name, indexed by that byte.
    /// Entry 0 is the default (empty-name) rule chain.
    pub groups1: [Option<usize>; 256],
    /// Rule chains for two-byte names whose first byte is 1, indexed by
    /// (second byte - 1); see `build_groups` / `Dictionary::group3`.
    pub groups3: [Option<usize>; 128],
    /// Entries for multi-byte group names, in file order (truncated at
    /// N_RULE_GROUP2 entries).
    pub groups2: Vec<Group2Entry>,
    /// Number of `groups2` entries whose name starts with the index byte
    /// (saturating count, so it may exceed what was actually stored).
    pub groups2_count: [u8; 256],
    /// Index into `groups2` of the first entry for the index byte;
    /// 255 is the "no entries" sentinel.
    pub groups2_start: [u8; 256],
    /// Offsets of RULE_LETTERGP2 letter-group rule chains.
    pub letter_groups: [Option<usize>; N_LETTER_GROUPS],
    /// 4-byte-aligned offset of the RULE_REPLACEMENTS table, if present.
    pub replace_chars: Option<usize>,
}
/// One multi-byte-name rule group: `key` packs the first two name bytes
/// (first byte in the low 8 bits, second byte in the high 8 bits — see
/// `build_groups`); `offset` is an absolute byte offset into
/// `Dictionary::data` where the group's rule chain starts.
#[derive(Clone, Copy, Debug)]
pub struct Group2Entry {
    pub key: u16,
    pub offset: usize,
}
impl Default for Groups {
    /// An empty table set: no rule chains registered for any key.
    fn default() -> Self {
        Self {
            groups1: [None; 256],
            groups3: [None; 128],
            groups2: Vec::new(),
            groups2_count: [0u8; 256],
            // 255 marks "no groups2 entries for this leading byte".
            groups2_start: [255u8; 256],
            letter_groups: [None; N_LETTER_GROUPS],
            replace_chars: None,
        }
    }
}
/// A loaded, validated pronunciation dictionary: the raw file image plus
/// the index structures built from it.
pub struct Dictionary {
    /// Entire dictionary file image; all offsets below index into this.
    pub data: Vec<u8>,
    /// Byte offset where the rules section starts (header word 1).
    pub rules_offset: usize,
    /// Offset of each hash bucket's first length-prefixed entry.
    pub hashtab: [usize; N_HASH_DICT],
    /// Rule-group index tables built by `build_groups`.
    pub groups: Groups,
    /// Language code this dictionary was loaded for (e.g. "en").
    pub lang: String,
    /// Script-specific transliteration configuration.
    pub transpose: TransposeConfig,
    /// Code-point base subtracted before indexing `letter_bits`
    /// (0x420 for Cyrillic, 0x600 for Arabic/Persian, 0 otherwise).
    pub letter_bits_offset: u32,
    /// Per-character letter-class bitmask table (see `build_letter_bits`).
    pub letter_bits: Box<[u8; 256]>,
}
impl Dictionary {
pub fn load(lang: &str, data_dir: &Path) -> Result<Self, Error> {
let path = data_dir.join(format!("{}_dict", lang));
let data = std::fs::read(&path)
.map_err(|e| Error::Io(e))?;
Self::from_bytes(lang, data)
}
pub fn from_bytes(lang: &str, data: Vec<u8>) -> Result<Self, Error> {
if data.len() < N_HASH_DICT + 8 {
return Err(Error::InvalidData(
format!("dict '{}': file too short ({} bytes)", lang, data.len())));
}
let pw0 = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
let pw1 = u32::from_le_bytes(data[4..8].try_into().unwrap()) as usize;
if pw0 != N_HASH_DICT {
return Err(Error::InvalidData(
format!("dict '{}': bad magic 0x{:x} (expected 0x{:x})",
lang, pw0, N_HASH_DICT)));
}
if pw1 == 0 || pw1 > 0x800_0000 || pw1 > data.len() {
return Err(Error::InvalidData(
format!("dict '{}': bad rules_offset {}", lang, pw1)));
}
let rules_offset = pw1;
let mut hashtab = [0usize; N_HASH_DICT];
{
let mut pos = 8usize;
for hash in 0..N_HASH_DICT {
hashtab[hash] = pos;
loop {
if pos >= data.len() {
return Err(Error::InvalidData(
format!("dict '{}': hash table overran file at bucket {}", lang, hash)));
}
let entry_len = data[pos] as usize;
if entry_len == 0 { break; }
pos += entry_len;
}
pos += 1; }
}
let groups = build_groups(&data, rules_offset)?;
let (transpose, letter_bits_offset) = match lang {
"ru" | "bg" | "tt" | "uk" | "be" => (TransposeConfig::CYRILLIC, 0x420u32),
"ar" => (TransposeConfig::ARABIC, 0x600u32),
"fa" => (TransposeConfig::PERSIAN, 0x600u32),
_ => (TransposeConfig::LATIN, 0u32),
};
let letter_bits = build_letter_bits(lang);
Ok(Dictionary {
data,
rules_offset,
hashtab,
groups,
lang: lang.to_owned(),
transpose,
letter_bits_offset,
letter_bits,
})
}
#[inline]
pub fn rules(&self) -> &[u8] {
&self.data[self.rules_offset..]
}
#[inline]
pub fn group1(&self, c: u8) -> Option<&[u8]> {
self.groups.groups1[c as usize].map(|off| &self.data[off..])
}
#[inline]
pub fn group2_entries_for(&self, c: u8) -> impl Iterator<Item = &Group2Entry> {
let start = self.groups.groups2_start[c as usize] as usize;
let count = self.groups.groups2_count[c as usize] as usize;
if start >= self.groups.groups2.len() || count == 0 {
self.groups.groups2[0..0].iter()
} else {
let end = (start + count).min(self.groups.groups2.len());
self.groups.groups2[start..end].iter()
}
}
#[inline]
pub fn group2_rules(&self, e: &Group2Entry) -> &[u8] {
&self.data[e.offset..]
}
#[inline]
pub fn group3(&self, c2: u8) -> Option<&[u8]> {
let idx = c2.wrapping_sub(1) as usize;
if idx >= 128 { return None; }
self.groups.groups3[idx].map(|off| &self.data[off..])
}
#[inline]
pub fn letter_group(&self, ix: usize) -> Option<&[u8]> {
if ix >= N_LETTER_GROUPS { return None; }
self.groups.letter_groups[ix].map(|off| &self.data[off..])
}
}
/// Scan the rules section (starting at `rules_offset` within `data`) and
/// build the group index tables.
///
/// The section is a sequence of groups, each introduced by
/// `RULE_GROUP_START` followed by one of:
/// - `RULE_REPLACEMENTS`: a 4-byte-aligned replacement table scanned through
///   to `RULE_GROUP_END`;
/// - `RULE_LETTERGP2` + an index byte: a letter-group chain;
/// - a NUL-terminated group name: 0 bytes = default group, 1 byte =
///   `groups1`, first byte 1 = `groups3`, otherwise a `groups2` entry.
/// Each group's rule chain (NUL-terminated rules) runs to `RULE_GROUP_END`;
/// a 0 byte ends the whole section.
///
/// # Errors
/// `Error::InvalidData` if a group does not start with `RULE_GROUP_START`
/// or the data is truncated mid-group. (The original code indexed past the
/// slice in those cases and panicked, or recorded offsets beyond the file
/// that panicked later in `Dictionary::group1` & co.)
fn build_groups(data: &[u8], rules_offset: usize) -> Result<Groups, Error> {
    let mut g = Groups::default();
    let mut n_groups2 = 0usize;
    let rules = &data[rules_offset..];
    let mut pos = 0usize;
    if rules.is_empty() || rules[pos] == RULE_GROUP_END {
        return Ok(g);
    }
    while pos < rules.len() && rules[pos] != 0 {
        if rules[pos] != RULE_GROUP_START {
            return Err(Error::InvalidData(
                format!("bad rules data: expected RULE_GROUP_START at offset {}", rules_offset + pos)));
        }
        pos += 1;
        // A group marker must be followed by at least one byte.
        if pos >= rules.len() {
            return Err(Error::InvalidData(
                format!("bad rules data: truncated group at offset {}", rules_offset + pos)));
        }
        if rules[pos] == RULE_REPLACEMENTS {
            // The replacement table starts 4 bytes in, rounded up to a
            // 4-byte boundary in absolute file coordinates.
            let abs = rules_offset + pos + 4;
            let aligned = (abs + 3) & !3;
            g.replace_chars = Some(aligned);
            pos = aligned - rules_offset;
            // Skip the table body through its RULE_GROUP_END terminator.
            while pos < rules.len() && rules[pos] != RULE_GROUP_END {
                pos += 1;
            }
            pos += 1;
            continue;
        }
        if rules[pos] == RULE_LETTERGP2 {
            if pos + 1 >= rules.len() {
                return Err(Error::InvalidData(
                    format!("bad rules data: truncated letter group at offset {}", rules_offset + pos)));
            }
            let idx_byte = rules[pos + 1];
            // Index bytes below 'A' wrap into the extended 256.. range.
            let ix = if idx_byte < b'A' {
                (idx_byte as i16 - b'A' as i16 + 256) as usize
            } else {
                (idx_byte - b'A') as usize
            };
            pos += 2;
            if ix < N_LETTER_GROUPS {
                g.letter_groups[ix] = Some(rules_offset + pos);
            }
        } else {
            // NUL-terminated group name.
            let name_start = pos;
            while pos < rules.len() && rules[pos] != 0 {
                pos += 1;
            }
            if pos >= rules.len() {
                // No terminator before end of data: erroring here prevents
                // storing a rule offset past the end of the file.
                return Err(Error::InvalidData(
                    format!("bad rules data: unterminated group name at offset {}", rules_offset + name_start)));
            }
            let name_len = pos - name_start;
            let c = rules[name_start];
            let c2 = if name_len >= 2 { rules[name_start + 1] } else { 0 };
            pos += 1; // skip the NUL terminator
            let rule_abs = rules_offset + pos;
            match name_len {
                // Empty name: the default rule chain.
                0 => { g.groups1[0] = Some(rule_abs); }
                1 => { g.groups1[c as usize] = Some(rule_abs); }
                // First byte 1: groups3, keyed by the second byte.
                _ if c == 1 => {
                    let idx = c2.wrapping_sub(1) as usize;
                    if idx < 128 {
                        g.groups3[idx] = Some(rule_abs);
                    }
                }
                _ => {
                    // First entry for this leading byte records where its
                    // run begins (255 = unset sentinel).
                    if g.groups2_start[c as usize] == 255 {
                        g.groups2_start[c as usize] = n_groups2 as u8;
                    }
                    g.groups2_count[c as usize] =
                        g.groups2_count[c as usize].saturating_add(1);
                    let key = (c as u16) | ((c2 as u16) << 8);
                    if n_groups2 < N_RULE_GROUP2 {
                        g.groups2.push(Group2Entry { key, offset: rule_abs });
                        n_groups2 += 1;
                    }
                }
            }
        }
        // Skip the group's rule chain: NUL-terminated rules until
        // RULE_GROUP_END.
        while pos < rules.len() && rules[pos] != RULE_GROUP_END {
            while pos < rules.len() && rules[pos] != 0 {
                pos += 1;
            }
            pos += 1;
        }
        pos += 1; // skip RULE_GROUP_END
    }
    Ok(g)
}
/// Build the per-character letter-class bitmap for `lang`.
///
/// Each byte of the returned table is a bitmask of letter-group
/// memberships for one character offset. Cyrillic-script languages get a
/// hand-built table (offsets relative to the Cyrillic letter_bits base);
/// every other language shares the English table.
fn build_letter_bits(lang: &str) -> Box<[u8; 256]> {
    let mut bits = Box::new([0u8; 256]);
    match lang {
        "ru" | "bg" | "uk" | "be" | "tt" => {
            const RU_VOWELS: &[usize] = &[0x10, 0x15, 0x31, 0x18, 0x1e, 0x23, 0x2b, 0x2d, 0x2e, 0x2f];
            const CYRL_SOFT: &[usize] = &[0x2c, 0x19, 0x27, 0x29];
            const RU_CONSONANTS: &[usize] = &[
                0x11,0x12,0x13,0x14,0x16,0x17,0x19,0x1a,0x1b,0x1c,
                0x1d,0x1f,0x20,0x21,0x22,0x24,0x25,0x26,0x27,0x28,
                0x29,0x2a,0x2c,
            ];
            const CYRL_HARD: &[usize] = &[0x2a, 0x16, 0x26, 0x28];
            const CYRL_NOTHARD: &[usize] = &[
                0x11,0x12,0x13,0x14,0x17,0x19,0x1a,0x1b,0x1c,0x1d,
                0x1f,0x20,0x21,0x22,0x24,0x25,0x27,0x29,0x2c,
            ];
            const CYRL_VOICED: &[usize] = &[0x11,0x12,0x13,0x14,0x16,0x17];
            const CYRL_IVOWELS: &[usize] = &[0x2c, 0x2e, 0x2f, 0x31, 0x15, 0x18, 0x34, 0x37];

            // (group bit, member offsets); vowels are deliberately set in
            // both group 0 and group 7.
            let classes: [(u8, &[usize]); 8] = [
                (0, RU_VOWELS),
                (1, CYRL_SOFT),
                (2, RU_CONSONANTS),
                (3, CYRL_HARD),
                (4, CYRL_NOTHARD),
                (5, CYRL_VOICED),
                (6, CYRL_IVOWELS),
                (7, RU_VOWELS),
            ];
            for &(group, members) in &classes {
                for &idx in members {
                    if idx < 256 {
                        bits[idx] |= 1 << group;
                    }
                }
            }
        }
        _ => {
            let en = crate::translate::english_letter_bits();
            bits.copy_from_slice(en.as_slice());
        }
    }
    bits
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // System location of installed espeak-ng data; tests are skipped when
    // the relevant dictionary file is not installed.
    fn data_dir() -> PathBuf {
        PathBuf::from("/usr/share/espeak-ng-data")
    }

    // Returns None (test becomes a no-op) when the dictionary file is
    // absent, so these tests pass on machines without espeak-ng data.
    fn try_load(lang: &str) -> Option<Dictionary> {
        let dir = data_dir();
        if !dir.join(format!("{}_dict", lang)).exists() {
            return None;
        }
        Some(Dictionary::load(lang, &dir).expect("load succeeded"))
    }

    #[test]
    fn load_en_dict() {
        let dict = match try_load("en") { Some(d) => d, None => return };
        assert_eq!(dict.lang, "en");
        // NOTE(review): this offset is pinned to one specific installed
        // en_dict build — it will fail on other espeak-ng versions.
        assert_eq!(dict.rules_offset, 0x0001_b188,
            "rules_offset should be 0x1b188 for installed en_dict");
        assert!(dict.groups.groups1[b'a' as usize].is_some(),
            "groups1['a'] should be set");
    }

    #[test]
    fn hash_table_covers_all_buckets() {
        let dict = match try_load("en") { Some(d) => d, None => return };
        // Every bucket offset must point inside the file image.
        for (i, &off) in dict.hashtab.iter().enumerate() {
            assert!(off < dict.data.len(),
                "hashtab[{}] = {} out of bounds (len={})", i, off, dict.data.len());
        }
    }

    #[test]
    fn group1_default_is_some() {
        let dict = match try_load("en") { Some(d) => d, None => return };
        assert!(dict.groups.groups1[0].is_some(),
            "default rule chain (groups1[0]) should be set for English");
    }

    // The following only check that loading does not panic or error.
    #[test]
    fn de_dict_loads() {
        let _ = try_load("de");
    }

    #[test]
    fn fr_dict_loads() {
        let _ = try_load("fr");
    }
}