espeak-ng 0.1.1

Pure Rust port of eSpeak NG text-to-speech
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
//! Dictionary binary file loader.
//!
//! Corresponds to `LoadDictionary` and `InitGroups` in `dictionary.c`.
//!
//! # File layout
//! ```text
//! bytes  0-3              : u32le = N_HASH_DICT (1024) — integrity check
//! bytes  4-7              : u32le = rules_offset
//! bytes  8..rules_offset  : hash buckets (word list)
//! bytes  rules_offset..   : translation rule groups
//! ```

use std::path::Path;

use crate::Error;
use super::{
    N_HASH_DICT, N_LETTER_GROUPS, N_RULE_GROUP2,
    RULE_GROUP_START, RULE_GROUP_END, RULE_REPLACEMENTS, RULE_LETTERGP2,
};
use super::transpose::TransposeConfig;

// ─────────────────────────────────────────────────────────────────────────────
// Rule-group indexing
// ─────────────────────────────────────────────────────────────────────────────

/// All offsets here are **absolute** byte offsets into `Dictionary::data`.
/// A value of `usize::MAX` means "not present" (corresponds to NULL in C).

#[derive(Clone)]
/// Index of all rule groups within a [`Dictionary`].
///
/// Built by `init_groups()` which scans the rules section of the binary file
/// and records the offset to each rule chain, mirroring `InitGroups()` in C.
pub struct Groups {
    /// `groups1[c]` — offset to the rule chain for the single ASCII byte `c`.
    pub groups1: [Option<usize>; 256],

    /// `groups3[c2-1]` — offset to the rule chain for multi-byte alphabet
    /// sequences indexed by the second byte of the `\x01 c2` header.
    pub groups3: [Option<usize>; 128],

    /// Two-letter rule groups, searched in order.
    pub groups2: Vec<Group2Entry>,

    /// For each initial byte `c`: how many `groups2` entries start with `c`.
    pub groups2_count: [u8; 256],

    /// For each initial byte `c`: first index into `groups2` for that byte.
    pub groups2_start: [u8; 256],

    /// `letterGroups[ix]` — offset to the letter-group string list.
    /// Index is `(flag_byte - 'A')` where flag_byte may wrap around 256.
    pub letter_groups: [Option<usize>; N_LETTER_GROUPS],

    /// Absolute offset to the `replace_chars` table inside `data`, if any.
    pub replace_chars: Option<usize>,
}

/// One entry in the two-letter rule group index.
#[derive(Clone, Copy, Debug)]
pub struct Group2Entry {
    /// The two-char key: `c1 | (c2 << 8)` (little-endian uint16).
    pub key: u16,
    /// Absolute offset into `Dictionary::data` of the first rule.
    pub offset: usize,
}

impl Default for Groups {
    fn default() -> Self {
        Groups {
            groups1: [None; 256],
            groups3: [None; 128],
            groups2: Vec::new(),
            groups2_count: [0u8; 256],
            groups2_start: [255u8; 256], // 255 = "not set"
            letter_groups: [None; N_LETTER_GROUPS],
            replace_chars: None,
        }
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Dictionary
// ─────────────────────────────────────────────────────────────────────────────

/// An in-memory view of a `<lang>_dict` file.
///
/// `data` is the raw file bytes.  All indices (`hashtab`, `groups.*`) are
/// **absolute byte offsets into `data`**.
pub struct Dictionary {
    pub data: Vec<u8>,

    /// Absolute byte offset to the translation-rules section.
    pub rules_offset: usize,

    /// `hashtab[hash]` — absolute offset to the start of bucket `hash`.
    pub hashtab: [usize; N_HASH_DICT],

    /// Rule-group index built by `init_groups()`.
    pub groups: Groups,

    /// BCP-47 language tag this dictionary was loaded for (e.g. `"en"`).
    pub lang: String,

    /// Compression configuration for word hashing and lookup.
    /// Most Latin-script languages use `TransposeConfig::LATIN`.
    pub transpose: TransposeConfig,

    /// Base Unicode codepoint for `groups3` indexing.
    ///
    /// Mirrors `Translator::letter_bits_offset` in C's `tr_languages.c`.
    /// When non-zero, `groups3[wc - letter_bits_offset]` maps a Unicode
    /// codepoint to its rule chain.
    ///
    /// | Language family | Value |
    /// |-----------------|-------|
    /// | Latin (default) | 0 |
    /// | Cyrillic (ru, bg, tt, uk, be) | 0x420 |
    /// | Arabic (ar) | 0x600 |
    /// | Farsi/Persian (fa) | 0x600 |
    pub letter_bits_offset: u32,

    /// Per-character letter-group bitmask table.
    ///
    /// Indexed by `(codepoint - letter_bits_offset) & 0x7f` (clamped to 128).
    /// Bit N is set iff the character belongs to letter group N
    /// (LETTERGP_A=0, LETTERGP_B=1, LETTERGP_C=2, LETTERGP_H=3,
    ///  LETTERGP_F=4, LETTERGP_G=5, LETTERGP_Y=6, LETTERGP_VOWEL2=7).
    ///
    /// For Latin scripts, the table uses direct ASCII indices (offset=0).
    pub letter_bits: Box<[u8; 256]>,
}

impl Dictionary {
    /// Load `<data_dir>/<lang>_dict`.
    pub fn load(lang: &str, data_dir: &Path) -> Result<Self, Error> {
        let path = data_dir.join(format!("{}_dict", lang));
        let data = std::fs::read(&path)
            .map_err(|e| Error::Io(e))?;

        Self::from_bytes(lang, data)
    }

    /// Parse an already-loaded byte buffer.
    pub fn from_bytes(lang: &str, data: Vec<u8>) -> Result<Self, Error> {
        // Header validation
        if data.len() < N_HASH_DICT + 8 {
            return Err(Error::InvalidData(
                format!("dict '{}': file too short ({} bytes)", lang, data.len())));
        }

        let pw0 = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
        let pw1 = u32::from_le_bytes(data[4..8].try_into().unwrap()) as usize;

        if pw0 != N_HASH_DICT {
            return Err(Error::InvalidData(
                format!("dict '{}': bad magic 0x{:x} (expected 0x{:x})",
                    lang, pw0, N_HASH_DICT)));
        }
        if pw1 == 0 || pw1 > 0x800_0000 || pw1 > data.len() {
            return Err(Error::InvalidData(
                format!("dict '{}': bad rules_offset {}", lang, pw1)));
        }
        let rules_offset = pw1;

        // Build hash table.
        // C: p = &data_dictlist[8]; for each hash: record p, skip entries (length != 0), skip 0-terminator.
        let mut hashtab = [0usize; N_HASH_DICT];
        {
            let mut pos = 8usize;
            for hash in 0..N_HASH_DICT {
                hashtab[hash] = pos;
                // Skip entries in this bucket (each starts with its total length).
                loop {
                    if pos >= data.len() {
                        return Err(Error::InvalidData(
                            format!("dict '{}': hash table overran file at bucket {}", lang, hash)));
                    }
                    let entry_len = data[pos] as usize;
                    if entry_len == 0 { break; }
                    pos += entry_len;
                }
                pos += 1; // skip the 0-terminator
            }
        }

        // Parse rule groups.
        let groups = build_groups(&data, rules_offset)?;

        // Per-script configuration — mirrors `SelectTranslator()` in `tr_languages.c`.
        //
        // transpose:          controls dictionary word hashing/lookup compression
        // letter_bits_offset: base codepoint for groups3 rule indexing
        //
        // OFFSET_CYRILLIC = 0x420, OFFSET_ARABIC = 0x600 (from tr_languages.c)
        let (transpose, letter_bits_offset) = match lang {
            // Cyrillic-script languages: SetCyrillicLetters() → transpose_min=0x430,
            // letter_bits_offset=0x420 (OFFSET_CYRILLIC)
            "ru" | "bg" | "tt" | "uk" | "be" => (TransposeConfig::CYRILLIC, 0x420u32),
            // Arabic script (ar): transpose_min=0x600, letter_bits_offset=0x600
            "ar" => (TransposeConfig::ARABIC, 0x600u32),
            // Farsi/Persian: letter_bits_offset=0x600 (OFFSET_ARABIC)
            "fa" => (TransposeConfig::PERSIAN, 0x600u32),
            // All other languages: default Latin-script compression, no offset
            _ => (TransposeConfig::LATIN, 0u32),
        };

        let letter_bits = build_letter_bits(lang);

        Ok(Dictionary {
            data,
            rules_offset,
            hashtab,
            groups,
            lang: lang.to_owned(),
            transpose,
            letter_bits_offset,
            letter_bits,
        })
    }

    /// Return the slice containing all translation rules.
    #[inline]
    pub fn rules(&self) -> &[u8] {
        &self.data[self.rules_offset..]
    }

    /// Return the translation rule chain for a single ASCII byte `c` (groups1),
    /// as a slice starting at the first rule.
    #[inline]
    pub fn group1(&self, c: u8) -> Option<&[u8]> {
        self.groups.groups1[c as usize].map(|off| &self.data[off..])
    }

    /// Same for the two-byte chain.
    /// Returns `(matching_slice, advance_bytes)` for the best-matching group2 entry.
    #[inline]
    pub fn group2_entries_for(&self, c: u8) -> impl Iterator<Item = &Group2Entry> {
        let start = self.groups.groups2_start[c as usize] as usize;
        let count = self.groups.groups2_count[c as usize] as usize;
        if start >= self.groups.groups2.len() || count == 0 {
            self.groups.groups2[0..0].iter()
        } else {
            let end = (start + count).min(self.groups.groups2.len());
            self.groups.groups2[start..end].iter()
        }
    }

    /// Rule slice for a groups2 entry.
    #[inline]
    pub fn group2_rules(&self, e: &Group2Entry) -> &[u8] {
        &self.data[e.offset..]
    }

    /// Rule slice for a groups3 entry.
    #[inline]
    pub fn group3(&self, c2: u8) -> Option<&[u8]> {
        let idx = c2.wrapping_sub(1) as usize;
        if idx >= 128 { return None; }
        self.groups.groups3[idx].map(|off| &self.data[off..])
    }

    /// Letter-group string list for `letterGroups[ix]`.
    #[inline]
    pub fn letter_group(&self, ix: usize) -> Option<&[u8]> {
        if ix >= N_LETTER_GROUPS { return None; }
        self.groups.letter_groups[ix].map(|off| &self.data[off..])
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// InitGroups — build the rule-group index
// ─────────────────────────────────────────────────────────────────────────────

fn build_groups(data: &[u8], rules_offset: usize) -> Result<Groups, Error> {
    let mut g = Groups::default();
    let mut n_groups2 = 0usize;

    let rules = &data[rules_offset..];
    let mut pos = 0usize; // offset into `rules` (relative to rules_offset)

    // If there are no rules at all, the file just starts with RULE_GROUP_END.
    if rules.is_empty() || rules[pos] == RULE_GROUP_END {
        return Ok(g);
    }

    while pos < rules.len() && rules[pos] != 0 {
        if rules[pos] != RULE_GROUP_START {
            return Err(Error::InvalidData(
                format!("bad rules data: expected RULE_GROUP_START at offset {}", rules_offset + pos)));
        }
        pos += 1; // skip RULE_GROUP_START

        // Check for special group types
        if rules[pos] == RULE_REPLACEMENTS {
            // Advance to next 4-byte-aligned position (relative to data start)
            let abs = rules_offset + pos + 4;
            let aligned = (abs + 3) & !3;
            // replace_chars table starts there
            g.replace_chars = Some(aligned);
            // skip forward until RULE_GROUP_END
            pos = aligned - rules_offset;
            while pos < rules.len() && rules[pos] != RULE_GROUP_END {
                pos += 1;
            }
            pos += 1; // skip RULE_GROUP_END
            continue;
        }

        if rules[pos] == RULE_LETTERGP2 {
            // p[0] = RULE_LETTERGP2, p[1] = group-index byte
            let idx_byte = rules[pos + 1];
            let ix = if idx_byte < b'A' {
                // negative wrap-around: (idx_byte - 'A') mod 256 as signed
                (idx_byte as i16 - b'A' as i16 + 256) as usize
            } else {
                (idx_byte - b'A') as usize
            };
            pos += 2;
            if ix < N_LETTER_GROUPS {
                g.letter_groups[ix] = Some(rules_offset + pos);
            }
        } else {
            // Regular group: name string followed by \0
            let name_start = pos;
            while pos < rules.len() && rules[pos] != 0 {
                pos += 1;
            }
            let name_len = pos - name_start;
            let c  = rules[name_start];       // first byte of group name
            let c2 = if name_len >= 2 { rules[name_start + 1] } else { 0 };
            pos += 1; // skip the \0

            // abs offset of first rule for this group
            let rule_abs = rules_offset + pos;

            match name_len {
                0 => { g.groups1[0] = Some(rule_abs); }
                1 => { g.groups1[c as usize] = Some(rule_abs); }
                _ if c == 1 => {
                    // groups3 indexed by c2-1
                    let idx = c2.wrapping_sub(1) as usize;
                    if idx < 128 {
                        g.groups3[idx] = Some(rule_abs);
                    }
                }
                _ => {
                    // Two-letter group
                    if g.groups2_start[c as usize] == 255 {
                        g.groups2_start[c as usize] = n_groups2 as u8;
                    }
                    g.groups2_count[c as usize] =
                        g.groups2_count[c as usize].saturating_add(1);
                    let key = (c as u16) | ((c2 as u16) << 8);
                    if n_groups2 < N_RULE_GROUP2 {
                        g.groups2.push(Group2Entry { key, offset: rule_abs });
                        n_groups2 += 1;
                    }
                }
            }
        }

        // Skip over all rules in this group until RULE_GROUP_END
        // Each rule is a null-terminated string.
        while pos < rules.len() && rules[pos] != RULE_GROUP_END {
            while pos < rules.len() && rules[pos] != 0 {
                pos += 1;
            }
            pos += 1; // skip the \0 terminator of this rule
        }
        pos += 1; // skip RULE_GROUP_END
    }

    Ok(g)
}

// ─────────────────────────────────────────────────────────────────────────────
// Per-language letter_bits table builder
// ─────────────────────────────────────────────────────────────────────────────

/// Build the letter group bitmask table for `lang`.
///
/// The table is indexed by `(codepoint - letter_bits_offset) & 0x7f`.
/// Bit N = 1 iff the character belongs to letter group N:
///   0=A(vowel)  1=B(soft)  2=C(consonant)  3=H(hard)
///   4=F(not-hard)  5=G(voiced)  6=Y(iotated/front)  7=VOWEL2
///
/// Letter group data mirrors `SetCyrillicLetters()` / `SetLetterBits()` in
/// `tr_languages.c`.
fn build_letter_bits(lang: &str) -> Box<[u8; 256]> {
    let mut bits = Box::new([0u8; 256]);

    match lang {
        "ru" | "bg" | "uk" | "be" | "tt" => {
            // Cyrillic letter groups — `letter_bits_offset = OFFSET_CYRILLIC = 0x420`
            // Indices are (codepoint - 0x420); stored at that index in bits[].

            // LETTERGP_A (0) = vowels: а е ё и о у ы э ю я (0x10,0x15,0x31,0x18,0x1e,0x23,0x2b,0x2d,0x2e,0x2f)
            const RU_VOWELS: &[usize] = &[0x10, 0x15, 0x31, 0x18, 0x1e, 0x23, 0x2b, 0x2d, 0x2e, 0x2f];
            // LETTERGP_B (1) = soft consonants: ь й ч щ (0x2c,0x19,0x27,0x29)
            const CYRL_SOFT: &[usize] = &[0x2c, 0x19, 0x27, 0x29];
            // LETTERGP_C (2) = consonants
            const RU_CONSONANTS: &[usize] = &[
                0x11,0x12,0x13,0x14,0x16,0x17,0x19,0x1a,0x1b,0x1c,
                0x1d,0x1f,0x20,0x21,0x22,0x24,0x25,0x26,0x27,0x28,
                0x29,0x2a,0x2c,
            ];
            // LETTERGP_H (3) = hard consonants: ъ ж ц ш (0x2a,0x16,0x26,0x28)
            const CYRL_HARD: &[usize] = &[0x2a, 0x16, 0x26, 0x28];
            // LETTERGP_F (4) = not-hard
            const CYRL_NOTHARD: &[usize] = &[
                0x11,0x12,0x13,0x14,0x17,0x19,0x1a,0x1b,0x1c,0x1d,
                0x1f,0x20,0x21,0x22,0x24,0x25,0x27,0x29,0x2c,
            ];
            // LETTERGP_G (5) = voiced obstruents: б в г д ж з (0x11-0x14,0x16-0x17)
            const CYRL_VOICED: &[usize] = &[0x11,0x12,0x13,0x14,0x16,0x17];
            // LETTERGP_Y (6) = iotated vowels + soft sign
            //   SetCyrillicLetters: ь ю я ё (0x2c,0x2e,0x2f,0x31)
            //   Translator_Russian adds: е и є ї (0x15,0x18,0x34,0x37)
            const CYRL_IVOWELS: &[usize] = &[0x2c, 0x2e, 0x2f, 0x31, 0x15, 0x18, 0x34, 0x37];

            fn set_bits(bits: &mut [u8; 256], indices: &[usize], group: u8) {
                for &idx in indices {
                    if idx < 256 { bits[idx] |= 1 << group; }
                }
            }

            set_bits(&mut bits, RU_VOWELS,     0); // LETTERGP_A
            set_bits(&mut bits, CYRL_SOFT,     1); // LETTERGP_B
            set_bits(&mut bits, RU_CONSONANTS, 2); // LETTERGP_C
            set_bits(&mut bits, CYRL_HARD,     3); // LETTERGP_H
            set_bits(&mut bits, CYRL_NOTHARD,  4); // LETTERGP_F
            set_bits(&mut bits, CYRL_VOICED,   5); // LETTERGP_G
            set_bits(&mut bits, CYRL_IVOWELS,  6); // LETTERGP_Y
            set_bits(&mut bits, RU_VOWELS,     7); // LETTERGP_VOWEL2
        }
        _ => {
            // Latin/default: use the English letter bits
            let en = crate::translate::english_letter_bits();
            bits.copy_from_slice(en.as_slice());
        }
    }

    bits
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    fn data_dir() -> PathBuf {
        PathBuf::from("/usr/share/espeak-ng-data")
    }

    fn try_load(lang: &str) -> Option<Dictionary> {
        let dir = data_dir();
        if !dir.join(format!("{}_dict", lang)).exists() {
            return None;
        }
        Some(Dictionary::load(lang, &dir).expect("load succeeded"))
    }

    #[test]
    fn load_en_dict() {
        let dict = match try_load("en") { Some(d) => d, None => return };
        assert_eq!(dict.lang, "en");
        assert_eq!(dict.rules_offset, 0x0001_b188,
            "rules_offset should be 0x1b188 for installed en_dict");
        // groups1['a'] must be non-null (English has 'a' rules)
        assert!(dict.groups.groups1[b'a' as usize].is_some(),
            "groups1['a'] should be set");
    }

    #[test]
    fn hash_table_covers_all_buckets() {
        let dict = match try_load("en") { Some(d) => d, None => return };
        // Every hash table entry must point somewhere inside the data.
        for (i, &off) in dict.hashtab.iter().enumerate() {
            assert!(off < dict.data.len(),
                "hashtab[{}] = {} out of bounds (len={})", i, off, dict.data.len());
        }
    }

    #[test]
    fn group1_default_is_some() {
        let dict = match try_load("en") { Some(d) => d, None => return };
        // groups1[0] is the default rule chain; it should be present in English
        assert!(dict.groups.groups1[0].is_some(),
            "default rule chain (groups1[0]) should be set for English");
    }

    #[test]
    fn de_dict_loads() {
        let _ = try_load("de"); // just check it doesn't panic
    }

    #[test]
    fn fr_dict_loads() {
        let _ = try_load("fr");
    }
}