use super::file::Dictionary;
use super::flags::{DictFlags1, DictFlags2};
use super::N_WORD_BYTES;
use super::transpose::transpose_alphabet;
/// Computes the 10-bit dictionary hash of `word`.
///
/// Bytes are consumed up to (not including) the first NUL byte, or to the end
/// of the slice when there is none. Each byte is folded into the running
/// value as `hash * 8 + byte`, which is then reduced with
/// `(hash & 0x3ff) ^ (hash >> 8)`. The number of bytes consumed is added at
/// the end and the sum is masked to 10 bits, so the result is always in
/// `0..1024`.
pub fn hash_word(word: &[u8]) -> usize {
    let (mixed, consumed) = word
        .iter()
        .take_while(|&&byte| byte != 0)
        .fold((0u32, 0u32), |(acc, seen), &byte| {
            let widened = acc.wrapping_mul(8).wrapping_add(byte as u32);
            ((widened & 0x3ff) ^ (widened >> 8), seen + 1)
        });
    ((mixed + consumed) & 0x3ff) as usize
}
/// Outcome of a successful dictionary lookup, as produced by `lookup_dict2`.
#[derive(Debug, Default, Clone)]
pub struct LookupResult {
    /// Phoneme bytes copied from the matching entry; empty when the entry
    /// carries no phoneme string.
    pub phonemes: Vec<u8>,
    /// First flag word decoded from the entry's flag bytes, with
    /// `FLAG_FOUND_ATTRIBUTES` (and `FLAG_FOUND` when `phonemes` is
    /// non-empty) added by the lookup.
    pub flags1: DictFlags1,
    /// Second flag word decoded from the entry's flag bytes.
    pub flags2: DictFlags2,
    /// Number of following words covered by the entry. `lookup_dict2`
    /// currently rejects every multi-word entry, so it only ever returns 0
    /// here.
    pub skipwords: usize,
}
/// Per-word context consulted by `lookup_dict2` when deciding whether a
/// dictionary entry may be used.
///
/// `Default` yields all-zero masks and all-`false` switches.
#[derive(Debug, Default, Clone, Copy)]
pub struct LookupCtx {
    /// Bitmask of active dictionary conditions. Flag bytes `100..=131`
    /// reject the entry when bit `flag - 100` is clear; flag bytes `132..`
    /// reject it when bit `flag - 132` is set.
    pub dict_condition: u32,
    /// Flags of the word being looked up; tested against
    /// `FLAG_FIRST_UPPER`, `FLAG_ALL_UPPER` and `FLAG_HAS_DOT`.
    pub word_flags: u32,
    /// Affix flags for the word; tested against `FLAG_SUFX`, `FLAG_SUFX_S`,
    /// `SUFX_P` and `SUFX_V`.
    pub end_flags: u32,
    /// When `true`, entries flagged `ATEND` are accepted even if
    /// `at_clause_end` is `false`.
    pub lookup_symbol: bool,
    /// Allows entries flagged as verb forms.
    pub expect_verb: bool,
    /// Allows entries flagged as past forms.
    pub expect_past: bool,
    /// Allows verb-form entries when `end_flags` also has `FLAG_SUFX_S` set.
    pub expect_verb_s: bool,
    /// Allows entries flagged as noun forms (unless `SUFX_V` is set in
    /// `end_flags`).
    pub expect_noun: bool,
    /// When `true`, `FLAG_TEXTMODE` is toggled in the returned `flags1`.
    pub textmode_lang: bool,
    /// Clause terminator value; entries flagged `SENTENCE` require the
    /// `CLAUSE_TYPE_SENTENCE` bit to be set in it.
    pub clause_terminator: u32,
    /// Required (or `lookup_symbol`) for entries flagged `ATEND`.
    pub at_clause_end: bool,
    /// Required for entries flagged `ATSTART`.
    pub is_first_word: bool,
}
/// Bit tested in `LookupCtx::clause_terminator` by `lookup_dict2`: entries
/// flagged `DictFlags2::SENTENCE` are only accepted when this bit is set.
// NOTE(review): upstream espeak-ng's `translate.h` appears to define
// `CLAUSE_TYPE_SENTENCE` as 0x0008_0000 (with 0x8000 being
// `CLAUSE_OPTIONAL_SPACE_AFTER`) — confirm this value agrees with whatever
// code produces `clause_terminator`.
pub const CLAUSE_TYPE_SENTENCE: u32 = 0x8000;
pub fn lookup_dict2(
dict: &Dictionary,
word: &[u8], ctx: &LookupCtx,
) -> Option<LookupResult> {
let word_str = std::str::from_utf8(word).unwrap_or("");
let transposed = transpose_alphabet(word_str, &dict.transpose);
let compressed_word = &transposed.bytes;
let wlen = transposed.wlen;
let hash = {
let ix = compressed_word.len();
let mut hash_buf: Vec<u8> = compressed_word.clone();
if ix < word.len() {
hash_buf.extend_from_slice(&word[ix..]);
}
hash_word(&hash_buf)
};
let bucket_start = dict.hashtab[hash];
let data = &dict.data;
let mut pos = bucket_start;
loop {
if pos >= data.len() { break; }
let entry_len = data[pos] as usize;
if entry_len == 0 { break; }
let entry_end = pos + entry_len;
if entry_end > data.len() { break; }
let word_info = data[pos + 1];
let stored_len = word_info & 0x7f; let actual_len = (wlen & 0x3f) as usize;
if stored_len != wlen
|| pos + 2 + actual_len > data.len()
|| &data[pos + 2..pos + 2 + actual_len] != compressed_word.as_slice()
{
pos = entry_end;
continue;
}
let no_phonemes = (word_info & 0x80) != 0;
let mut p = pos + 2 + actual_len;
let phonemes: Vec<u8>;
if no_phonemes {
phonemes = Vec::new();
} else {
let ph_start = p;
while p < entry_end && data[p] != 0 { p += 1; }
phonemes = data[ph_start..p].to_vec();
if p < entry_end { p += 1; } }
let mut flags1 = DictFlags1::default();
let mut flags2 = DictFlags2::default();
let mut skipwords: usize = 0;
let mut condition_failed = false;
while p < entry_end {
let flag = data[p];
p += 1;
if flag >= 100 {
if flag >= 132 {
if ctx.dict_condition & (1 << (flag - 132)) != 0 {
condition_failed = true;
}
} else {
if ctx.dict_condition & (1 << (flag - 100)) == 0 {
condition_failed = true;
}
}
} else if flag > 80 {
skipwords = (flag - 80) as usize;
condition_failed = true;
p = entry_end;
} else if flag > 64 {
flags1.0 = (flags1.0 & !0xf) | (flag & 0xf) as u32;
if (flag & 0xc) == 0xc {
flags1.set(super::FLAG_STRESS_END);
}
} else if flag >= 32 {
flags2.set(1u32 << (flag - 32));
} else {
flags1.set(1u32 << flag);
}
}
if condition_failed {
pos = entry_end;
continue;
}
let end_flags = ctx.end_flags;
let has_suffix = (end_flags & super::FLAG_SUFX) != 0;
if !has_suffix && flags2.stem_only() {
pos = entry_end;
continue;
}
if (end_flags & super::SUFX_P != 0) && (flags2.only_form() || flags2.only_s_form()) {
pos = entry_end;
continue;
}
if has_suffix {
if flags2.only_form() {
pos = entry_end;
continue;
}
if flags2.only_s_form() && (end_flags & super::FLAG_SUFX_S == 0) {
pos = entry_end;
continue;
}
}
if flags2.is_capital() && (ctx.word_flags & super::FLAG_FIRST_UPPER == 0) {
pos = entry_end;
continue;
}
if flags2.is_allcaps() && (ctx.word_flags & super::FLAG_ALL_UPPER == 0) {
pos = entry_end;
continue;
}
if flags1.contains(super::FLAG_NEEDS_DOT) && (ctx.word_flags & super::FLAG_HAS_DOT == 0) {
pos = entry_end;
continue;
}
if flags2.contains(DictFlags2::ATEND) && !ctx.at_clause_end && !ctx.lookup_symbol {
pos = entry_end;
continue;
}
if flags2.contains(DictFlags2::ATSTART) && !ctx.is_first_word {
pos = entry_end;
continue;
}
if flags2.contains(DictFlags2::SENTENCE)
&& (ctx.clause_terminator & CLAUSE_TYPE_SENTENCE == 0)
{
pos = entry_end;
continue;
}
if flags2.is_verb() {
if !ctx.expect_verb && !(ctx.expect_verb_s && (end_flags & super::FLAG_SUFX_S != 0)) {
pos = entry_end;
continue;
}
}
if flags2.is_past() && !ctx.expect_past {
pos = entry_end;
continue;
}
if flags2.is_noun() && (!ctx.expect_noun || (end_flags & super::SUFX_V != 0)) {
pos = entry_end;
continue;
}
flags1.set(super::FLAG_FOUND_ATTRIBUTES);
if !phonemes.is_empty() {
flags1.set(super::FLAG_FOUND);
}
if ctx.textmode_lang {
flags1.0 ^= super::FLAG_TEXTMODE;
}
return Some(LookupResult {
phonemes,
flags1,
flags2,
skipwords,
});
}
None
}
/// Looks up the first word of `word` in `dict`.
///
/// Only the bytes before the first NUL or space are considered. Returns
/// `None` without consulting the dictionary when that prefix is empty or is
/// `N_WORD_BYTES` bytes or longer; otherwise the result of [`lookup_dict2`]
/// for that prefix is returned.
pub fn lookup(
    dict: &Dictionary,
    word: &str,
    ctx: &LookupCtx,
) -> Option<LookupResult> {
    let raw = word.as_bytes();
    let cut = raw
        .iter()
        .position(|&b| b == 0 || b == b' ')
        .unwrap_or(raw.len());
    let head = &raw[..cut];

    if head.is_empty() || head.len() >= N_WORD_BYTES {
        None
    } else {
        lookup_dict2(dict, head, ctx)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Loads the English dictionary from the system espeak-ng data
    /// directory, or returns `None` when `en_dict` is not present there.
    /// Panics if the file exists but cannot be loaded.
    fn en_dict() -> Option<Dictionary> {
        let data_dir = PathBuf::from("/usr/share/espeak-ng-data");
        if data_dir.join("en_dict").exists() {
            Some(Dictionary::load("en", &data_dir).unwrap())
        } else {
            None
        }
    }

    /// Looks `word` up with `lookup_symbol` enabled. The outer `None` means
    /// the dictionary is not installed and the calling test has nothing to
    /// check.
    fn symbol_lookup(word: &str) -> Option<Option<LookupResult>> {
        let dict = en_dict()?;
        let ctx = LookupCtx { lookup_symbol: true, ..Default::default() };
        Some(lookup(&dict, word, &ctx))
    }

    // --- hash_word: no dictionary data needed ---

    #[test]
    fn hash_hello() {
        let got = hash_word(b"hello");
        assert_eq!(got, 48);
    }

    #[test]
    fn hash_empty() {
        let got = hash_word(b"");
        assert_eq!(got, 0);
    }

    #[test]
    fn hash_nul_terminated() {
        // Everything after the first NUL must be ignored.
        let with_tail = hash_word(b"hi\x00junk");
        let plain = hash_word(b"hi");
        assert_eq!(with_tail, plain);
    }

    #[test]
    fn hash_a() {
        let got = hash_word(b"a");
        assert_eq!(got, 98);
    }

    // --- lookups: silently skipped when en_dict is not installed ---

    #[test]
    fn lookup_the() {
        if let Some(found) = symbol_lookup("the") {
            assert!(found.is_some(), "'the' should be in en_dict");
            let entry = found.unwrap();
            assert!(entry.flags1.found(), "FLAG_FOUND should be set");
            assert!(!entry.phonemes.is_empty(), "'the' should have phonemes");
        }
    }

    #[test]
    fn lookup_notaword() {
        if let Some(dict) = en_dict() {
            let ctx = LookupCtx::default();
            let found = lookup(&dict, "xzqfgh", &ctx);
            assert!(found.is_none(), "non-word should not be found");
        }
    }

    #[test]
    fn lookup_a() {
        if let Some(found) = symbol_lookup("a") {
            assert!(found.is_some(), "'a' should be in en_dict");
        }
    }

    #[test]
    fn lookup_and() {
        if let Some(found) = symbol_lookup("and") {
            assert!(found.is_some(), "'and' should be in en_dict");
        }
    }

    #[test]
    fn lookup_is() {
        if let Some(found) = symbol_lookup("is") {
            assert!(found.is_some(), "'is' should be in en_dict");
        }
    }
}