use super::{
RULE_PRE, RULE_POST, RULE_PHONEMES, RULE_PH_COMMON, RULE_CONDITION,
RULE_GROUP_END, RULE_PRE_ATSTART, RULE_LINENUM,
RULE_LETTERGP, RULE_LETTERGP2, RULE_NOTVOWEL, RULE_DIGIT, RULE_NONALPHA,
RULE_DOUBLE, RULE_DOLLAR, RULE_SYLLABLE, RULE_NOVOWELS, RULE_SKIPCHARS,
RULE_INC_SCORE, RULE_DEC_SCORE, RULE_DEL_FWD, RULE_ENDING, RULE_NO_SUFFIX,
RULE_STRESSED, RULE_CAPITAL, RULE_IFVERB, RULE_SPELLING,
RULE_SPACE,
DOLLAR_UNPR, DOLLAR_NOPREFIX, DOLLAR_LIST,
BITNUM_FLAG_ALT,
SUFX_UNPRON, SUFX_P,
FLAG_SUFFIX_VOWEL, FLAG_UNPRON_TEST, FLAG_PREFIX_REMOVED, FLAG_SUFFIX_REMOVED,
FLAG_HYPHEN, FLAG_HYPHEN_AFTER, FLAG_FIRST_UPPER,
LETTERGP_VOWEL2, LETTERGP_C,
REPLACED_E,
};
use super::file::Dictionary;
/// Test whether codepoint `c` belongs to letter group `group` in the
/// byte-indexed `letter_bits` table.
///
/// Codepoints above 0x7fff are never letters. NOTE(review): the table index
/// is masked with 0xff, so codepoints 0x100..=0x7fff alias into the 0..255
/// table entries — confirm this wrap-around matches upstream intent.
pub fn is_letter(letter_bits: &[u8; 256], c: u32, group: usize) -> bool {
    if c > 0x7fff {
        return false;
    }
    letter_bits[(c & 0xff) as usize] & (1u8 << group) != 0
}
/// Letter-group membership test for a full codepoint `wc`.
///
/// When `letter_bits_offset` is non-zero, the table covers the 128
/// codepoints starting at that offset (non-Latin scripts); otherwise it
/// covers codepoints 0..256 directly. Out-of-range codepoints are never
/// members of any group.
pub fn is_letter_wc(letter_bits: &[u8; 256], wc: u32, letter_bits_offset: u32, group: usize) -> bool {
    let idx = if letter_bits_offset > 0 {
        match wc.checked_sub(letter_bits_offset) {
            Some(ix) if ix < 128 => ix as usize,
            _ => return false,
        }
    } else if wc < 256 {
        wc as usize
    } else {
        return false;
    };
    (letter_bits[idx] >> group) & 1 != 0
}
/// True when `c` is a codepoint the rule matcher treats as alphabetic:
/// ASCII letters, or the Latin-1 Supplement / Latin Extended range
/// (U+00C0..U+02C0).
///
/// Bug fix: the previous version truncated `c` to `u8` before calling
/// `is_ascii_alphabetic`, so unrelated codepoints whose low byte happened
/// to be an ASCII letter (e.g. U+0341, U+2061) were wrongly classified as
/// alphabetic. The ASCII test now applies only to codepoints below 0x80.
pub fn is_alpha(c: u32) -> bool {
    (c < 0x80 && (c as u8).is_ascii_alphabetic()) || (c >= 0xc0 && c < 0x2c0)
}
/// True for ASCII digits '0'..'9' (0x30..0x39) and Arabic-Indic digits
/// U+0660..U+0669.
pub fn is_digit(c: u32) -> bool {
    matches!(c, 0x30..=0x39 | 0x660..=0x669)
}
/// Match the word text at `text_pos` against dictionary letter group
/// `group_ix` (a packed list of byte strings).
///
/// Returns the length of the first matching entry, `0` when the group
/// contains the special `~` (match-nothing) entry, or `-1` when no entry
/// matches or the group does not exist. With `backwards` set, an entry is
/// compared against the bytes ending at `text_pos` (pre-rule direction);
/// otherwise against the bytes starting at `text_pos`.
pub fn is_letter_group(
    dict: &Dictionary,
    text: &[u8],
    text_pos: usize,
    group_ix: usize,
    backwards: bool,
) -> i32 {
    let group_data = match dict.letter_group(group_ix) {
        Some(d) => d,
        None => return -1,
    };
    let mut cursor = 0usize;
    while cursor < group_data.len() {
        let first = group_data[cursor];
        if first == 0 || first == super::RULE_GROUP_END {
            break;
        }
        // An entry beginning with '~' means "match zero characters".
        if first == b'~' {
            return 0;
        }
        // Scan to the end of this entry (NUL- or group-end-terminated).
        let start = cursor;
        while cursor < group_data.len()
            && group_data[cursor] != 0
            && group_data[cursor] != super::RULE_GROUP_END
        {
            cursor += 1;
        }
        let entry = &group_data[start..cursor];
        if cursor < group_data.len() {
            cursor += 1; // step over the terminator byte
        }
        let n = entry.len();
        if n == 0 {
            continue;
        }
        let hit = if backwards {
            text_pos + 1 >= n && &text[text_pos + 1 - n..=text_pos] == entry
        } else {
            text_pos + n <= text.len() && &text[text_pos..text_pos + n] == entry
        };
        if hit {
            return n as i32;
        }
    }
    -1
}
/// Outcome of scoring one pronunciation rule against the word.
#[derive(Clone, Debug, Default)]
pub struct MatchRecord {
    /// Accumulated match score; the highest-scoring rule wins.
    pub points: i32,
    /// Absolute offset of the rule's phoneme string, or `NONE`.
    pub phonemes_offset: usize,
    /// Suffix/ending flags produced by RULE_ENDING / RULE_DOLLAR.
    pub end_type: u32,
    /// Offset recorded by RULE_DEL_FWD, or `NONE`.
    pub del_fwd: usize,
}

impl MatchRecord {
    /// Sentinel meaning "no offset recorded".
    const NONE: usize = usize::MAX;

    /// Fresh record for a rule about to be scored; starts at one point so
    /// any successful match outranks the empty best.
    fn reset() -> Self {
        MatchRecord {
            points: 1,
            ..Self::empty_best()
        }
    }

    /// Neutral "best so far" record that any real match beats on points.
    fn empty_best() -> Self {
        MatchRecord {
            points: 0,
            phonemes_offset: Self::NONE,
            end_type: 0,
            del_fwd: Self::NONE,
        }
    }
}
/// Read-only inputs to `match_rule` describing the word being translated
/// and the current translator state.
pub struct RuleContext<'a> {
    /// Word bytes, including surrounding context (spaces / NULs).
    pub word: &'a [u8],
    /// Index of the current match point within `word`.
    pub word_pos: usize,
    /// Number of bytes the rule group's literal prefix covers.
    pub group_length: usize,
    /// FLAG_* bits describing the word (capitalisation, hyphens, ...).
    pub word_flags: u32,
    /// First dictionary-flags word for the current lookup.
    pub dict_flags0: u32,
    /// Bit set of active rule conditions tested by RULE_CONDITION.
    pub dict_condition: u32,
    /// Vowel count accumulated so far for this word.
    pub word_vowel_count: i32,
    /// Stressed-vowel count accumulated so far for this word.
    pub word_stressed_count: i32,
    /// Per-letter group-membership bitmap.
    pub letter_bits: &'a [u8; 256],
    /// Codepoint offset of `letter_bits` for non-Latin scripts (0 = Latin-1).
    pub letter_bits_offset: u32,
    /// True while testing unpronounceable prefixes (FLAG_UNPRON_TEST).
    pub unpron_test: bool,
    /// True when the grammar context expects a verb (checked by RULE_IFVERB).
    pub expect_verb: bool,
    /// Suffix-handling options; bit 0 is checked by RULE_ENDING.
    pub suffix_option: u32, }
/// Score every rule in `rules` (one rule-group slice) against `ctx.word`
/// at `ctx.word_pos`, returning the best-scoring match.
///
/// `rules_abs_offset` is the absolute offset of `rules` within the
/// dictionary data; it converts rule-relative phoneme positions to absolute
/// offsets. `word_pos_out` receives the word position after the consumed
/// characters.
///
/// Rule layout (mirrors espeak-ng's MatchRule): literal bytes before any
/// mode marker match the group characters at the match point; RULE_PRE
/// switches to matching leftwards from the match point; RULE_POST matches
/// rightwards after it. `failed` is 0 while matching, 1 on mismatch, and 2
/// when the rule's phoneme string is reached (i.e. the rule matched).
pub fn match_rule(
    dict: &Dictionary,
    ctx: &RuleContext<'_>,
    rules: &[u8], rules_abs_offset: usize, word_pos_out: &mut usize,
) -> MatchRecord {
    let word = ctx.word;
    let group_length = ctx.group_length;
    let word_flags = ctx.word_flags;
    let _dict_flags0 = ctx.dict_flags0; let dict_condition = ctx.dict_condition;
    let unpron_ignore = ctx.unpron_test;
    let mut total_consumed = 0usize;
    // Phoneme string shared between several rules via RULE_PH_COMMON.
    let mut common_phonemes: Option<usize> = None;
    let mut best = MatchRecord::empty_best();
    let mut r_pos = 0usize;
    // Each outer iteration scores one rule in the group.
    while r_pos < rules.len() && rules[r_pos] != RULE_GROUP_END {
        let mut check_atstart = false;
        // Distance penalties: characters matched further from the match
        // point contribute fewer points.
        let mut consumed = 0usize; let mut distance_left: i32 = -2;
        let mut distance_right: i32 = -6;
        let mut failed: i32 = 0;
        let _rule_start = r_pos; let mut unpron_ignore_local: bool = unpron_ignore;
        let mut match_type: u8 = 0; let mut match_ = MatchRecord::reset();
        let mut letter_w: u32 = 0;
        #[allow(unused_assignments)]
        let mut last_letter_w: u32 = 0;
        // pre_ptr walks left of the match point; post_ptr walks right.
        let mut pre_ptr = ctx.word_pos; let mut post_ptr = ctx.word_pos + group_length;
        while failed == 0 {
            if r_pos >= rules.len() { break; }
            let rb = rules[r_pos];
            r_pos += 1;
            let mut add_points: i32 = 0;
            // Control bytes (<= RULE_LINENUM) switch matching mode or
            // terminate the rule; handle them before letter matching.
            if rb <= RULE_LINENUM {
                match rb {
                    0 => {
                        // End of rule with no phoneme string of its own:
                        // fall back to the pending RULE_PH_COMMON phonemes,
                        // skipping any condition/linenum bytes before them.
                        if let Some(cp_off) = common_phonemes {
                            let mut cp = cp_off - rules_abs_offset;
                            loop {
                                if cp >= rules.len() { break; }
                                let b = rules[cp];
                                cp += 1;
                                if b == 0 || b == RULE_PHONEMES { break; }
                                if b == RULE_CONDITION { cp += 1; }
                                if b == RULE_LINENUM { cp += 2; }
                            }
                            match_.phonemes_offset = cp + rules_abs_offset;
                        } else {
                            match_.phonemes_offset = MatchRecord::NONE;
                        }
                        // Leave r_pos on the terminator; failed == 2 marks
                        // a successful rule match.
                        r_pos -= 1; failed = 2; }
                    RULE_PRE_ATSTART => {
                        // Pre-pattern that must reach the word start.
                        check_atstart = true;
                        unpron_ignore_local = false;
                        match_type = RULE_PRE;
                    }
                    RULE_PRE => {
                        match_type = RULE_PRE;
                        // Pre-rules are skipped during the unpronounceable
                        // prefix test.
                        if unpron_ignore_local { failed = 1; }
                    }
                    RULE_POST => {
                        match_type = RULE_POST;
                    }
                    RULE_PHONEMES => {
                        // The rule matched; record where its phoneme
                        // string starts (absolute offset).
                        match_.phonemes_offset = rules_abs_offset + r_pos;
                        failed = 2; }
                    RULE_PH_COMMON => {
                        common_phonemes = Some(rules_abs_offset + r_pos);
                    }
                    RULE_CONDITION => {
                        // Condition numbers >= 32 are negated: the rule
                        // fails when the bit IS set.
                        let cond_num = rules[r_pos];
                        r_pos += 1;
                        if cond_num >= 32 {
                            if dict_condition & (1 << (cond_num - 32)) != 0 { failed = 1; }
                        } else {
                            if dict_condition & (1 << cond_num) == 0 { failed = 1; }
                        }
                        if failed == 0 { match_.points += 1; }
                    }
                    RULE_LINENUM => {
                        // Two payload bytes (source line number); skip.
                        r_pos += 2;
                    }
                    _ => {
                    }
                }
                continue;
            }
            match match_type {
                0 => {
                    // Matching the rule's literal group characters at the
                    // match point itself.
                    if post_ptr >= word.len() { failed = 1; break; }
                    let letter = word[post_ptr];
                    post_ptr += 1;
                    if letter == rb || (letter == REPLACED_E && rb == b'e') {
                        // Only non-continuation bytes score points.
                        if (letter & 0xc0) != 0x80 { add_points = 21; }
                        consumed += 1;
                    } else {
                        failed = 1;
                    }
                }
                RULE_POST => {
                    // Matching rightwards, after the group characters.
                    distance_right += 6;
                    if distance_right > 18 { distance_right = 19; }
                    last_letter_w = letter_w;
                    if post_ptr == 0
                        || post_ptr >= word.len()
                        || (post_ptr > 0 && word[post_ptr - 1] == 0)
                    {
                        failed = 1;
                        break;
                    }
                    let (wc, xbytes) = utf8_decode(word, post_ptr);
                    letter_w = wc;
                    let letter = word[post_ptr];
                    post_ptr += 1;
                    match rb {
                        RULE_LETTERGP => {
                            // Single letter-group membership (A..).
                            let lg = letter_group_no(&mut r_pos, rules);
                            if is_letter_wc(ctx.letter_bits, letter_w, ctx.letter_bits_offset, lg) {
                                // Consonant group scores one point less.
                                let lg_pts = if lg == LETTERGP_C { 19 } else { 20 };
                                add_points = lg_pts - distance_right;
                                post_ptr += xbytes;
                            } else { failed = 1; }
                        }
                        RULE_LETTERGP2 => {
                            // Multi-character letter group (L01 etc.).
                            let lg = letter_group_no(&mut r_pos, rules);
                            let n = is_letter_group(dict, word, post_ptr - 1, lg, false);
                            if n >= 0 {
                                add_points = 20 - distance_right;
                                // One byte was already consumed above.
                                post_ptr += (n as usize).saturating_sub(1);
                            } else { failed = 1; }
                        }
                        RULE_NOTVOWEL => {
                            if is_letter_wc(ctx.letter_bits, letter_w, ctx.letter_bits_offset, LETTERGP_VOWEL2)
                                || (letter_w == RULE_SPACE as u32
                                    && word_flags & FLAG_SUFFIX_VOWEL != 0)
                            {
                                failed = 1;
                            } else {
                                add_points = 20 - distance_right;
                                post_ptr += xbytes;
                            }
                        }
                        RULE_DIGIT => {
                            if is_digit(letter_w) {
                                add_points = 20 - distance_right;
                                post_ptr += xbytes;
                            } else { failed = 1; }
                        }
                        RULE_NONALPHA => {
                            if !is_alpha(letter_w) {
                                add_points = 21 - distance_right;
                                post_ptr += xbytes;
                            } else { failed = 1; }
                        }
                        RULE_DOUBLE => {
                            // Same letter as the previously matched one.
                            if letter_w == last_letter_w {
                                add_points = 21 - distance_right;
                                post_ptr += xbytes;
                            } else { failed = 1; }
                        }
                        RULE_DOLLAR => {
                            // $ commands: word-level attributes, no letter
                            // is consumed (post_ptr backed up).
                            let command = rules[r_pos];
                            r_pos += 1;
                            post_ptr -= 1;
                            if command == DOLLAR_UNPR {
                                match_.end_type = SUFX_UNPRON;
                            } else if command == DOLLAR_NOPREFIX {
                                if word_flags & FLAG_PREFIX_REMOVED != 0 { failed = 1; }
                                else { add_points = 1; }
                            } else if (command & 0xf0) == 0x10 {
                                // $alt / $altN: test a dictionary flag bit.
                                let flag_bit = (BITNUM_FLAG_ALT + (command & 0x0f) as u32) as usize;
                                if ctx.dict_flags0 & (1 << flag_bit) != 0 {
                                    add_points = 23;
                                } else {
                                    failed = 1;
                                }
                            } else if (command & 0xf0) == 0x20 || command == DOLLAR_LIST {
                                // $list and 0x2x commands are unsupported
                                // here: always fail.
                                failed = 1;
                            }
                        }
                        b'-' => {
                            if letter == b'-'
                                || (letter == b' ' && word_flags & FLAG_HYPHEN_AFTER != 0)
                            {
                                // One point more than matching a space.
                                add_points = 22 - distance_right;
                            } else { failed = 1; }
                        }
                        RULE_SYLLABLE => {
                            // Require at least N more syllables (vowel
                            // clusters) to the right.
                            let mut p2 = post_ptr + xbytes;
                            let mut vowel_count = 0i32;
                            let mut syllable_count = 1i32;
                            // Consecutive RULE_SYLLABLE bytes raise N.
                            while r_pos < rules.len() && rules[r_pos] == RULE_SYLLABLE {
                                r_pos += 1;
                                syllable_count += 1;
                            }
                            let mut lw = letter_w;
                            let mut vowel_flag = false;
                            while lw != RULE_SPACE as u32 && lw != 0 {
                                // Count only the first vowel of a cluster.
                                if !vowel_flag && is_letter_wc(ctx.letter_bits, lw, ctx.letter_bits_offset, LETTERGP_VOWEL2) {
                                    vowel_count += 1;
                                }
                                vowel_flag = is_letter_wc(ctx.letter_bits, lw, ctx.letter_bits_offset, LETTERGP_VOWEL2);
                                let (nw, _) = utf8_decode(word, p2);
                                lw = nw;
                                // NOTE(review): advances one byte per
                                // decoded char; multi-byte chars are
                                // re-scanned at their continuation bytes —
                                // confirm against upstream.
                                p2 += 1;
                                if p2 >= word.len() { break; }
                            }
                            if syllable_count <= vowel_count {
                                add_points = 18 + syllable_count - distance_right;
                            } else { failed = 1; }
                        }
                        RULE_NOVOWELS => {
                            // Fail if any vowel occurs to the right before
                            // the word boundary.
                            let mut p2 = post_ptr + xbytes;
                            let mut lw = letter_w;
                            loop {
                                if lw == RULE_SPACE as u32 || lw == 0 { break; }
                                if is_letter_wc(ctx.letter_bits, lw, ctx.letter_bits_offset, LETTERGP_VOWEL2) {
                                    failed = 1;
                                    break;
                                }
                                let (nw, _) = utf8_decode(word, p2);
                                lw = nw;
                                // NOTE(review): byte-wise advance; see the
                                // RULE_SYLLABLE note above.
                                p2 += 1;
                                if p2 >= word.len() { break; }
                            }
                            if failed == 0 { add_points = 19 - distance_right; }
                        }
                        RULE_SKIPCHARS => {
                            // Skip forward until the next rule byte is
                            // found. The target byte stays in the rule
                            // stream: the next loop iteration matches it as
                            // a literal (r_pos is deliberately not advanced).
                            let target = rules[r_pos] as u32;
                            let mut p2 = post_ptr.saturating_sub(1);
                            let mut p2_prev = p2;
                            let mut found_lw = letter_w;
                            while found_lw != target
                                && found_lw != RULE_SPACE as u32
                                && found_lw != 0
                            {
                                p2_prev = p2;
                                let (nw, _) = utf8_decode(word, p2);
                                found_lw = nw;
                                p2 += 1;
                                if p2 >= word.len() { break; }
                            }
                            // Position just before the target so the
                            // literal match consumes it.
                            if found_lw == target { post_ptr = p2_prev; }
                        }
                        RULE_INC_SCORE => {
                            // No letter consumed; bias the score up.
                            post_ptr -= 1;
                            add_points = 20;
                        }
                        RULE_DEC_SCORE => {
                            // No letter consumed; bias the score down.
                            post_ptr -= 1;
                            add_points = -20;
                        }
                        RULE_DEL_FWD => {
                            // Mark a following 'e' for deletion.
                            let search_start = ctx.word_pos + group_length;
                            let search_end = post_ptr;
                            for k in search_start..search_end {
                                if k < word.len() && word[k] == b'e' {
                                    // NOTE(review): stores rules_abs_offset
                                    // + a WORD index — mixing rule and word
                                    // offsets looks suspicious; confirm how
                                    // the caller interprets del_fwd.
                                    match_.del_fwd = rules_abs_offset + k; break;
                                }
                            }
                        }
                        RULE_ENDING => {
                            // Three payload bytes encode the suffix flags.
                            if r_pos + 2 < rules.len() {
                                let end_type = (rules[r_pos] as u32) << 16
                                    | ((rules[r_pos + 1] & 0x7f) as u32) << 8
                                    | (rules[r_pos + 2] & 0x7f) as u32;
                                r_pos += 3;
                                if ctx.word_vowel_count == 0
                                    && (end_type & SUFX_P == 0)
                                    && (ctx.suffix_option & 1 != 0)
                                {
                                    failed = 1;
                                } else {
                                    match_.end_type = end_type;
                                }
                            }
                        }
                        RULE_NO_SUFFIX => {
                            if word_flags & FLAG_SUFFIX_REMOVED != 0 { failed = 1; }
                            else {
                                // No letter consumed.
                                post_ptr -= 1;
                                add_points = 1;
                            }
                        }
                        RULE_SPELLING => {
                            // Not implemented: consumes the letter with no
                            // points, matching a fall-through.
                        }
                        _ => {
                            // Literal byte match to the right.
                            if letter == rb {
                                if (letter & 0xc0) != 0x80 {
                                    add_points = 21 - distance_right;
                                }
                            } else { failed = 1; }
                        }
                    }
                }
                RULE_PRE => {
                    // Matching leftwards, before the match point. Note the
                    // rule bytes for the pre-pattern are stored reversed.
                    distance_left += 2;
                    if distance_left > 18 { distance_left = 19; }
                    if pre_ptr >= word.len() || word[pre_ptr] == 0 { failed = 1; break; }
                    let (cur_lw, _) = utf8_decode(word, pre_ptr);
                    last_letter_w = cur_lw;
                    if pre_ptr == 0 { failed = 1; break; }
                    pre_ptr -= 1;
                    let (lw, xbytes) = utf8_decode_backwards(word, pre_ptr + 1);
                    letter_w = lw;
                    let letter = word[pre_ptr];
                    match rb {
                        RULE_LETTERGP => {
                            let lg = letter_group_no(&mut r_pos, rules);
                            if is_letter_wc(ctx.letter_bits, letter_w, ctx.letter_bits_offset, lg) {
                                let lg_pts = if lg == LETTERGP_C { 19 } else { 20 };
                                add_points = lg_pts - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else { failed = 1; }
                        }
                        RULE_LETTERGP2 => {
                            let lg = letter_group_no(&mut r_pos, rules);
                            let n = is_letter_group(dict, word, pre_ptr, lg, true);
                            if n >= 0 {
                                // NOTE(review): uses distance_right in the
                                // pre branch (upstream espeak-ng has the
                                // same asymmetry in places) — confirm
                                // intended.
                                add_points = 20 - distance_right;
                                pre_ptr = pre_ptr.saturating_sub((n as usize).saturating_sub(1));
                            } else { failed = 1; }
                        }
                        RULE_NOTVOWEL => {
                            if !is_letter_wc(ctx.letter_bits, letter_w, ctx.letter_bits_offset, LETTERGP_VOWEL2) {
                                add_points = 20 - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else { failed = 1; }
                        }
                        RULE_DOUBLE => {
                            if letter_w == last_letter_w {
                                add_points = 21 - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else { failed = 1; }
                        }
                        RULE_DIGIT => {
                            if is_digit(letter_w) {
                                add_points = 21 - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else { failed = 1; }
                        }
                        RULE_NONALPHA => {
                            if !is_alpha(letter_w) {
                                // NOTE(review): distance_right in the pre
                                // branch — confirm (see RULE_LETTERGP2
                                // note above).
                                add_points = 21 - distance_right;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else { failed = 1; }
                        }
                        RULE_DOLLAR => {
                            // $ commands: no letter consumed (pre_ptr is
                            // restored).
                            let command = rules[r_pos];
                            r_pos += 1;
                            if pre_ptr < word.len() { pre_ptr += 1; }
                            if (command & 0xf0) == 0x10 {
                                let flag_bit = (BITNUM_FLAG_ALT + (command & 0x0f) as u32) as usize;
                                if ctx.dict_flags0 & (1 << flag_bit) != 0 {
                                    add_points = 23;
                                } else {
                                    failed = 1;
                                }
                            } else if (command & 0xf0) == 0x20 || command == DOLLAR_LIST {
                                failed = 1;
                            }
                        }
                        RULE_SYLLABLE => {
                            // Require at least N syllables already counted
                            // for the word.
                            let mut syllable_count = 1i32;
                            while r_pos < rules.len() && rules[r_pos] == RULE_SYLLABLE {
                                r_pos += 1;
                                syllable_count += 1;
                            }
                            if syllable_count <= ctx.word_vowel_count {
                                add_points = 18 + syllable_count - distance_left;
                            } else { failed = 1; }
                        }
                        RULE_STRESSED => {
                            // No letter consumed; requires a stressed vowel
                            // already seen.
                            if pre_ptr < word.len() { pre_ptr += 1; }
                            if ctx.word_stressed_count > 0 { add_points = 19; }
                            else { failed = 1; }
                        }
                        RULE_NOVOWELS => {
                            // Fail if any vowel occurs to the left before
                            // the word boundary.
                            let mut p2 = pre_ptr;
                            let mut lw2 = letter_w;
                            loop {
                                if lw2 == RULE_SPACE as u32 { break; }
                                if is_letter_wc(ctx.letter_bits, lw2, ctx.letter_bits_offset, LETTERGP_VOWEL2) {
                                    failed = 1;
                                    break;
                                }
                                if p2 == 0 { break; }
                                let (nw, nb) = utf8_decode_backwards(word, p2);
                                lw2 = nw;
                                p2 = p2.saturating_sub(nb + 1);
                            }
                            if failed == 0 { add_points = 3; }
                        }
                        RULE_IFVERB => {
                            // No letter consumed.
                            if pre_ptr < word.len() { pre_ptr += 1; }
                            if ctx.expect_verb { add_points = 1; }
                            else { failed = 1; }
                        }
                        RULE_CAPITAL => {
                            // No letter consumed; word starts uppercase.
                            if pre_ptr < word.len() { pre_ptr += 1; }
                            if word_flags & FLAG_FIRST_UPPER != 0 { add_points = 1; }
                            else { failed = 1; }
                        }
                        b'.' => {
                            // A dot anywhere earlier in the word (e.g. an
                            // abbreviation) scores heavily.
                            let mut k = pre_ptr;
                            let mut found_dot = false;
                            loop {
                                if k >= word.len() || word[k] == 0 || word[k] == b' ' { break; }
                                if word[k] == b'.' { add_points = 50; found_dot = true; break; }
                                if k == 0 { break; }
                                k -= 1;
                            }
                            if !found_dot { failed = 1; }
                        }
                        b'-' => {
                            if letter == b'-'
                                || (letter == b' ' && word_flags & FLAG_HYPHEN != 0)
                            {
                                // NOTE(review): distance_right in the pre
                                // branch — upstream has the same quirk;
                                // confirm.
                                add_points = 22 - distance_right;
                            } else { failed = 1; }
                        }
                        RULE_SKIPCHARS => {
                            // Skip leftwards until the next rule byte is
                            // found (target not consumed from the stream).
                            let target = rules[r_pos];
                            let mut p2 = (pre_ptr + 1).min(word.len().saturating_sub(1));
                            let mut p2_prev = p2;
                            loop {
                                if p2 >= word.len() { break; }
                                if word[p2] == target { break; }
                                if word[p2] == RULE_SPACE || word[p2] == 0 { break; }
                                p2_prev = p2;
                                // NOTE(review): if p2 reaches 0 on a byte
                                // that is neither target, space nor NUL,
                                // saturating_sub keeps it at 0 and this
                                // loop would not terminate; word buffers
                                // appear to start with ' '/NUL — confirm.
                                p2 = p2.saturating_sub(1);
                            }
                            if p2 < word.len() && word[p2] == target {
                                pre_ptr = p2_prev;
                            }
                        }
                        _ => {
                            // Literal byte match to the left; a space match
                            // scores a small fixed bonus.
                            if letter == rb {
                                add_points = if letter == RULE_SPACE { 4 }
                                else if (letter & 0xc0) != 0x80 { 21 - distance_left }
                                else { 0 };
                            } else { failed = 1; }
                        }
                    }
                }
                _ => { failed = 1; }
            }
            if failed == 0 {
                match_.points += add_points;
            }
        }
        // failed == 2 means the rule matched and has phonemes.
        if failed == 2 && !unpron_ignore {
            // RULE_PRE_ATSTART additionally requires the pre-pattern to
            // have reached the word start.
            let at_word_start = !check_atstart || (pre_ptr == 0 || word[pre_ptr.saturating_sub(1)] == b' ');
            if at_word_start {
                if check_atstart { match_.points += 4; }
                // '>=' so that, on equal points, a later rule wins.
                if match_.points >= best.points {
                    // Optional tracing, enabled via environment variable.
                    if std::env::var("ESPEAK_DEBUG_RULES").is_ok() {
                        let ph_off = match_.phonemes_offset;
                        let rule_end = rules[_rule_start..]
                            .iter()
                            .position(|&b| b == 0)
                            .map(|rel| _rule_start + rel)
                            .unwrap_or(rules.len());
                        let rule_bytes = &rules[_rule_start..rule_end];
                        eprintln!(
                            "  [RULE MATCH] word={:?} rule_start={} abs={} points={} ph_off={:?} rule={:?}",
                            String::from_utf8_lossy(ctx.word),
                            _rule_start,
                            rules_abs_offset + _rule_start,
                            match_.points,
                            if ph_off == usize::MAX { None } else { Some(ph_off) },
                            rule_bytes,
                        );
                    }
                    best = match_.clone();
                    total_consumed = consumed;
                }
            }
        }
        // Skip to the start of the next rule (past the NUL terminator).
        while r_pos < rules.len() && rules[r_pos] != 0 {
            r_pos += 1;
        }
        if r_pos < rules.len() { r_pos += 1; } }
    // Always consume at least the group characters (minimum one byte).
    total_consumed += group_length;
    if total_consumed == 0 { total_consumed = 1; }
    *word_pos_out = ctx.word_pos + total_consumed;
    best
}
/// Output of `translate_rules`: phoneme codes plus suffix information.
#[derive(Clone, Debug, Default)]
pub struct RulesResult {
    /// Phoneme codes for the word stem (NUL-terminated).
    pub phonemes: Vec<u8>,
    /// Phoneme codes produced for a matched ending (NUL-terminated).
    pub end_phonemes: Vec<u8>,
    /// SUFX_* flags of the matched ending; 0 when no ending matched.
    pub end_type: u32,
    /// Word index where the matched suffix starts.
    pub suffix_start: usize,
    /// True when no rule matched an alphabetic character and the word
    /// should be spelled out letter by letter.
    pub spellword: bool,
}
/// Translate `word_buf` (starting at `word_start`) into phoneme codes using
/// the dictionary's pronunciation rules.
///
/// Thin wrapper over [`translate_rules_phdata`] that runs without phoneme
/// metadata, so per-phoneme vowel/stress counting is skipped.
pub fn translate_rules(
    dict: &Dictionary,
    word_buf: &[u8],
    word_start: usize,
    word_flags: u32,
    dict_flags: u32,
    letter_bits: &[u8; 256],
    dict_condition: u32,
    word_vowel_count: &mut i32,
    word_stressed_count: &mut i32,
) -> RulesResult {
    translate_rules_phdata(
        dict,
        word_buf,
        word_start,
        word_flags,
        dict_flags,
        letter_bits,
        dict_condition,
        word_vowel_count,
        word_stressed_count,
        None,
    )
}
/// Translate `word_buf` into phoneme codes, optionally counting vowels and
/// stress from the matched phoneme strings when `phdata` is supplied.
///
/// For each word position, rule groups are tried in priority order:
/// a non-Latin "group 3" single-character group, then the two-character
/// "group 2" table (with a +35 score bonus, competed against the
/// one-character group), then the one-character "group 1" table, and
/// finally the default group. If nothing matches an alphabetic character,
/// the word is flagged for spelling out.
pub fn translate_rules_phdata(
    dict: &Dictionary,
    word_buf: &[u8],
    word_start: usize,
    word_flags: u32,
    dict_flags: u32,
    letter_bits: &[u8; 256],
    dict_condition: u32,
    word_vowel_count: &mut i32,
    word_stressed_count: &mut i32,
    phdata: Option<&crate::phoneme::load::PhonemeData>,
) -> RulesResult {
    if dict.rules().is_empty() {
        return RulesResult::default();
    }
    let mut out_phonemes: Vec<u8> = Vec::new();
    let mut out_end_phonemes: Vec<u8> = Vec::new();
    let mut out_end_type = 0u32;
    let mut out_suffix_start = 0usize;
    // Length of the stem phonemes at the moment an ending matched; used to
    // truncate the suffix phonemes off the stem at the end.
    let mut out_stem_ph_len = 0usize; let mut spellword = false;
    let mut pos = word_start;
    let rules_abs_base = dict.rules_offset;
    while pos < word_buf.len() {
        let c = word_buf[pos];
        if c == 0 || c == b' ' { break; }
        let (wc, wc_bytes) = utf8_decode(word_buf, pos);
        let mut found = false;
        let mut match1 = MatchRecord::empty_best();
        let mut next_pos1 = pos + 1;
        // Lead byte plus continuation bytes.
        let group_length_total = wc_bytes + 1;
        // Non-Latin scripts: try the group-3 rules for this codepoint.
        let lbo = dict.letter_bits_offset as u64;
        if lbo > 0 && (wc as u64) >= lbo {
            let g3_idx = (wc as u64 - lbo) as usize;
            if g3_idx < 128 {
                let c2 = (g3_idx + 1) as u8; if let Some(g3_rules) = dict.group3(c2) {
                    let g3_abs = dict.groups.groups3[g3_idx].unwrap_or(0);
                    let ctx = make_ctx(word_buf, pos, group_length_total, word_flags, dict_flags,
                        dict_condition, letter_bits, dict.letter_bits_offset, *word_vowel_count, *word_stressed_count);
                    let mut np = pos;
                    match1 = match_rule(dict, &ctx, g3_rules, g3_abs, &mut np);
                    next_pos1 = np;
                    // Count a match even without points if the rule
                    // consumed characters.
                    found = match1.points > 0 || np > pos;
                }
            }
        }
        // Byte-indexed group-3 lookup (fallback path).
        if !found {
            if let Some(g3_rules) = dict.group3(c) {
                let g3_abs = dict.groups.groups3[(c.wrapping_sub(1)) as usize].unwrap_or(0);
                let ctx = make_ctx(word_buf, pos, group_length_total, word_flags, dict_flags,
                    dict_condition, letter_bits, dict.letter_bits_offset, *word_vowel_count, *word_stressed_count);
                let mut np = pos;
                match1 = match_rule(dict, &ctx, g3_rules, g3_abs, &mut np);
                next_pos1 = np;
                found = match1.points > 0 || np > pos;
            }
        }
        // Two-character groups: the first matching key competes against the
        // one-character group for the same initial letter.
        let n = dict.groups.groups2_count[c as usize] as usize;
        if !found && n > 0 && pos + 1 < word_buf.len() {
            let c2 = word_buf[pos + 1];
            // Two-byte key: first char in the low byte.
            let key = (c as u16) | ((c2 as u16) << 8);
            let g1 = dict.groups.groups2_start[c as usize] as usize;
            let g_end = (g1 + n).min(dict.groups.groups2.len());
            for g in g1..g_end {
                if dict.groups.groups2[g].key == key {
                    found = true;
                    let entry = &dict.groups.groups2[g];
                    let g2_rules = &dict.data[entry.offset..];
                    let ctx2 = make_ctx(word_buf, pos, 2, word_flags, dict_flags,
                        dict_condition, letter_bits, dict.letter_bits_offset, *word_vowel_count, *word_stressed_count);
                    let mut np2 = pos;
                    let mut m2 = match_rule(dict, &ctx2, g2_rules, entry.offset, &mut np2);
                    // Two-letter groups get a fixed score bonus.
                    if m2.points > 0 { m2.points += 35; }
                    let ctx1 = make_ctx(word_buf, pos, 1, word_flags, dict_flags,
                        dict_condition, letter_bits, dict.letter_bits_offset, *word_vowel_count, *word_stressed_count);
                    if let Some(g1_rules) = dict.group1(c) {
                        let g1_abs = dict.groups.groups1[c as usize].unwrap_or(rules_abs_base);
                        let mut np1 = pos;
                        match1 = match_rule(dict, &ctx1, g1_rules, g1_abs, &mut np1);
                        next_pos1 = np1;
                    }
                    // '>=' prefers the two-letter group on ties.
                    if m2.points >= match1.points {
                        match1 = m2;
                        next_pos1 = np2;
                    }
                    break;
                }
            }
        }
        // One-character group, then the default group.
        if !found {
            if let Some(g1_rules) = dict.group1(c) {
                let g1_abs = dict.groups.groups1[c as usize].unwrap_or(rules_abs_base);
                let ctx = make_ctx(word_buf, pos, 1, word_flags, dict_flags,
                    dict_condition, letter_bits, dict.letter_bits_offset, *word_vowel_count, *word_stressed_count);
                let mut np = pos;
                match1 = match_rule(dict, &ctx, g1_rules, g1_abs, &mut np);
                next_pos1 = np;
            } else {
                if let Some(def_rules) = dict.group1(0) {
                    let def_abs = dict.groups.groups1[0].unwrap_or(rules_abs_base);
                    let ctx = make_ctx(word_buf, pos, 0, word_flags, dict_flags,
                        dict_condition, letter_bits, dict.letter_bits_offset, *word_vowel_count, *word_stressed_count);
                    let mut np = pos;
                    match1 = match_rule(dict, &ctx, def_rules, def_abs, &mut np);
                    next_pos1 = np;
                }
                // An alphabetic character with no matching rule at all:
                // spell the word out.
                if match1.points == 0 && is_alpha(wc) {
                    spellword = true;
                    break;
                }
            }
        }
        if match1.points > 0 {
            if match1.phonemes_offset != MatchRecord::NONE {
                // Copy the NUL-terminated phoneme string of the winning
                // rule.
                let ph_start = match1.phonemes_offset;
                let mut ph_end = ph_start;
                while ph_end < dict.data.len() && dict.data[ph_end] != 0 {
                    ph_end += 1;
                }
                if let Some(ph) = phdata {
                    // Count vowels/stress from the matched phonemes.
                    // NOTE(review): typ == 1 appears to be a stress marker
                    // and typ == 2 a vowel; phflags bit 1 presumably marks
                    // unstressed — confirm against the phoneme table docs.
                    let mut unstress_mark = false;
                    for &code in &dict.data[ph_start..ph_end] {
                        if let Some(phoneme) = ph.get(code) {
                            if phoneme.typ == 1 {
                                if phoneme.std_length < 4 {
                                    unstress_mark = true;
                                }
                            } else if phoneme.typ == 2 {
                                if (phoneme.phflags & 2) == 0 && !unstress_mark {
                                    *word_stressed_count += 1;
                                }
                                unstress_mark = false;
                                *word_vowel_count += 1;
                            } else {
                                unstress_mark = false;
                            }
                        }
                    }
                }
                let before_len = out_phonemes.len();
                out_phonemes.extend_from_slice(&dict.data[ph_start..ph_end]);
                // First ending match: remember where the stem phonemes end
                // and keep a copy of the suffix phonemes.
                if match1.end_type != 0 && out_end_type == 0 {
                    out_stem_ph_len = before_len;
                    let suffix_slice = &out_phonemes[before_len..];
                    out_end_phonemes.extend_from_slice(suffix_slice);
                }
            }
            if match1.end_type != 0 {
                out_end_type = match1.end_type;
                out_suffix_start = pos; }
        }
        // Guarantee forward progress even when the matcher reported none.
        if next_pos1 <= pos {
            let (_, adv) = utf8_decode(word_buf, pos);
            pos += adv + 1;
        } else {
            pos = next_pos1;
        }
    }
    // Drop the suffix phonemes from the stem when an ending matched.
    if out_end_type != 0 && out_stem_ph_len <= out_phonemes.len() {
        out_phonemes.truncate(out_stem_ph_len);
    }
    out_phonemes.push(0);
    out_end_phonemes.push(0);
    RulesResult {
        phonemes: out_phonemes,
        end_phonemes: out_end_phonemes,
        end_type: out_end_type,
        suffix_start: out_suffix_start,
        spellword,
    }
}
/// Build a `RuleContext` for one `match_rule` call.
///
/// `expect_verb` and `suffix_option` are not yet wired up and default to
/// false/0; the unpronounceable-prefix test is derived from `word_flags`.
fn make_ctx<'a>(
    word: &'a [u8],
    pos: usize,
    group_length: usize,
    word_flags: u32,
    dict_flags: u32,
    dict_condition: u32,
    letter_bits: &'a [u8; 256],
    letter_bits_offset: u32,
    vowel_count: i32,
    stressed_count: i32,
) -> RuleContext<'a> {
    // Derived once here so the struct literal stays declarative.
    let unpron_test = word_flags & FLAG_UNPRON_TEST != 0;
    RuleContext {
        word,
        word_pos: pos,
        group_length,
        word_flags,
        dict_flags0: dict_flags,
        dict_condition,
        word_vowel_count: vowel_count,
        word_stressed_count: stressed_count,
        letter_bits,
        letter_bits_offset,
        unpron_test,
        expect_verb: false,
        suffix_option: 0,
    }
}
/// Decode the UTF-8 sequence starting at `buf[pos]`.
///
/// Returns `(codepoint, extra_bytes)` where `extra_bytes` is the number of
/// continuation bytes consumed (0 for ASCII). For an out-of-range `pos` the
/// result is `(0, 0)`; for a truncated sequence the bare lead byte is
/// returned with 0 extra bytes. NOTE(review): lead bytes in 0x80..0xC0
/// (bare continuation bytes) fall into the two-byte path — confirm this
/// lenient handling is relied upon by callers.
pub fn utf8_decode(buf: &[u8], pos: usize) -> (u32, usize) {
    let b0 = match buf.get(pos) {
        Some(&b) => b as u32,
        None => return (0, 0),
    };
    let tail = &buf[pos + 1..];
    if b0 < 0x80 {
        (b0, 0)
    } else if b0 < 0xe0 {
        match tail {
            [b1, ..] => (((b0 & 0x1f) << 6) | (*b1 as u32 & 0x3f), 1),
            _ => (b0, 0),
        }
    } else if b0 < 0xf0 {
        match tail {
            [b1, b2, ..] => (
                ((b0 & 0x0f) << 12) | ((*b1 as u32 & 0x3f) << 6) | (*b2 as u32 & 0x3f),
                2,
            ),
            _ => (b0, 0),
        }
    } else {
        match tail {
            [b1, b2, b3, ..] => (
                ((b0 & 0x07) << 18)
                    | ((*b1 as u32 & 0x3f) << 12)
                    | ((*b2 as u32 & 0x3f) << 6)
                    | (*b3 as u32 & 0x3f),
                3,
            ),
            _ => (b0, 0),
        }
    }
}
/// Decode the UTF-8 character that ends just before index `pos`.
///
/// Returns `(codepoint, extra_bytes)` where `extra_bytes` counts the
/// continuation bytes of the decoded character (0 for ASCII). `pos == 0`
/// yields `(0, 0)`.
fn utf8_decode_backwards(buf: &[u8], pos: usize) -> (u32, usize) {
    if pos == 0 {
        return (0, 0);
    }
    // Walk back over continuation bytes (0b10xxxxxx) to find the lead byte,
    // stopping at index 0 regardless.
    let start = (0..pos)
        .rev()
        .find(|&i| i == 0 || (buf[i] & 0xc0) != 0x80)
        .unwrap_or(0);
    let (c, _) = utf8_decode(buf, start);
    (c, pos - 1 - start)
}
/// Read a letter-group selector byte from the rule stream at `*r_pos` and
/// map it to a group index in `0..N_LETTER_GROUPS` (selector bytes are
/// offsets from b'A', wrapped into range). Returns 0 without advancing when
/// the stream is exhausted.
fn letter_group_no(r_pos: &mut usize, rules: &[u8]) -> usize {
    match rules.get(*r_pos) {
        Some(&b) => {
            *r_pos += 1;
            (b as i16 - b'A' as i16).rem_euclid(N_LETTER_GROUPS as i16) as usize
        }
        None => 0,
    }
}
use super::N_LETTER_GROUPS;
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use crate::dictionary::Dictionary;

    /// Load the system English dictionary, or `None` when the espeak-ng
    /// data package is not installed (dictionary-based tests then no-op).
    fn en_dict() -> Option<Dictionary> {
        let dir = PathBuf::from("/usr/share/espeak-ng-data");
        if !dir.join("en_dict").exists() { return None; }
        Some(Dictionary::load("en", &dir).unwrap())
    }

    /// Minimal ASCII letter_bits table: vowels set in group 0 and
    /// LETTERGP_VOWEL2, consonants in LETTERGP_C.
    fn default_letter_bits() -> [u8; 256] {
        let mut bits = [0u8; 256];
        for c in b"aeiouAEIOU".iter() {
            bits[*c as usize] |= (1 << LETTERGP_VOWEL2) | 1;
        }
        for c in b"bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ".iter() {
            bits[*c as usize] |= 1 << LETTERGP_C;
        }
        bits
    }

    #[test]
    fn utf8_decode_ascii() {
        // ASCII decodes to itself with zero extra bytes.
        assert_eq!(utf8_decode(b"hello", 0), (b'h' as u32, 0));
        assert_eq!(utf8_decode(b"hello", 4), (b'o' as u32, 0));
    }

    #[test]
    fn utf8_decode_two_byte() {
        // U+00E9 ('é') is a two-byte sequence: one extra byte.
        let buf: &[u8] = &[0xc3, 0xa9];
        let (c, xb) = utf8_decode(buf, 0);
        assert_eq!(c, 0xe9);
        assert_eq!(xb, 1);
    }

    #[test]
    fn hash_word_in_rules() {
        let dict = match en_dict() { Some(d) => d, None => return };
        let h = super::super::lookup::hash_word(b"the");
        let bucket_start = dict.hashtab[h];
        // Hash buckets must point into the word-list region, which sits
        // between the header and the rules section.
        assert!(bucket_start >= 8 && bucket_start < dict.rules_offset,
            "bucket for 'the' should be in word-list region");
    }

    #[test]
    fn translate_rules_short_word() {
        // Smoke test: a one-letter word must not panic.
        let dict = match en_dict() { Some(d) => d, None => return };
        let letter_bits = default_letter_bits();
        let word_buf = b" a ";
        let mut vcount = 0i32;
        let mut scount = 0i32;
        let result = translate_rules(
            &dict, word_buf, 1, 0, 0, &letter_bits, 0,
            &mut vcount, &mut scount,
        );
        let _ = result;
    }

    #[test]
    fn translate_rules_hello() {
        // A common word must be covered by the rules rather than spelled.
        let dict = match en_dict() { Some(d) => d, None => return };
        let letter_bits = default_letter_bits();
        let word_buf = b" hello ";
        let mut vcount = 0i32;
        let mut scount = 0i32;
        let result = translate_rules(
            &dict, word_buf, 1, 0, 0, &letter_bits, 0,
            &mut vcount, &mut scount,
        );
        assert!(!result.spellword,
            "hello should not trigger spellword; phonemes={:?}", result.phonemes);
    }
}