use super::file::Dictionary;
use super::{
BITNUM_FLAG_ALT, DOLLAR_LIST, DOLLAR_NOPREFIX, DOLLAR_UNPR, FLAG_FIRST_UPPER, FLAG_HYPHEN,
FLAG_HYPHEN_AFTER, FLAG_PREFIX_REMOVED, FLAG_SUFFIX_REMOVED, FLAG_SUFFIX_VOWEL,
FLAG_UNPRON_TEST, LETTERGP_C, LETTERGP_VOWEL2, REPLACED_E, RULE_CAPITAL, RULE_CONDITION,
RULE_DEC_SCORE, RULE_DEL_FWD, RULE_DIGIT, RULE_DOLLAR, RULE_DOUBLE, RULE_ENDING,
RULE_GROUP_END, RULE_IFVERB, RULE_INC_SCORE, RULE_LETTERGP, RULE_LETTERGP2, RULE_LINENUM,
RULE_NO_SUFFIX, RULE_NONALPHA, RULE_NOTVOWEL, RULE_NOVOWELS, RULE_PH_COMMON, RULE_PHONEMES,
RULE_POST, RULE_PRE, RULE_PRE_ATSTART, RULE_SKIPCHARS, RULE_SPACE, RULE_SPELLING,
RULE_STRESSED, RULE_SYLLABLE, SUFX_P, SUFX_UNPRON,
};
use crate::phoneme::load::{ActiveTable, PhonemeData};
/// Tests whether character `c` is a member of letter group `group`.
///
/// `letter_bits` holds one `u8` bit-set per table slot; bit `group` of the
/// slot marks membership. Code points above `0x7fff` are never members; for
/// everything else only the low byte of `c` selects the slot.
///
/// Because each slot is a single byte, only groups `0..=7` can be
/// represented. Any larger `group` yields `false` instead of overflowing the
/// shift (which would panic in debug builds and silently wrap in release).
pub fn is_letter(letter_bits: &[u8; 256], c: u32, group: usize) -> bool {
    if group >= 8 || c > 0x7fff {
        return false;
    }
    let slot = (c & 0xff) as usize;
    (letter_bits[slot] >> group) & 1 != 0
}
/// Tests whether wide character `wc` is a member of letter group `group`.
///
/// * When `letter_bits_offset` is non-zero the table is indexed relative to
///   that offset and only the 128 code points starting at the offset are
///   covered.
/// * When it is zero the table is indexed directly and only code points
///   below 256 are covered.
///
/// Characters outside the covered window are never members. Each table slot
/// is a `u8` bit-set, so groups `>= 8` cannot be represented and always
/// return `false`; this also avoids a shift overflow (debug panic / release
/// wrap-around) for the larger group numbers that rule data can request.
pub fn is_letter_wc(
    letter_bits: &[u8; 256],
    wc: u32,
    letter_bits_offset: u32,
    group: usize,
) -> bool {
    if group >= 8 {
        return false;
    }
    let slot = if letter_bits_offset > 0 {
        match wc.checked_sub(letter_bits_offset) {
            Some(rel) if rel < 128 => rel as usize,
            _ => return false,
        }
    } else if wc < 256 {
        wc as usize
    } else {
        return false;
    };
    (letter_bits[slot] >> group) & 1 != 0
}
/// Returns `true` when `c` is an ASCII letter or lies in the range
/// `0xc0..0x2c0`.
///
/// The ASCII test is applied only to genuine ASCII code points. Casting an
/// arbitrary `u32` to `u8` first would keep just the low byte and wrongly
/// classify characters such as U+0661 (low byte `0x61`, i.e. `a`) as
/// alphabetic.
pub fn is_alpha(c: u32) -> bool {
    let ascii_letter = c < 0x80 && (c as u8).is_ascii_alphabetic();
    ascii_letter || (0xc0..0x2c0).contains(&c)
}
/// Returns `true` for ASCII digits `0`-`9` and for the code points
/// `0x660..=0x669`.
pub fn is_digit(c: u32) -> bool {
    match c {
        0x30..=0x39 => true,
        0x660..=0x669 => true,
        _ => false,
    }
}
/// Matches `text` against the list of byte strings stored in letter group
/// `group_ix` of `dict`.
///
/// The group data is a sequence of entries, each ended by a `0` byte or by
/// `RULE_GROUP_END`; a `0` or `RULE_GROUP_END` at the start of an entry ends
/// the list.
///
/// * `backwards == false`: an entry matches when it equals the bytes that
///   start at `text_pos`.
/// * `backwards == true`: an entry matches when it equals the bytes that end
///   at (and include) `text_pos`.
///
/// Returns the byte length of the first matching entry, `0` as soon as an
/// entry beginning with `~` is reached, or `-1` when the group is unknown or
/// nothing matches.
pub fn is_letter_group(
    dict: &Dictionary,
    text: &[u8],
    text_pos: usize,
    group_ix: usize,
    backwards: bool,
) -> i32 {
    let group_data = match dict.letter_group(group_ix) {
        Some(bytes) => bytes,
        None => return -1,
    };
    let total = group_data.len();

    let mut cursor = 0usize;
    while cursor < total {
        let first = group_data[cursor];
        if first == 0 || first == super::RULE_GROUP_END {
            break;
        }
        // '~' stands for "no character": the group accepts an empty match.
        if first == b'~' {
            return 0;
        }

        // Collect one entry; its first byte is already known to be payload.
        let start = cursor;
        cursor += 1;
        while cursor < total
            && group_data[cursor] != 0
            && group_data[cursor] != super::RULE_GROUP_END
        {
            cursor += 1;
        }
        let candidate = &group_data[start..cursor];
        // Step over the terminator that ended this entry, if there is one.
        if cursor < total {
            cursor += 1;
        }

        let len = candidate.len();
        let hit = if backwards {
            text_pos + 1 >= len && &text[text_pos + 1 - len..=text_pos] == candidate
        } else {
            text_pos + len <= text.len() && &text[text_pos..text_pos + len] == candidate
        };
        if hit {
            return len as i32;
        }
    }
    -1
}
/// Outcome of scoring one rule (or the best rule of a group).
///
/// Note that the derived `Default` zeroes every field; the sentinel-based
/// constructors below are what the matcher itself uses.
#[derive(Clone, Debug, Default)]
pub struct MatchRecord {
    /// Accumulated score; `0` means "no rule matched".
    pub points: i32,
    /// Offset of the rule's phoneme string, or `usize::MAX` when absent.
    pub phonemes_offset: usize,
    /// Ending/suffix type recorded by the rule, `0` when none.
    pub end_type: u32,
    /// Location recorded by a delete-forward rule, or `usize::MAX` when absent.
    pub del_fwd: usize,
}

impl MatchRecord {
    /// Sentinel meaning "no offset recorded".
    const NONE: usize = usize::MAX;

    /// Fresh record for a rule that is about to be evaluated (starts at 1 point).
    fn reset() -> Self {
        Self {
            points: 1,
            ..Self::empty_best()
        }
    }

    /// Record representing "nothing matched yet" (0 points, no offsets).
    fn empty_best() -> Self {
        Self {
            phonemes_offset: Self::NONE,
            del_fwd: Self::NONE,
            ..Self::default()
        }
    }
}
/// Read-only inputs that `match_rule` needs in order to score the rules of
/// one group against one position of a word.
pub struct RuleContext<'a> {
/// Byte buffer holding the word being translated.
pub word: &'a [u8],
/// Index in `word` where the current rule group starts; pre-context is
/// matched leftwards from here.
pub word_pos: usize,
/// Number of bytes of `word` already covered by the group key; matching of
/// the rule body starts at `word_pos + group_length`.
pub group_length: usize,
/// Word flag bits, tested against the `FLAG_*` constants.
pub word_flags: u32,
/// Dictionary flag bits, tested by `$`-rules at `BITNUM_FLAG_ALT` + n.
pub dict_flags0: u32,
/// Bit mask consulted by `RULE_CONDITION`.
pub dict_condition: u32,
/// Vowel count compared against syllable counts in pre-context rules and
/// checked by `RULE_ENDING`.
pub word_vowel_count: i32,
/// Stressed-vowel count required to be positive by `RULE_STRESSED`.
pub word_stressed_count: i32,
/// Letter-group membership table passed to `is_letter_wc`.
pub letter_bits: &'a [u8; 256],
/// Code-point offset passed to `is_letter_wc` alongside `letter_bits`.
pub letter_bits_offset: u32,
/// When set, plain `RULE_PRE` rules fail immediately.
pub unpron_test: bool,
/// Required to be `true` for `RULE_IFVERB` to succeed.
pub expect_verb: bool,
/// Option bits; bit 0 takes part in the `RULE_ENDING` rejection test.
pub suffix_option: u32, }
/// Scores every rule of one rule group against the word position described
/// by `ctx` and returns the best-scoring match.
///
/// # Parameters
/// * `dict` - dictionary used to resolve string letter groups
///   (`RULE_LETTERGP2`).
/// * `ctx` - the word, the position inside it and the flags to test.
/// * `rules` - the encoded rules of the group; each rule ends with a `0`
///   byte and the group ends with `RULE_GROUP_END` (or the end of the slice).
/// * `rules_abs_offset` - value added to positions inside `rules` when they
///   are stored in the returned record (`phonemes_offset`).
/// * `word_pos_out` - receives `ctx.word_pos` plus the number of bytes
///   covered by the winning rule (group key plus extra matched letters),
///   always at least one byte past `ctx.word_pos`.
///
/// # Returns
/// The best `MatchRecord`; `points == 0` means that no rule matched. Among
/// rules with equal points the later rule wins.
///
/// # Rule evaluation
/// Each rule is read byte by byte. Control bytes (`<= RULE_LINENUM`) switch
/// between matching further group letters (initial state), pre-context
/// (`RULE_PRE`, matched right-to-left) and post-context (`RULE_POST`, matched
/// left-to-right), or end the rule (`RULE_PHONEMES` / `0`). Every other byte
/// is a letter or a special test that must succeed; each success adds points,
/// weighted so that context close to the group scores higher.
///
/// Truncated rule data (an operand byte missing at the end of `rules`) makes
/// the rule fail instead of panicking.
pub fn match_rule(
    dict: &Dictionary,
    ctx: &RuleContext<'_>,
    rules: &[u8],
    rules_abs_offset: usize,
    word_pos_out: &mut usize,
) -> MatchRecord {
    let word = ctx.word;
    let group_length = ctx.group_length;
    let word_flags = ctx.word_flags;
    let dict_condition = ctx.dict_condition;
    let unpron_ignore = ctx.unpron_test;

    let mut total_consumed = 0usize;
    let mut common_phonemes: Option<usize> = None;
    let mut best = MatchRecord::empty_best();
    let mut r_pos = 0usize;

    while r_pos < rules.len() && rules[r_pos] != RULE_GROUP_END {
        let mut check_atstart = false;
        // Extra word bytes matched by the rule body beyond the group key.
        let mut consumed = 0usize;
        let mut distance_left: i32 = -2;
        let mut distance_right: i32 = -6;
        // 0 = still matching, 1 = rule failed, 2 = rule finished successfully.
        let mut failed: i32 = 0;
        let rule_start = r_pos;
        // Per-rule copy: RULE_PRE_ATSTART clears it for this rule only.
        let mut unpron_ignore_local: bool = unpron_ignore;
        let mut match_type: u8 = 0;
        let mut match_ = MatchRecord::reset();
        let mut letter_w: u32 = 0;
        // The initial value is always overwritten before it is read.
        #[allow(unused_assignments)]
        let mut last_letter_w: u32 = 0;
        let mut pre_ptr = ctx.word_pos;
        let mut post_ptr = ctx.word_pos + group_length;

        while failed == 0 {
            if r_pos >= rules.len() {
                break;
            }
            let rb = rules[r_pos];
            r_pos += 1;
            let mut add_points: i32 = 0;

            // ---- control bytes -------------------------------------------
            if rb <= RULE_LINENUM {
                match rb {
                    0 => {
                        // End of rule without its own phoneme string: fall
                        // back to the group's common phonemes, if any.
                        if let Some(cp_off) = common_phonemes {
                            let mut cp = cp_off - rules_abs_offset;
                            loop {
                                if cp >= rules.len() {
                                    break;
                                }
                                let b = rules[cp];
                                cp += 1;
                                if b == 0 || b == RULE_PHONEMES {
                                    break;
                                }
                                if b == RULE_CONDITION {
                                    cp += 1;
                                }
                                if b == RULE_LINENUM {
                                    cp += 2;
                                }
                            }
                            match_.phonemes_offset = cp + rules_abs_offset;
                        } else {
                            match_.phonemes_offset = MatchRecord::NONE;
                        }
                        // Step back onto the terminator so the skip loop
                        // after this rule consumes it exactly once.
                        r_pos -= 1;
                        failed = 2;
                    }
                    RULE_PRE_ATSTART => {
                        check_atstart = true;
                        unpron_ignore_local = false;
                        match_type = RULE_PRE;
                    }
                    RULE_PRE => {
                        match_type = RULE_PRE;
                        if unpron_ignore_local {
                            failed = 1;
                        }
                    }
                    RULE_POST => {
                        match_type = RULE_POST;
                    }
                    RULE_PHONEMES => {
                        match_.phonemes_offset = rules_abs_offset + r_pos;
                        failed = 2;
                    }
                    RULE_PH_COMMON => {
                        common_phonemes = Some(rules_abs_offset + r_pos);
                    }
                    RULE_CONDITION => match rules.get(r_pos).copied() {
                        Some(cond_num) => {
                            r_pos += 1;
                            let unmet = if cond_num >= 32 {
                                // Numbers >= 32 are negated conditions. A bit
                                // index beyond the mask width can never be
                                // set, so it is treated as "not set".
                                let mask = 1u32
                                    .checked_shl(u32::from(cond_num - 32))
                                    .unwrap_or(0);
                                dict_condition & mask != 0
                            } else {
                                dict_condition & (1u32 << cond_num) == 0
                            };
                            if unmet {
                                failed = 1;
                            } else {
                                match_.points += 1;
                            }
                        }
                        None => failed = 1,
                    },
                    RULE_LINENUM => {
                        r_pos += 2;
                    }
                    _ => {}
                }
                continue;
            }

            match match_type {
                // ---- further letters of the group itself -----------------
                0 => {
                    if post_ptr >= word.len() {
                        failed = 1;
                        break;
                    }
                    let letter = word[post_ptr];
                    post_ptr += 1;
                    if letter == rb || (letter == REPLACED_E && rb == b'e') {
                        // UTF-8 continuation bytes do not score on their own.
                        if (letter & 0xc0) != 0x80 {
                            add_points = 21;
                        }
                        consumed += 1;
                    } else {
                        failed = 1;
                    }
                }

                // ---- post-context: match rightwards ------------------------
                RULE_POST => {
                    distance_right += 6;
                    if distance_right > 18 {
                        distance_right = 19;
                    }
                    last_letter_w = letter_w;
                    if post_ptr == 0 || post_ptr >= word.len() || word[post_ptr - 1] == 0 {
                        failed = 1;
                        break;
                    }
                    let (wc, xbytes) = utf8_decode(word, post_ptr);
                    letter_w = wc;
                    let letter = word[post_ptr];
                    post_ptr += 1;
                    match rb {
                        RULE_LETTERGP => {
                            let lg = letter_group_no(&mut r_pos, rules);
                            if is_letter_wc(ctx.letter_bits, letter_w, ctx.letter_bits_offset, lg) {
                                let lg_pts = if lg == LETTERGP_C { 19 } else { 20 };
                                add_points = lg_pts - distance_right;
                                post_ptr += xbytes;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_LETTERGP2 => {
                            let lg = letter_group_no(&mut r_pos, rules);
                            let n = is_letter_group(dict, word, post_ptr - 1, lg, false);
                            if n >= 0 {
                                add_points = 20 - distance_right;
                                post_ptr += (n as usize).saturating_sub(1);
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_NOTVOWEL => {
                            let is_vowel = is_letter_wc(
                                ctx.letter_bits,
                                letter_w,
                                ctx.letter_bits_offset,
                                LETTERGP_VOWEL2,
                            );
                            if is_vowel
                                || (letter_w == RULE_SPACE as u32
                                    && word_flags & FLAG_SUFFIX_VOWEL != 0)
                            {
                                failed = 1;
                            } else {
                                add_points = 20 - distance_right;
                                post_ptr += xbytes;
                            }
                        }
                        RULE_DIGIT => {
                            if is_digit(letter_w) {
                                add_points = 20 - distance_right;
                                post_ptr += xbytes;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_NONALPHA => {
                            if !is_alpha(letter_w) {
                                add_points = 21 - distance_right;
                                post_ptr += xbytes;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_DOUBLE => {
                            if letter_w == last_letter_w {
                                add_points = 21 - distance_right;
                                post_ptr += xbytes;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_DOLLAR => match rules.get(r_pos).copied() {
                            Some(command) => {
                                r_pos += 1;
                                // '$' tests do not consume a word character.
                                post_ptr -= 1;
                                if command == DOLLAR_UNPR {
                                    match_.end_type = SUFX_UNPRON;
                                } else if command == DOLLAR_NOPREFIX {
                                    if word_flags & FLAG_PREFIX_REMOVED != 0 {
                                        failed = 1;
                                    } else {
                                        add_points = 1;
                                    }
                                } else if (command & 0xf0) == 0x10 {
                                    let flag_bit =
                                        (BITNUM_FLAG_ALT + (command & 0x0f) as u32) as usize;
                                    if ctx.dict_flags0 & (1 << flag_bit) != 0 {
                                        add_points = 23;
                                    } else {
                                        failed = 1;
                                    }
                                } else if (command & 0xf0) == 0x20 || command == DOLLAR_LIST {
                                    failed = 1;
                                }
                            }
                            None => failed = 1,
                        },
                        b'-' => {
                            if letter == b'-'
                                || (letter == b' ' && word_flags & FLAG_HYPHEN_AFTER != 0)
                            {
                                add_points = 22 - distance_right;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_SYLLABLE => {
                            // Consecutive RULE_SYLLABLE bytes ask for that many
                            // vowel groups in the rest of the word.
                            let mut p2 = post_ptr + xbytes;
                            let mut vowel_count = 0i32;
                            let mut syllable_count = 1i32;
                            while r_pos < rules.len() && rules[r_pos] == RULE_SYLLABLE {
                                r_pos += 1;
                                syllable_count += 1;
                            }
                            let mut lw = letter_w;
                            let mut vowel_flag = false;
                            while lw != RULE_SPACE as u32 && lw != 0 {
                                let is_vowel = is_letter_wc(
                                    ctx.letter_bits,
                                    lw,
                                    ctx.letter_bits_offset,
                                    LETTERGP_VOWEL2,
                                );
                                // Count only the first vowel of each run.
                                if !vowel_flag && is_vowel {
                                    vowel_count += 1;
                                }
                                vowel_flag = is_vowel;
                                let (nw, _) = utf8_decode(word, p2);
                                lw = nw;
                                p2 += 1;
                                if p2 >= word.len() {
                                    break;
                                }
                            }
                            if syllable_count <= vowel_count {
                                add_points = 18 + syllable_count - distance_right;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_NOVOWELS => {
                            let mut p2 = post_ptr + xbytes;
                            let mut lw = letter_w;
                            loop {
                                if lw == RULE_SPACE as u32 || lw == 0 {
                                    break;
                                }
                                if is_letter_wc(
                                    ctx.letter_bits,
                                    lw,
                                    ctx.letter_bits_offset,
                                    LETTERGP_VOWEL2,
                                ) {
                                    failed = 1;
                                    break;
                                }
                                let (nw, _) = utf8_decode(word, p2);
                                lw = nw;
                                p2 += 1;
                                if p2 >= word.len() {
                                    break;
                                }
                            }
                            if failed == 0 {
                                add_points = 19 - distance_right;
                            }
                        }
                        RULE_SKIPCHARS => match rules.get(r_pos).copied() {
                            Some(target_byte) => {
                                // Skip forward until the rule's next character
                                // is found; that character is then matched by
                                // the following rule byte.
                                let target = u32::from(target_byte);
                                let mut p2 = post_ptr.saturating_sub(1);
                                let mut p2_prev = p2;
                                let mut found_lw = letter_w;
                                while found_lw != target
                                    && found_lw != RULE_SPACE as u32
                                    && found_lw != 0
                                {
                                    p2_prev = p2;
                                    let (nw, _) = utf8_decode(word, p2);
                                    found_lw = nw;
                                    p2 += 1;
                                    if p2 >= word.len() {
                                        break;
                                    }
                                }
                                if found_lw == target {
                                    post_ptr = p2_prev;
                                }
                            }
                            None => failed = 1,
                        },
                        RULE_INC_SCORE => {
                            post_ptr -= 1;
                            add_points = 20;
                        }
                        RULE_DEC_SCORE => {
                            post_ptr -= 1;
                            add_points = -20;
                        }
                        RULE_DEL_FWD => {
                            // Look for an 'e' between the end of the group and
                            // the current post position.
                            // NOTE(review): the stored value is a word index
                            // offset by `rules_abs_offset`; confirm that this is
                            // what consumers of `del_fwd` expect.
                            let search_start = ctx.word_pos + group_length;
                            let search_end = post_ptr;
                            if let Some(k) =
                                (search_start..search_end).find(|&k| word.get(k) == Some(&b'e'))
                            {
                                match_.del_fwd = rules_abs_offset + k;
                            }
                        }
                        RULE_ENDING => {
                            if r_pos + 2 < rules.len() {
                                let end_type = (rules[r_pos] as u32) << 16
                                    | ((rules[r_pos + 1] & 0x7f) as u32) << 8
                                    | (rules[r_pos + 2] & 0x7f) as u32;
                                r_pos += 3;
                                if ctx.word_vowel_count == 0
                                    && (end_type & SUFX_P == 0)
                                    && (ctx.suffix_option & 1 != 0)
                                {
                                    failed = 1;
                                } else {
                                    match_.end_type = end_type;
                                }
                            }
                        }
                        RULE_NO_SUFFIX => {
                            if word_flags & FLAG_SUFFIX_REMOVED != 0 {
                                failed = 1;
                            } else {
                                post_ptr -= 1;
                                add_points = 1;
                            }
                        }
                        RULE_SPELLING => {}
                        _ => {
                            // Literal byte.
                            if letter == rb {
                                if (letter & 0xc0) != 0x80 {
                                    add_points = 21 - distance_right;
                                }
                            } else {
                                failed = 1;
                            }
                        }
                    }
                }

                // ---- pre-context: match leftwards --------------------------
                RULE_PRE => {
                    distance_left += 2;
                    if distance_left > 18 {
                        distance_left = 19;
                    }
                    if pre_ptr == 0 || pre_ptr > word.len() {
                        failed = 1;
                        break;
                    }
                    // The character to the right of the one about to be
                    // examined; RULE_DOUBLE compares the two. Decoding both
                    // from the same position would make that test vacuous.
                    last_letter_w = utf8_decode(word, pre_ptr).0;
                    let (lw, xbytes) = utf8_decode_backwards(word, pre_ptr);
                    letter_w = lw;
                    let letter = word[pre_ptr - 1];
                    pre_ptr -= 1;
                    match rb {
                        RULE_LETTERGP => {
                            let lg = letter_group_no(&mut r_pos, rules);
                            if is_letter_wc(ctx.letter_bits, letter_w, ctx.letter_bits_offset, lg) {
                                let lg_pts = if lg == LETTERGP_C { 19 } else { 20 };
                                add_points = lg_pts - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_LETTERGP2 => {
                            let lg = letter_group_no(&mut r_pos, rules);
                            let n = is_letter_group(dict, word, pre_ptr, lg, true);
                            if n >= 0 {
                                // NOTE(review): scored with distance_right even
                                // though this is pre-context; kept as-is,
                                // confirm against the reference scoring.
                                add_points = 20 - distance_right;
                                pre_ptr = pre_ptr.saturating_sub((n as usize).saturating_sub(1));
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_NOTVOWEL => {
                            if !is_letter_wc(
                                ctx.letter_bits,
                                letter_w,
                                ctx.letter_bits_offset,
                                LETTERGP_VOWEL2,
                            ) {
                                add_points = 20 - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_DOUBLE => {
                            if letter_w == last_letter_w {
                                add_points = 21 - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_DIGIT => {
                            if is_digit(letter_w) {
                                add_points = 21 - distance_left;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_NONALPHA => {
                            if !is_alpha(letter_w) {
                                // NOTE(review): distance_right in pre-context,
                                // kept as-is (see RULE_LETTERGP2 above).
                                add_points = 21 - distance_right;
                                pre_ptr = pre_ptr.saturating_sub(xbytes);
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_DOLLAR => match rules.get(r_pos).copied() {
                            Some(command) => {
                                r_pos += 1;
                                // '$' tests do not consume a word character.
                                if pre_ptr < word.len() {
                                    pre_ptr += 1;
                                }
                                if (command & 0xf0) == 0x10 {
                                    let flag_bit =
                                        (BITNUM_FLAG_ALT + (command & 0x0f) as u32) as usize;
                                    if ctx.dict_flags0 & (1 << flag_bit) != 0 {
                                        add_points = 23;
                                    } else {
                                        failed = 1;
                                    }
                                } else if (command & 0xf0) == 0x20 || command == DOLLAR_LIST {
                                    failed = 1;
                                }
                            }
                            None => failed = 1,
                        },
                        RULE_SYLLABLE => {
                            let mut syllable_count = 1i32;
                            while r_pos < rules.len() && rules[r_pos] == RULE_SYLLABLE {
                                r_pos += 1;
                                syllable_count += 1;
                            }
                            if syllable_count <= ctx.word_vowel_count {
                                add_points = 18 + syllable_count - distance_left;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_STRESSED => {
                            if pre_ptr < word.len() {
                                pre_ptr += 1;
                            }
                            if ctx.word_stressed_count > 0 {
                                add_points = 19;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_NOVOWELS => {
                            let mut p2 = pre_ptr;
                            let mut lw2 = letter_w;
                            loop {
                                if lw2 == RULE_SPACE as u32 {
                                    break;
                                }
                                if is_letter_wc(
                                    ctx.letter_bits,
                                    lw2,
                                    ctx.letter_bits_offset,
                                    LETTERGP_VOWEL2,
                                ) {
                                    failed = 1;
                                    break;
                                }
                                if p2 == 0 {
                                    break;
                                }
                                let (nw, nb) = utf8_decode_backwards(word, p2);
                                lw2 = nw;
                                p2 = p2.saturating_sub(nb + 1);
                            }
                            if failed == 0 {
                                add_points = 3;
                            }
                        }
                        RULE_IFVERB => {
                            if pre_ptr < word.len() {
                                pre_ptr += 1;
                            }
                            if ctx.expect_verb {
                                add_points = 1;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_CAPITAL => {
                            if pre_ptr < word.len() {
                                pre_ptr += 1;
                            }
                            if word_flags & FLAG_FIRST_UPPER != 0 {
                                add_points = 1;
                            } else {
                                failed = 1;
                            }
                        }
                        b'.' => {
                            // Succeeds when a '.' occurs earlier in the word.
                            let mut k = pre_ptr;
                            let mut found_dot = false;
                            loop {
                                if k >= word.len() || word[k] == 0 || word[k] == b' ' {
                                    break;
                                }
                                if word[k] == b'.' {
                                    add_points = 50;
                                    found_dot = true;
                                    break;
                                }
                                if k == 0 {
                                    break;
                                }
                                k -= 1;
                            }
                            if !found_dot {
                                failed = 1;
                            }
                        }
                        b'-' => {
                            if letter == b'-' || (letter == b' ' && word_flags & FLAG_HYPHEN != 0) {
                                // NOTE(review): distance_right in pre-context,
                                // kept as-is (see RULE_LETTERGP2 above).
                                add_points = 22 - distance_right;
                            } else {
                                failed = 1;
                            }
                        }
                        RULE_SKIPCHARS => match rules.get(r_pos).copied() {
                            Some(target) => {
                                let mut p2 = (pre_ptr + 1).min(word.len().saturating_sub(1));
                                let mut p2_prev = p2;
                                loop {
                                    if p2 >= word.len() {
                                        break;
                                    }
                                    if word[p2] == target {
                                        break;
                                    }
                                    if word[p2] == RULE_SPACE || word[p2] == 0 {
                                        break;
                                    }
                                    // Reached the start of the buffer without
                                    // finding the target: stop, otherwise the
                                    // index would stay at 0 forever.
                                    if p2 == 0 {
                                        break;
                                    }
                                    p2_prev = p2;
                                    p2 -= 1;
                                }
                                if p2 < word.len() && word[p2] == target {
                                    pre_ptr = p2_prev;
                                }
                            }
                            None => failed = 1,
                        },
                        _ => {
                            // Literal byte.
                            if letter == rb {
                                add_points = if letter == RULE_SPACE {
                                    4
                                } else if (letter & 0xc0) != 0x80 {
                                    21 - distance_left
                                } else {
                                    0
                                };
                            } else {
                                failed = 1;
                            }
                        }
                    }
                }
                _ => {
                    failed = 1;
                }
            }

            if failed == 0 {
                match_.points += add_points;
            }
        }

        // Accept the rule if it ran to completion. The per-rule flag is used
        // so that RULE_PRE_ATSTART rules remain eligible during the
        // unpronounceable-start test.
        if failed == 2 && !unpron_ignore_local {
            let at_word_start =
                !check_atstart || (pre_ptr == 0 || word[pre_ptr.saturating_sub(1)] == b' ');
            if at_word_start {
                if check_atstart {
                    match_.points += 4;
                }
                if match_.points >= best.points {
                    if std::env::var("ESPEAK_DEBUG_RULES").is_ok() {
                        let ph_off = match_.phonemes_offset;
                        eprintln!(
                            "  [RULE MATCH] rule_start={} abs={} points={} ph_off={:?}",
                            rule_start,
                            rules_abs_offset + rule_start,
                            match_.points,
                            if ph_off == usize::MAX {
                                None
                            } else {
                                Some(ph_off)
                            }
                        );
                    }
                    best = match_;
                    total_consumed = consumed;
                }
            }
        }

        // Skip whatever is left of this rule, including its 0 terminator.
        while r_pos < rules.len() && rules[r_pos] != 0 {
            r_pos += 1;
        }
        if r_pos < rules.len() {
            r_pos += 1;
        }
    }

    total_consumed += group_length;
    if total_consumed == 0 {
        total_consumed = 1;
    }
    *word_pos_out = ctx.word_pos + total_consumed;
    best
}
/// Result of translating one word with the pronunciation rules.
///
/// The derived `Default` (all fields empty / zero / `false`) is what
/// `translate_rules_phdata` returns when the dictionary has no rules.
#[derive(Clone, Debug, Default)]
pub struct RulesResult {
/// Phoneme bytes produced for the word, followed by a terminating `0`.
pub phonemes: Vec<u8>,
/// Phoneme bytes of the rule that recorded an ending, followed by a
/// terminating `0`.
pub end_phonemes: Vec<u8>,
/// Ending type reported by the last rule that recorded one; `0` if none.
pub end_type: u32,
/// Position in the word buffer at which that ending rule was applied.
pub suffix_start: usize,
/// Set when no rule matched an alphabetic character and translation was
/// abandoned.
pub spellword: bool,
}
/// Translates the word starting at `word_start` in `word_buf` into phonemes
/// using the dictionary's rules.
///
/// Convenience wrapper around `translate_rules_phdata` that passes `None`
/// for the phoneme data; every other argument is forwarded unchanged, and
/// the result of that call is returned as-is.
pub fn translate_rules(
dict: &Dictionary,
word_buf: &[u8],
word_start: usize, word_flags: u32,
dict_flags: u32,
letter_bits: &[u8; 256],
dict_condition: u32,
word_vowel_count: &mut i32,
word_stressed_count: &mut i32,
table: Option<&ActiveTable>,
) -> RulesResult {
translate_rules_phdata(
dict,
word_buf,
word_start,
word_flags,
dict_flags,
letter_bits,
dict_condition,
word_vowel_count,
word_stressed_count,
// No phoneme data available through this entry point.
None,
table,
)
}
/// Translates the word starting at `word_start` in `word_buf` into phonemes
/// by repeatedly applying the best-matching dictionary rule.
///
/// # Parameters
/// * `dict` - dictionary providing the rule groups and rule data.
/// * `word_buf` - buffer containing the word; translation stops at a `0`
///   byte, a space, or the end of the buffer.
/// * `word_start` - index of the first byte of the word.
/// * `word_flags`, `dict_flags`, `dict_condition`, `letter_bits` - passed to
///   the rule matcher through the rule context.
/// * `word_vowel_count`, `word_stressed_count` - running counters; they are
///   read for every rule lookup and updated from the emitted phonemes when
///   both `phdata` and `table` are supplied.
/// * `phdata`, `table` - optional phoneme data used only for that counting.
///
/// # Returns
/// A `RulesResult`; the default (empty) result when the dictionary has no
/// rules. `spellword` is set when an alphabetic character has no rule group
/// and the default group does not match it.
///
/// # Rule-group lookup order for each character
/// 1. the group-3 table entry selected by the character's code point
///    relative to the dictionary's letter-bits offset,
/// 2. the group-3 entry selected by the first byte,
/// 3. a two-byte group (compared against the single-byte group; the two-byte
///    match gets a 35-point bonus),
/// 4. the single-byte group, or the default group (index 0) if there is none.
///
/// If no group at all was consulted for a character, the whole UTF-8
/// character is skipped so the scan never resumes on a continuation byte.
pub fn translate_rules_phdata(
    dict: &Dictionary,
    word_buf: &[u8],
    word_start: usize,
    word_flags: u32,
    dict_flags: u32,
    letter_bits: &[u8; 256],
    dict_condition: u32,
    word_vowel_count: &mut i32,
    word_stressed_count: &mut i32,
    phdata: Option<&PhonemeData>,
    table: Option<&ActiveTable>,
) -> RulesResult {
    if dict.rules().is_empty() {
        return RulesResult::default();
    }

    let mut out_phonemes: Vec<u8> = Vec::new();
    let mut out_end_phonemes: Vec<u8> = Vec::new();
    let mut out_end_type = 0u32;
    let mut out_suffix_start = 0usize;
    // Length of `out_phonemes` before the first ending rule contributed.
    let mut out_stem_ph_len = 0usize;
    let mut spellword = false;
    let mut pos = word_start;
    let rules_abs_base = dict.rules_offset;

    while pos < word_buf.len() {
        let c = word_buf[pos];
        if c == 0 || c == b' ' {
            break;
        }
        let (wc, wc_bytes) = utf8_decode(word_buf, pos);
        // Total byte length of the character at `pos`.
        let char_len = wc_bytes + 1;

        // The counters only change after all lookups for this character, so
        // one snapshot serves every rule context built in this iteration.
        let vowels_so_far = *word_vowel_count;
        let stressed_so_far = *word_stressed_count;
        let ctx_for = |group_length: usize| {
            make_ctx(
                word_buf,
                pos,
                group_length,
                word_flags,
                dict_flags,
                dict_condition,
                letter_bits,
                dict.letter_bits_offset,
                vowels_so_far,
                stressed_so_far,
            )
        };

        let mut found = false;
        let mut match1 = MatchRecord::empty_best();
        // Stays at `pos` until a rule group has been consulted; the fallback
        // at the end of the loop then skips the whole character.
        let mut next_pos1 = pos;

        // 1. Group 3, selected by code point relative to the letter offset.
        let lbo = dict.letter_bits_offset as u64;
        if lbo > 0 && (wc as u64) >= lbo {
            let g3_idx = (wc as u64 - lbo) as usize;
            if g3_idx < 128 {
                let c2 = (g3_idx + 1) as u8;
                if let Some(g3_rules) = dict.group3(c2) {
                    let g3_abs = dict.groups.groups3[g3_idx].unwrap_or(0);
                    let ctx = ctx_for(char_len);
                    let mut np = pos;
                    match1 = match_rule(dict, &ctx, g3_rules, g3_abs, &mut np);
                    next_pos1 = np;
                    found = match1.points > 0 || np > pos;
                }
            }
        }

        // 2. Group 3, selected by the first byte.
        if !found {
            if let Some(g3_rules) = dict.group3(c) {
                let g3_abs = dict.groups.groups3[(c.wrapping_sub(1)) as usize].unwrap_or(0);
                let ctx = ctx_for(char_len);
                let mut np = pos;
                match1 = match_rule(dict, &ctx, g3_rules, g3_abs, &mut np);
                next_pos1 = np;
                found = match1.points > 0 || np > pos;
            }
        }

        // 3. Two-byte group, weighed against the single-byte group.
        let n = dict.groups.groups2_count[c as usize] as usize;
        if !found && n > 0 && pos + 1 < word_buf.len() {
            let c2 = word_buf[pos + 1];
            let key = (c as u16) | ((c2 as u16) << 8);
            let g1 = dict.groups.groups2_start[c as usize] as usize;
            let g_end = (g1 + n).min(dict.groups.groups2.len());
            for g in g1..g_end {
                if dict.groups.groups2[g].key == key {
                    found = true;
                    let entry = &dict.groups.groups2[g];
                    let g2_rules = &dict.data[entry.offset..];
                    let ctx2 = ctx_for(2);
                    let mut np2 = pos;
                    let mut m2 = match_rule(dict, &ctx2, g2_rules, entry.offset, &mut np2);
                    if m2.points > 0 {
                        // Prefer the longer group key when it matches at all.
                        m2.points += 35;
                    }
                    let ctx1 = ctx_for(1);
                    if let Some(g1_rules) = dict.group1(c) {
                        let g1_abs = dict.groups.groups1[c as usize].unwrap_or(rules_abs_base);
                        let mut np1 = pos;
                        match1 = match_rule(dict, &ctx1, g1_rules, g1_abs, &mut np1);
                        next_pos1 = np1;
                    }
                    if m2.points >= match1.points {
                        match1 = m2;
                        next_pos1 = np2;
                    }
                    break;
                }
            }
        }

        // 4. Single-byte group, or the default group when there is none.
        if !found {
            if let Some(g1_rules) = dict.group1(c) {
                let g1_abs = dict.groups.groups1[c as usize].unwrap_or(rules_abs_base);
                let ctx = ctx_for(1);
                let mut np = pos;
                match1 = match_rule(dict, &ctx, g1_rules, g1_abs, &mut np);
                next_pos1 = np;
            } else {
                if let Some(def_rules) = dict.group1(0) {
                    let def_abs = dict.groups.groups1[0].unwrap_or(rules_abs_base);
                    let ctx = ctx_for(0);
                    let mut np = pos;
                    match1 = match_rule(dict, &ctx, def_rules, def_abs, &mut np);
                    next_pos1 = np;
                }
                if match1.points == 0 && is_alpha(wc) {
                    // A letter that no rule can handle: give up on rules.
                    spellword = true;
                    break;
                }
            }
        }

        if match1.points > 0 {
            if match1.phonemes_offset != MatchRecord::NONE {
                // The phoneme string runs up to the next 0 byte.
                let ph_start = match1.phonemes_offset;
                let mut ph_end = ph_start;
                while ph_end < dict.data.len() && dict.data[ph_end] != 0 {
                    ph_end += 1;
                }

                if let (Some(ph), Some(at)) = (phdata, table) {
                    // NOTE(review): `typ == 1` / `typ == 2` are presumably the
                    // stress and vowel phoneme types - confirm against the
                    // phoneme table definitions.
                    let mut unstress_mark = false;
                    for &code in &dict.data[ph_start..ph_end] {
                        if let Some(phoneme) = ph.get(code, at) {
                            if phoneme.typ == 1 {
                                if phoneme.std_length < 4 {
                                    unstress_mark = true;
                                }
                            } else if phoneme.typ == 2 {
                                if (phoneme.phflags & 2) == 0 && !unstress_mark {
                                    *word_stressed_count += 1;
                                }
                                unstress_mark = false;
                                *word_vowel_count += 1;
                            } else {
                                unstress_mark = false;
                            }
                        }
                    }
                }

                let before_len = out_phonemes.len();
                out_phonemes.extend_from_slice(&dict.data[ph_start..ph_end]);
                if match1.end_type != 0 && out_end_type == 0 {
                    // First ending rule: remember where the stem's phonemes
                    // stop and keep the ending's phonemes separately.
                    out_stem_ph_len = before_len;
                    let suffix_slice = &out_phonemes[before_len..];
                    out_end_phonemes.extend_from_slice(suffix_slice);
                }
            }
            if match1.end_type != 0 {
                out_end_type = match1.end_type;
                out_suffix_start = pos;
            }
        }

        if next_pos1 <= pos {
            // No rule group was consulted: skip the whole character.
            pos += char_len;
        } else {
            pos = next_pos1;
        }
    }

    if out_end_type != 0 && out_stem_ph_len <= out_phonemes.len() {
        out_phonemes.truncate(out_stem_ph_len);
    }
    out_phonemes.push(0);
    out_end_phonemes.push(0);
    RulesResult {
        phonemes: out_phonemes,
        end_phonemes: out_end_phonemes,
        end_type: out_end_type,
        suffix_start: out_suffix_start,
        spellword,
    }
}
/// Assembles the `RuleContext` used for one `match_rule` call.
///
/// `unpron_test` is derived from `FLAG_UNPRON_TEST` in `word_flags`;
/// `expect_verb` is always `false` and `suffix_option` always `0`. All other
/// fields are copied from the arguments.
fn make_ctx<'a>(
    word: &'a [u8],
    pos: usize,
    group_length: usize,
    word_flags: u32,
    dict_flags: u32,
    dict_condition: u32,
    letter_bits: &'a [u8; 256],
    letter_bits_offset: u32,
    vowel_count: i32,
    stressed_count: i32,
) -> RuleContext<'a> {
    let unpron_test = (word_flags & FLAG_UNPRON_TEST) != 0;
    RuleContext {
        // Where to match.
        word,
        word_pos: pos,
        group_length,
        // Letter classification.
        letter_bits,
        letter_bits_offset,
        // Flags and conditions.
        word_flags,
        dict_flags0: dict_flags,
        dict_condition,
        unpron_test,
        expect_verb: false,
        suffix_option: 0,
        // Running counters.
        word_vowel_count: vowel_count,
        word_stressed_count: stressed_count,
    }
}
/// Decodes the UTF-8 character that starts at `buf[pos]`.
///
/// Returns `(code_point, extra_bytes)`, where `extra_bytes` is the number of
/// bytes *after* the lead byte that belong to the character (0 for ASCII).
///
/// Edge cases:
/// * `pos` at or past the end of `buf` yields `(0, 0)`.
/// * A multi-byte sequence cut off by the end of `buf` yields the raw lead
///   byte with `extra_bytes == 0`.
/// * A stray continuation byte (`0x80..=0xBF`) is returned as-is with
///   `extra_bytes == 0`; it is not a valid lead byte and must not absorb the
///   byte that follows it.
///
/// Continuation bytes inside a sequence are not validated.
pub fn utf8_decode(buf: &[u8], pos: usize) -> (u32, usize) {
    let lead = match buf.get(pos) {
        Some(&b) => u32::from(b),
        None => return (0, 0),
    };
    // Sequence length and payload mask are determined by the lead byte.
    let (extra, mask) = match lead {
        0x00..=0xbf => return (lead, 0),
        0xc0..=0xdf => (1usize, 0x1f),
        0xe0..=0xef => (2, 0x0f),
        _ => (3, 0x07),
    };
    match buf.get(pos + 1..=pos + extra) {
        Some(tail) => {
            let code_point = tail
                .iter()
                .fold(lead & mask, |acc, &b| (acc << 6) | (u32::from(b) & 0x3f));
            (code_point, extra)
        }
        // Truncated sequence.
        None => (lead, 0),
    }
}
/// Decodes the UTF-8 character that ends immediately before `buf[pos]`.
///
/// Walks left from `pos - 1` over continuation bytes (`10xxxxxx`) to the
/// start of the character, stopping at index 0 at the latest, and decodes
/// from there with `utf8_decode`.
///
/// Returns `(code_point, extra_bytes)` where `extra_bytes` is the distance
/// between that start and `pos - 1`. `pos == 0` yields `(0, 0)`.
fn utf8_decode_backwards(buf: &[u8], pos: usize) -> (u32, usize) {
    if pos == 0 {
        return (0, 0);
    }
    // Last byte before `pos` that is not a continuation byte; index 0 if
    // every byte in that prefix is one.
    let start = buf[..pos]
        .iter()
        .rposition(|&b| (b & 0xc0) != 0x80)
        .unwrap_or(0);
    let code_point = utf8_decode(buf, start).0;
    (code_point, pos - 1 - start)
}
/// Reads a letter-group number from `rules` at `*r_pos`.
///
/// The stored byte is the group number offset by `b'A'`; the result is
/// reduced modulo `N_LETTER_GROUPS` so that it is always non-negative.
/// On success `*r_pos` is advanced past the byte. When `*r_pos` is already at
/// or beyond the end of `rules`, `0` is returned and `*r_pos` is left
/// unchanged.
fn letter_group_no(r_pos: &mut usize, rules: &[u8]) -> usize {
    match rules.get(*r_pos) {
        None => 0,
        Some(&code) => {
            *r_pos += 1;
            let relative = i16::from(code) - i16::from(b'A');
            relative.rem_euclid(N_LETTER_GROUPS as i16) as usize
        }
    }
}
use super::N_LETTER_GROUPS;
// Unit tests. The dictionary-backed tests are skipped (they return early)
// when no English dictionary is installed at the expected system location.
#[cfg(test)]
mod tests {
use super::*;
use crate::dictionary::Dictionary;
use std::path::PathBuf;
/// Loads the English dictionary from the system data directory, or returns
/// `None` when the `en_dict` file is not present there.
fn en_dict() -> Option<Dictionary> {
let dir = PathBuf::from("/usr/share/espeak-ng-data");
if !dir.join("en_dict").exists() {
return None;
}
Some(Dictionary::load("en", &dir).unwrap())
}
/// Builds a small letter-bits table: ASCII vowels get the `LETTERGP_VOWEL2`
/// bit plus bit 0, ASCII consonants get the `LETTERGP_C` bit.
fn default_letter_bits() -> [u8; 256] {
let mut bits = [0u8; 256];
for c in b"aeiouAEIOU".iter() {
bits[*c as usize] |= (1 << LETTERGP_VOWEL2) | 1;
}
for c in b"bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ".iter() {
bits[*c as usize] |= 1 << LETTERGP_C;
}
bits
}
/// ASCII bytes decode to themselves with no extra bytes.
#[test]
fn utf8_decode_ascii() {
assert_eq!(utf8_decode(b"hello", 0), (b'h' as u32, 0));
assert_eq!(utf8_decode(b"hello", 4), (b'o' as u32, 0));
}
/// A two-byte sequence (U+00E9) decodes with one extra byte.
#[test]
fn utf8_decode_two_byte() {
let buf: &[u8] = &[0xc3, 0xa9];
let (c, xb) = utf8_decode(buf, 0);
assert_eq!(c, 0xe9);
assert_eq!(xb, 1);
}
/// The hash bucket for "the" must point between the header and the rules.
#[test]
fn hash_word_in_rules() {
let dict = match en_dict() {
Some(d) => d,
None => return,
};
let h = super::super::lookup::hash_word(b"the");
let bucket_start = dict.hashtab[h];
assert!(
bucket_start >= 8 && bucket_start < dict.rules_offset,
"bucket for 'the' should be in word-list region"
);
}
/// Smoke test: translating a one-letter word must not panic. The result is
/// not inspected.
#[test]
fn translate_rules_short_word() {
let dict = match en_dict() {
Some(d) => d,
None => return,
};
// Built but unused: the call below uses the dictionary's own table.
let _letter_bits = default_letter_bits();
let word_buf = b" a ";
let mut vcount = 0i32;
let mut scount = 0i32;
let result = translate_rules(
&dict,
word_buf,
1,
0,
0,
&dict.letter_bits,
0,
&mut vcount,
&mut scount,
None,
);
let _ = result;
}
/// "hello" must be fully handled by the rules (no spell-out fallback).
#[test]
fn translate_rules_hello() {
let dict = match en_dict() {
Some(d) => d,
None => return,
};
// Built but unused: the call below uses the dictionary's own table.
let _letter_bits = default_letter_bits();
let word_buf = b" hello ";
let mut vcount = 0i32;
let mut scount = 0i32;
let result = translate_rules(
&dict,
word_buf,
1,
0,
0,
&dict.letter_bits,
0,
&mut vcount,
&mut scount,
None,
);
assert!(
!result.spellword,
"hello should not trigger spellword; phonemes={:?}",
result.phonemes
);
}
}