use crate::{encode, english, utils, with_encoder};
/// Reports whether `c` is one of the IPA symbols this module treats as a
/// phonetic marker: `θ`, `ə`, `æ`, `ŋ` or the length mark `ː`.
///
/// Any other character, including plain ASCII letters, yields `false`.
pub(crate) fn is_ipa_phonetic_symbol(c: char) -> bool {
    // The complete set of symbols that count as evidence of an IPA transcription.
    const IPA_MARKERS: [char; 5] = ['θ', 'ə', 'æ', 'ŋ', 'ː'];
    IPA_MARKERS.contains(&c)
}
pub(crate) fn detect_ipa_context(text: &str) -> bool {
let mut has_group_start = false;
let mut has_ipa_symbol = false;
for c in text.chars() {
has_group_start |= matches!(c, '[' | '/');
has_ipa_symbol |= is_ipa_phonetic_symbol(c);
if has_group_start && has_ipa_symbol {
break;
}
}
if !has_group_start || !has_ipa_symbol {
return false;
}
let chars: Vec<char> = text.chars().collect();
let mut i = 0;
while i < chars.len() {
match chars[i] {
'[' => {
if let Some(rel) = chars[i + 1..].iter().position(|&c| c == ']') {
let inner: &[char] = &chars[i + 1..i + 1 + rel];
if inner.iter().any(|c| is_ipa_phonetic_symbol(*c)) {
return true;
}
i += rel + 2;
continue;
}
}
'/' => {
if let Some(rel) = chars[i + 1..].iter().position(|&c| c == '/') {
let inner: &[char] = &chars[i + 1..i + 1 + rel];
if inner.iter().any(|c| is_ipa_phonetic_symbol(*c)) {
return true;
}
i += rel + 2;
continue;
}
}
_ => {}
}
i += 1;
}
false
}
/// Encodes `text` into bytes, treating `[...]` and `/.../` spans as IPA
/// transcriptions.
///
/// Characters outside a span are buffered and encoded as one run when the next
/// delimiter or space is reached, or at the end of input. The run goes through
/// `with_encoder(true, ..)` when `text` contains any character accepted by
/// `utils::is_korean_char`, and through `encode` otherwise. Characters inside a
/// span are encoded one at a time with `encode_ipa_char`. A space always
/// produces a single `0` byte.
///
/// # Errors
///
/// Propagates any error from encoding a buffered run, and returns
/// `"Unknown IPA character: .."` when a character inside a span has no
/// encoding.
pub(crate) fn encode_ipa(text: &str) -> Result<Vec<u8>, String> {
    /// Looks past any trailing `0` bytes and, if the byte found there is `50`,
    /// removes it. Going by the helper's name, `50` is the terminator left by a
    /// preceding English run — NOTE(review): confirm against the encoder.
    fn strip_trailing_english_terminator_before_bracket(out: &mut Vec<u8>) {
        if let Some(last) = out.iter().rposition(|&byte| byte != 0) {
            if out[last] == 50 {
                out.remove(last);
            }
        }
    }

    // Decided once for the whole input, not per buffered run.
    let route_through_encoder = text.chars().any(utils::is_korean_char);

    // Encodes the buffered run (if any), appends it to `sink` and empties the buffer.
    let flush_pending = |run: &mut String, sink: &mut Vec<u8>| -> Result<(), String> {
        if run.is_empty() {
            return Ok(());
        }
        let encoded = if route_through_encoder {
            with_encoder(true, |encoder| {
                let mut result = Vec::new();
                encoder.encode(run.as_str(), &mut result)?;
                Ok::<Vec<u8>, String>(result)
            })?
        } else {
            encode(run.as_str())?
        };
        sink.extend(encoded);
        run.clear();
        Ok(())
    };

    let mut out: Vec<u8> = Vec::new();
    let mut pending = String::new();
    let mut in_brackets = false;
    let mut in_slashes = false;

    for ch in text.chars() {
        // Ordinary text outside any IPA span is only buffered for later.
        let is_structural = matches!(ch, '[' | ']' | '/' | ' ');
        if !is_structural && !in_brackets && !in_slashes {
            pending.push(ch);
            continue;
        }

        // Everything else first emits whatever text has been buffered so far.
        flush_pending(&mut pending, &mut out)?;
        match ch {
            '[' => {
                strip_trailing_english_terminator_before_bracket(&mut out);
                out.extend_from_slice(&[16, 24, 55]);
                in_brackets = true;
            }
            ']' => {
                out.extend_from_slice(&[24, 62]);
                in_brackets = false;
            }
            // A slash alternates between closing and opening a span.
            '/' if in_slashes => {
                out.extend_from_slice(&[24, 12]);
                in_slashes = false;
            }
            '/' => {
                strip_trailing_english_terminator_before_bracket(&mut out);
                out.extend_from_slice(&[16, 24, 12]);
                in_slashes = true;
            }
            ' ' => out.push(0),
            // Any other character reaching this point lies inside an IPA span.
            _ => {
                let bytes =
                    encode_ipa_char(ch).ok_or_else(|| format!("Unknown IPA character: {ch:?}"))?;
                out.extend(bytes);
            }
        }
    }

    flush_pending(&mut pending, &mut out)?;
    Ok(out)
}
/// Maps a single character from an IPA span to its encoded bytes.
///
/// Five IPA symbols have fixed encodings listed here. Every other character is
/// delegated to `english::encode_english`, whose single code becomes a one-byte
/// result.
///
/// Returns `None` when the character is not one of the five symbols and
/// `english::encode_english` reports an error for it.
pub(crate) fn encode_ipa_char(ch: char) -> Option<Vec<u8>> {
    let fixed: &[u8] = match ch {
        'θ' => &[40, 57],
        'ə' => &[34],
        'æ' => &[41],
        'ŋ' => &[43],
        'ː' => &[18],
        // Not a dedicated IPA symbol: fall back to the English table.
        _ => return english::encode_english(ch).ok().map(|code| vec![code]),
    };
    Some(fixed.to_vec())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: runs `encode_ipa` on two inputs and discards the results,
    /// so it only checks that the calls return instead of panicking. Per the
    /// test name, these inputs are meant to take the path that does not use
    /// the Korean encoder.
    #[test]
    fn ipa_input_without_korean_uses_plain_encode() {
        for input in ["think [θɪŋk]", "[θ]/æ/"] {
            let _ = encode_ipa(input);
        }
    }

    /// `encode_ipa_char` must yield `None` for characters it cannot encode,
    /// checked here with an emoji and a Hangul syllable.
    #[test]
    fn ipa_encode_char_none_for_unsupported() {
        for ch in ['\u{1F600}', '한'] {
            assert!(encode_ipa_char(ch).is_none());
        }
    }
}