use std::error::Error;
use std::ffi::OsString;
use std::fmt::{Display, Formatter};
use std::path::PathBuf;
use std::str::FromStr;
use either::Either;
use enum_iterator::Sequence;
use regex::Regex;
use regex_optim::OptimizedRegex;
pub use rule::RuleType;
use serde::{Deserialize, Serialize};
use crate::beider_morse::engine::PhoneticEngine;
use crate::beider_morse::lang::Langs;
pub use crate::beider_morse::languages::LanguageSet;
use crate::beider_morse::languages::Languages;
use crate::beider_morse::rule::Rules;
use crate::{Encoder, PhoneticError};
mod engine;
mod lang;
mod languages;
mod regex_optim;
mod rule;
/// Short code for [`NameType::Ashkenazi`]; used when displaying and parsing a `NameType`.
const ASH: &str = "ash";
/// Short code for [`NameType::Generic`]; used when displaying and parsing a `NameType`.
const GEN: &str = "gen";
/// Short code for [`NameType::Sephardic`]; used when displaying and parsing a `NameType`.
const SEP: &str = "sep";
/// Value of `max_phonemes` that [`BeiderMorseBuilder::new`] starts with.
const DEFAULT_MAX_PHONEMES: usize = 20;
/// Errors reported by the `beider_morse` module.
///
/// Every variant except `BadContextRegex` carries a `String` payload that is
/// included in the [`Display`] output.
#[derive(Debug, Clone, PartialEq)]
pub enum BMError {
/// A string or file name did not correspond to any [`NameType`]; carries the rejected value.
UnknownNameType(String),
/// Displayed as `Unknown RuleType <payload>`.
/// NOTE(review): constructed outside this section — presumably the `RuleType` parsing counterpart; confirm.
UnknownRuleType(String),
/// Created when converting a [`std::io::Error`]; carries that error's message.
ParseConfiguration(String),
/// Displayed as `Wrong file name : <payload>`; where it is raised is not visible in this section.
WrongFilename(String),
/// Payload is displayed verbatim; raised outside this section.
WrongPhoneme(String),
/// Created when converting a [`regex::Error`]; keeps the original error.
BadContextRegex(regex::Error),
/// Payload is displayed verbatim; raised outside this section.
NotABoolean(String),
/// Payload is displayed verbatim; raised outside this section.
BadRule(String),
}
impl Display for BMError {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
BMError::UnknownNameType(error) => write!(f, "Unknown NameType {error}"),
BMError::ParseConfiguration(error) => write!(f, "Error reading files {error}"),
BMError::WrongFilename(error) => write!(f, "Wrong file name : {error}"),
BMError::WrongPhoneme(error) => write!(f, "{error}"),
BMError::BadContextRegex(error) => write!(f, "{error}"),
BMError::NotABoolean(error) => write!(f, "{error}"),
BMError::BadRule(error) => write!(f, "{error}"),
BMError::UnknownRuleType(error) => write!(f, "Unknown RuleType {error}"),
}
}
}
impl From<std::io::Error> for BMError {
fn from(error: std::io::Error) -> Self {
Self::ParseConfiguration(error.to_string())
}
}
impl From<regex::Error> for BMError {
fn from(error: regex::Error) -> Self {
Self::BadContextRegex(error)
}
}
// Empty impl: `BMError` uses only the default methods of `std::error::Error`.
impl Error for BMError {}
/// Module-private abstraction over "does this pattern match the input?".
///
/// In this section it is implemented for `Either<Regex, OptimizedRegex>`.
trait IsMatch {
/// Returns `true` when `input` matches.
fn is_match(&self, input: &str) -> bool;
}
/// Dispatches the match test to whichever matcher the `Either` holds.
impl IsMatch for Either<Regex, OptimizedRegex> {
    fn is_match(&self, input: &str) -> bool {
        match self {
            Self::Right(optimized) => optimized.is_match(input),
            Self::Left(pattern) => pattern.is_match(input),
        }
    }
}
/// The three name types known to this module.
///
/// Each variant is (de)serialized by serde under the same short code that
/// `Display` prints and `FromStr` accepts (`ash`, `gen`, `sep`).
#[derive(
Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash, Serialize, Deserialize, Sequence,
)]
pub enum NameType {
/// Short code `ash`.
#[serde(rename = "ash")]
Ashkenazi,
/// Short code `gen`.
#[serde(rename = "gen")]
Generic,
/// Short code `sep`.
#[serde(rename = "sep")]
Sephardic,
}
impl NameType {
/// Returns `<code>_languages.txt`, where `<code>` is this name type's
/// `Display` output (`ash`, `gen` or `sep`).
fn language_filename(&self) -> String {
format!("{self}_languages.txt")
}
}
impl Display for NameType {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
let r = match self {
Self::Ashkenazi => ASH,
Self::Generic => GEN,
Self::Sephardic => SEP,
};
write!(f, "{r}")
}
}
impl FromStr for NameType {
type Err = BMError;
fn from_str(value: &str) -> Result<Self, Self::Err> {
match value {
ASH => Ok(Self::Ashkenazi),
GEN => Ok(Self::Generic),
SEP => Ok(Self::Sephardic),
_ => Err(BMError::UnknownNameType(value.to_string())),
}
}
}
/// Recognizes a [`NameType`] from the file name produced by
/// `NameType::language_filename`.
impl TryFrom<OsString> for NameType {
    type Error = BMError;

    /// # Errors
    ///
    /// Returns [`BMError::UnknownNameType`] with the lossily converted
    /// `value` when it equals none of the three language file names.
    fn try_from(value: OsString) -> Result<Self, Self::Error> {
        // Candidates are tried in this order; the first exact match wins.
        let candidates = [NameType::Ashkenazi, NameType::Generic, NameType::Sephardic];
        candidates
            .iter()
            .copied()
            .find(|name_type| value == OsString::from(name_type.language_filename()))
            .ok_or_else(|| BMError::UnknownNameType(value.to_string_lossy().to_string()))
    }
}
/// Loaded configuration (`Langs` and `Rules`) that a [`BeiderMorseBuilder`] borrows.
///
/// `Default` is derived only when the `embedded_bm` feature is enabled.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "embedded_bm", derive(Default))]
pub struct ConfigFiles {
// Looked up by `NameType` when an encoder is built.
langs: Langs,
// Handed by reference to the engine when an encoder is built.
rules: Rules,
}
impl ConfigFiles {
    /// Builds the configuration from the files found in `directory`.
    ///
    /// The languages are read first, then used to construct the `Langs` and
    /// `Rules` tables, in that order.
    ///
    /// # Errors
    ///
    /// Returns the first [`PhoneticError`] produced by any of the three
    /// loading steps.
    pub fn new(directory: &PathBuf) -> Result<Self, PhoneticError> {
        let languages = Languages::try_from(directory)?;
        Ok(Self {
            langs: Langs::new(directory, &languages)?,
            rules: Rules::new(directory, &languages)?,
        })
    }
}
/// Encoder produced by [`BeiderMorseBuilder::build`].
///
/// It wraps a `PhoneticEngine` and forwards every encoding call to it.
#[derive(Debug, Clone)]
pub struct BeiderMorse<'a> {
engine: PhoneticEngine<'a>,
}
impl BeiderMorse<'_> {
    /// Encodes `value` using the caller-supplied `languages`.
    ///
    /// This forwards to the engine's `encode_with_language_set`.
    pub fn encode_with_languages(&self, value: &str, languages: &LanguageSet) -> String {
        let engine = &self.engine;
        engine.encode_with_language_set(value, languages)
    }
}
impl Encoder for BeiderMorse<'_> {
    /// Encodes `value` by forwarding to the engine's `encode`.
    fn encode(&self, value: &str) -> String {
        let engine = &self.engine;
        engine.encode(value)
    }
}
/// Builder collecting the settings used to create a [`BeiderMorse`] encoder.
///
/// It borrows a [`ConfigFiles`]; the encoder it builds borrows from the same value.
#[derive(Debug, Clone)]
pub struct BeiderMorseBuilder<'a> {
// Source of the `Langs` and `Rules` given to the engine.
config_files: &'a ConfigFiles,
// Selects which entry of `config_files.langs` is used.
name_type: NameType,
// Converted with `.into()` before being given to the engine.
rule_type: RuleType,
// Forwarded unchanged to the engine.
concat: bool,
// Forwarded unchanged to the engine.
max_phonemes: usize,
}
impl<'a> BeiderMorseBuilder<'a> {
pub fn new(config_files: &'a ConfigFiles) -> Self {
Self {
config_files,
name_type: NameType::Generic,
rule_type: RuleType::Approx,
concat: true,
max_phonemes: DEFAULT_MAX_PHONEMES,
}
}
pub fn name_type(mut self, name_type: NameType) -> Self {
self.name_type = name_type;
self
}
pub fn rule_type(mut self, rule_type: RuleType) -> Self {
self.rule_type = rule_type;
self
}
pub fn concat(mut self, concat: bool) -> Self {
self.concat = concat;
self
}
pub fn max_phonemes(mut self, max_phonemes: usize) -> Self {
self.max_phonemes = max_phonemes;
self
}
pub fn build(&self) -> BeiderMorse<'a> {
let lang = self.config_files.langs.get(&self.name_type).unwrap();
let rules = &self.config_files.rules;
let engine = PhoneticEngine {
rules,
lang,
name_type: self.name_type,
rule_type: self.rule_type.into(),
concat: self.concat,
max_phonemes: self.max_phonemes,
};
BeiderMorse { engine }
}
}
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(feature = "embedded_bm")]
    use crate::beider_morse::rule::PrivateRuleType;

    lazy_static::lazy_static! {
        static ref CONFIG_FILE: ConfigFiles =
            ConfigFiles::new(&PathBuf::from("./test_assets/cc-rules/")).unwrap();
    }

    /// Checks the four builder settings in one go, in the order
    /// rule type, name type, concat, max phonemes.
    #[track_caller]
    fn assert_builder_state(
        builder: &BeiderMorseBuilder<'_>,
        rule_type: RuleType,
        name_type: NameType,
        concat: bool,
        max_phonemes: usize,
    ) {
        assert_eq!(builder.rule_type, rule_type);
        assert_eq!(builder.name_type, name_type);
        assert_eq!(builder.concat, concat);
        assert_eq!(builder.max_phonemes, max_phonemes);
    }

    /// Smoke test: encoding every `char` in `U+0000..U+FFFF` must not panic.
    /// The encoded values themselves are discarded.
    #[test]
    fn test_all_chars() -> Result<(), BMError> {
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        ('\u{0000}'..'\u{ffff}').for_each(|ch| {
            let _ = encoder.encode(&ch.to_string());
        });
        Ok(())
    }

    /// With `max_phonemes` set to 10, the result for this markup-like input is
    /// non-empty and has at most 10 `|`-separated parts.
    #[test]
    fn test_oom() -> Result<(), BMError> {
        let input = "200697900'-->�</ bceaeef >aadaabcf\"aedfbff<!--\'-->?>cae\
                     cfaaa><?&#<!--</script>&lang&fc;aadeaf?>>&bdquo< cc =\"abff\" /></ afe ><script>\
                     <!-- f(';< cf aefbeef = \"bfabadcf\" ebbfeedd = fccabeb >";
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE)
            .name_type(NameType::Generic)
            .rule_type(RuleType::Exact)
            .max_phonemes(10)
            .build();
        let encoded = encoder.encode(input);
        assert!(!encoded.is_empty());
        let parts = encoded.split('|').count();
        assert!(parts <= 10);
        Ok(())
    }

    /// Every single ASCII letter, lower and upper case, encodes to something.
    #[test]
    fn test_ascii_encode_not_empty_1_letter() -> Result<(), BMError> {
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        for letter in 'a'..='z' {
            let lower = letter.to_string();
            let upper = letter.to_ascii_uppercase().to_string();
            assert_ne!(encoder.encode(&lower), "");
            assert_ne!(encoder.encode(&upper), "");
        }
        Ok(())
    }

    /// Every pair of ASCII letters, lower and upper case, encodes to something.
    #[test]
    fn test_ascii_encode_not_empty_2_letters() -> Result<(), BMError> {
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        for first in 'a'..='z' {
            for second in 'a'..='z' {
                let pair: String = [first, second].iter().collect();
                assert_ne!(encoder.encode(&pair), "");
                assert_ne!(encoder.encode(&pair.to_ascii_uppercase()), "");
            }
        }
        Ok(())
    }

    /// Names containing `á` followed by `c`, `cz` or `tz` encode to something.
    #[test]
    fn test_encode_atz_not_empty() -> Result<(), BMError> {
        let names = [
            "\u{00e1}cz",
            "\u{00e1}tz",
            "Ign\u{00e1}cz",
            "Ign\u{00e1}tz",
            "Ign\u{00e1}c",
        ];
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        names
            .iter()
            .for_each(|name| assert_ne!(encoder.encode(name), ""));
        Ok(())
    }

    /// `gna` encodes to something.
    #[test]
    fn test_encode_gna() -> Result<(), BMError> {
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        assert_ne!(encoder.encode("gna"), "");
        Ok(())
    }

    /// A 21-letter surname encodes to something.
    #[test]
    fn test_longest_english_surname() -> Result<(), BMError> {
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        assert_ne!(encoder.encode("MacGhilleseatheanaich"), "");
        Ok(())
    }

    /// Encodes every prefix (length 1 to 40) of a repeating 10-letter pattern,
    /// then one long phrase; each must encode to something.
    #[test]
    fn test_speed_check() -> Result<(), BMError> {
        let test_chars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'o', 'u'];
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        let mut string = String::with_capacity(40);
        for ch in test_chars.iter().cycle().take(40) {
            string.push(*ch);
            assert_ne!(encoder.encode(&string), "");
        }
        assert_ne!(
            encoder.encode("ItstheendoftheworldasweknowitandIfeelfine"),
            ""
        );
        Ok(())
    }

    /// The alphabet written twice encodes to something.
    #[test]
    fn test_speed_check_2() -> Result<(), BMError> {
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        let encoded = encoder.encode("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz");
        assert_ne!(encoded, "");
        Ok(())
    }

    /// Same input and expectation as `test_speed_check_2`.
    #[test]
    fn test_speed_check_3() -> Result<(), BMError> {
        let encoder = BeiderMorseBuilder::new(&CONFIG_FILE).build();
        let encoded = encoder.encode("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz");
        assert_ne!(encoded, "");
        Ok(())
    }

    /// The embedded default configuration exposes 8 exact and 22 approx rule
    /// entries for `Generic` / `"any"`.
    #[test]
    #[cfg(feature = "embedded_bm")]
    fn test_config_file_default() {
        let config_file = ConfigFiles::default();

        let exact = config_file
            .rules
            .rules(NameType::Generic, PrivateRuleType::Exact, "any");
        assert!(exact.is_some());
        assert_eq!(exact.unwrap().len(), 8);

        let approx = config_file
            .rules
            .rules(NameType::Generic, PrivateRuleType::Approx, "any");
        assert!(approx.is_some());
        assert_eq!(approx.unwrap().len(), 22);
    }

    /// Each setter changes only its own setting and leaves the others alone.
    #[test]
    fn test_builder() {
        let builder = BeiderMorseBuilder::new(&CONFIG_FILE);
        assert_builder_state(
            &builder,
            RuleType::Approx,
            NameType::Generic,
            true,
            DEFAULT_MAX_PHONEMES,
        );

        let builder = builder.concat(false);
        assert_builder_state(
            &builder,
            RuleType::Approx,
            NameType::Generic,
            false,
            DEFAULT_MAX_PHONEMES,
        );

        let builder = builder.max_phonemes(5);
        assert_builder_state(&builder, RuleType::Approx, NameType::Generic, false, 5);

        let builder = builder.name_type(NameType::Ashkenazi);
        assert_builder_state(&builder, RuleType::Approx, NameType::Ashkenazi, false, 5);

        let builder = builder.rule_type(RuleType::Exact);
        assert_builder_state(&builder, RuleType::Exact, NameType::Ashkenazi, false, 5);
    }
}