use super::{CharNormalizer, CharOrStr};
use crate::{Script, Token};
/// Character-level normalizer for Arabic-script tokens.
///
/// Strips the tatweel (kashida) elongation character and folds common letter
/// variants: hamza/madda alef forms and alef wasla to bare alef, alef maqsura
/// to ya, and ta marbuta to ha (see `normalize_arabic_char` below).
pub struct ArabicNormalizer;
impl CharNormalizer for ArabicNormalizer {
fn normalize_char(&self, c: char) -> Option<CharOrStr> {
normalize_arabic_char(c)
}
fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Arabic && token.lemma.chars().any(is_shoud_normalize)
}
}
/// Maps one Arabic character to its normalized form.
///
/// Returns `None` when the character should be deleted (the tatweel/kashida,
/// which is purely cosmetic elongation), otherwise `Some` containing either a
/// folded replacement or the character unchanged.
fn normalize_arabic_char(c: char) -> Option<CharOrStr> {
    // Tatweel carries no meaning: drop it entirely.
    if c == 'ـ' {
        return None;
    }
    let folded = match c {
        // Hamza/madda alef variants and alef wasla collapse to bare alef.
        'أ' | 'إ' | 'آ' | 'ٱ' => 'ا',
        // Alef maqsura becomes ya.
        'ى' => 'ي',
        // Ta marbuta becomes ha.
        'ة' => 'ه',
        other => other,
    };
    Some(folded.into())
}
/// True when `c` is a character that `normalize_arabic_char` rewrites or
/// removes (tatweel, the alef variants, alef maqsura, ta marbuta).
// NOTE(review): the identifier keeps its existing misspelled name ("shoud")
// because the trait impl above calls it; renaming both sites would be a
// separate, coordinated change.
fn is_shoud_normalize(c: char) -> bool {
    const NORMALIZABLE: [char; 7] = ['ـ', 'أ', 'إ', 'آ', 'ٱ', 'ى', 'ة'];
    NORMALIZABLE.contains(&c)
}
#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::{Normalizer, NormalizerOption};
    use crate::token::TokenKind;

    // Raw tokens fed to the normalizer. Two contain tatweel runs, one an alef
    // wasla, one an alef maqsura, one a ta marbuta.
    fn tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("الحمــــــد".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                ..Default::default()
            },
            Token {
                lemma: Owned("رحــــــيم".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                // A pre-existing char_map: every source char was 2 bytes wide.
                char_map: Some(vec![(2, 2); 10]),
                ..Default::default()
            },
            Token {
                lemma: Owned("ٱلحمد".to_string()),
                char_end: 5,
                byte_end: 10,
                script: Script::Arabic,
                ..Default::default()
            },
            Token {
                lemma: Owned("يومى".to_string()),
                char_end: 4,
                byte_end: 8,
                script: Script::Arabic,
                ..Default::default()
            },
            Token {
                lemma: Owned("النهاردة".to_string()),
                char_end: 8,
                byte_end: 16,
                script: Script::Arabic,
                ..Default::default()
            },
        ]
    }

    // Expected output of ArabicNormalizer alone. Deleted tatweel characters
    // show up in the char_map as (2, 0) entries (2 source bytes -> 0 bytes).
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(
                    [vec![(2, 2); 4], vec![(2, 0); 6], vec![(2, 2); 1]].concat(),
                ),
                ..Default::default()
            },
            Token {
                lemma: Owned("رحيم".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(
                    [vec![(2, 2); 2], vec![(2, 0); 6], vec![(2, 2); 2]].concat(),
                ),
                ..Default::default()
            },
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 5,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2); 5]),
                ..Default::default()
            },
            Token {
                lemma: Owned("يومي".to_string()),
                char_end: 4,
                byte_end: 8,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2); 4]),
                ..Default::default()
            },
            Token {
                lemma: Owned("النهارده".to_string()),
                char_end: 8,
                byte_end: 16,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2); 8]),
                ..Default::default()
            },
        ]
    }

    // Expected output after the full normalizer pipeline: identical lemmas and
    // char_maps, with the token kind additionally resolved to Word.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(
                    [vec![(2, 2); 4], vec![(2, 0); 6], vec![(2, 2); 1]].concat(),
                ),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("رحيم".to_string()),
                char_end: 10,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(
                    [vec![(2, 2); 2], vec![(2, 0); 6], vec![(2, 2); 2]].concat(),
                ),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("الحمد".to_string()),
                char_end: 5,
                byte_end: 10,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2); 5]),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("يومي".to_string()),
                char_end: 4,
                byte_end: 8,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2); 4]),
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("النهارده".to_string()),
                char_end: 8,
                byte_end: 16,
                script: Script::Arabic,
                char_map: Some(vec![(2, 2); 8]),
                kind: TokenKind::Word,
                ..Default::default()
            },
        ]
    }

    test_normalizer!(ArabicNormalizer, tokens(), normalizer_result(), normalized_tokens());
}