use std::collections::HashSet;
use std::sync::LazyLock;
use super::CharNormalizer;
use crate::detection::Script;
use crate::normalizer::CharOrStr;
use crate::Token;
/// Lazily-loaded set of Unicode code points to strip, read from a precomputed
/// binary dictionary of raw `u32` values (4 bytes per code point).
static NONSPACING_MARKS: LazyLock<HashSet<u32>> = LazyLock::new(|| {
    let bytes = include_bytes!("../../dictionaries/bin/nonspacing_mark/marks.bin");
    bytes
        .chunks_exact(4)
        // NOTE(review): native-endian decoding assumes marks.bin was generated
        // with the same endianness as the target — TODO confirm for cross-compiled
        // big-endian targets.
        .map(|chunk| u32::from_ne_bytes(chunk.try_into().unwrap()))
        .collect()
});
pub struct NonspacingMarkNormalizer;
impl CharNormalizer for NonspacingMarkNormalizer {
    /// Keeps `c` unchanged unless it is a nonspacing mark, in which case it is
    /// dropped from the lemma (`None` removes the character).
    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
        if is_nonspacing_mark(c) {
            None
        } else {
            Some(c.into())
        }
    }

    /// Only runs on tokens whose script is known to carry nonspacing marks,
    /// and only when the lemma actually contains at least one such mark.
    fn should_normalize(&self, token: &Token) -> bool {
        let script_is_supported = matches!(
            token.script,
            Script::Hebrew | Script::Thai | Script::Arabic | Script::Latin | Script::Greek
        );
        script_is_supported && token.lemma().chars().any(is_nonspacing_mark)
    }
}
/// Returns `true` when `c` appears in the precomputed nonspacing-mark table.
fn is_nonspacing_mark(c: char) -> bool {
    NONSPACING_MARKS.contains(&u32::from(c))
}
#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;
    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::{Normalizer, NormalizerOption};
    use crate::token::TokenKind;

    /// Input tokens, one per supported script, each containing at least one
    /// nonspacing mark: Thai tone mark, Arabic fatha, Hebrew niqqud.
    fn tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("ง่าย".to_string()),
                char_end: "ง่าย".chars().count(),
                byte_end: "ง่าย".len(),
                script: Script::Thai,
                ..Default::default()
            },
            Token {
                lemma: Owned("أَب".to_string()),
                char_end: "أَب".chars().count(),
                byte_end: "أَب".len(),
                script: Script::Arabic,
                ..Default::default()
            },
            Token {
                lemma: Owned("כָּבוֹד".to_string()),
                char_end: "כָּבוֹד".chars().count(),
                byte_end: "כָּבוֹד".len(),
                script: Script::Hebrew,
                ..Default::default()
            },
        ]
    }

    /// Expected output of THIS normalizer alone: marks removed from the lemma.
    /// Each `char_map` entry is (bytes of the original char, bytes it maps to
    /// after normalization); removed marks map to 0 bytes.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("งาย".to_string()),
                char_end: 4,
                byte_end: 12,
                // 4 Thai chars of 3 bytes each; the tone mark (2nd) is dropped.
                char_map: Some(vec![(3, 3), (3, 0), (3, 3), (3, 3)]),
                script: Script::Thai,
                ..Default::default()
            },
            Token {
                lemma: Owned("أب".to_string()),
                char_end: "أَب".chars().count(),
                byte_end: "أَب".len(),
                // 3 Arabic chars of 2 bytes each; the fatha (2nd) is dropped.
                char_map: Some(vec![(2, 2), (2, 0), (2, 2)]),
                script: Script::Arabic,
                ..Default::default()
            },
            Token {
                lemma: Owned("כבוד".to_string()),
                char_end: "כָּבוֹד".chars().count(),
                byte_end: "כָּבוֹד".len(),
                script: Script::Hebrew,
                // 7 Hebrew chars of 2 bytes each; the 3 niqqud marks are dropped.
                char_map: Some(vec![(2, 2), (2, 0), (2, 0), (2, 2), (2, 2), (2, 0), (2, 2)]),
                ..Default::default()
            },
        ]
    }

    /// Expected output of the FULL normalizer pipeline (kind is set to Word).
    /// NOTE(review): the Arabic lemma here is "اب" rather than "أب" —
    /// presumably a later normalizer in the pipeline folds أ to ا; verify
    /// against the pipeline's other normalizers.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![
            Token {
                lemma: Owned("งาย".to_string()),
                char_end: 4,
                byte_end: 12,
                char_map: Some(vec![(3, 3), (3, 0), (3, 3), (3, 3)]),
                script: Script::Thai,
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("اب".to_string()),
                char_end: "أَب".chars().count(),
                byte_end: "أَب".len(),
                char_map: Some(vec![(2, 2), (2, 0), (2, 2)]),
                script: Script::Arabic,
                kind: TokenKind::Word,
                ..Default::default()
            },
            Token {
                lemma: Owned("כבוד".to_string()),
                char_end: "כָּבוֹד".chars().count(),
                byte_end: "כָּבוֹד".len(),
                script: Script::Hebrew,
                char_map: Some(vec![(2, 2), (2, 0), (2, 0), (2, 2), (2, 2), (2, 0), (2, 2)]),
                kind: TokenKind::Word,
                ..Default::default()
            },
        ]
    }

    // Shared macro: checks normalizer_result() against this normalizer alone
    // and normalized_tokens() against the whole pipeline.
    test_normalizer!(NonspacingMarkNormalizer, tokens(), normalizer_result(), normalized_tokens());
}