use crate::error::{Result, TextError};
use std::collections::HashMap;
/// Languages the text utilities know about, plus an `Unknown` fallback.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    English,
    Spanish,
    French,
    German,
    Italian,
    Portuguese,
    Dutch,
    Russian,
    Chinese,
    Japanese,
    Korean,
    Arabic,
    /// Fallback for anything that does not map to one of the variants above.
    Unknown,
}

impl Language {
    /// Every variant that has a dedicated two-letter code (all but `Unknown`).
    const KNOWN: [Language; 12] = [
        Language::English,
        Language::Spanish,
        Language::French,
        Language::German,
        Language::Italian,
        Language::Portuguese,
        Language::Dutch,
        Language::Russian,
        Language::Chinese,
        Language::Japanese,
        Language::Korean,
        Language::Arabic,
    ];

    /// Single source of truth: `(code, display name)` for each variant.
    ///
    /// Kept as an exhaustive `match` so adding a variant forces an update here.
    fn descriptor(self) -> (&'static str, &'static str) {
        match self {
            Language::English => ("en", "English"),
            Language::Spanish => ("es", "Spanish"),
            Language::French => ("fr", "French"),
            Language::German => ("de", "German"),
            Language::Italian => ("it", "Italian"),
            Language::Portuguese => ("pt", "Portuguese"),
            Language::Dutch => ("nl", "Dutch"),
            Language::Russian => ("ru", "Russian"),
            Language::Chinese => ("zh", "Chinese"),
            Language::Japanese => ("ja", "Japanese"),
            Language::Korean => ("ko", "Korean"),
            Language::Arabic => ("ar", "Arabic"),
            Language::Unknown => ("und", "Unknown"),
        }
    }

    /// Returns the language code: two letters for known languages, `"und"`
    /// for [`Language::Unknown`].
    pub fn iso_code(&self) -> &'static str {
        self.descriptor().0
    }

    /// Maps a language code to its variant, ignoring letter case.
    ///
    /// Any code that is not one of the known two-letter codes (including
    /// `"und"`) yields [`Language::Unknown`].
    pub fn from_iso_code(code: &str) -> Self {
        let wanted = code.to_lowercase();
        Self::KNOWN
            .iter()
            .copied()
            .find(|candidate| candidate.iso_code() == wanted)
            .unwrap_or(Language::Unknown)
    }

    /// Returns the English display name of the language.
    pub fn name(&self) -> &'static str {
        self.descriptor().1
    }
}
/// Outcome of a single [`LanguageDetector::detect`] call.
#[derive(Debug, Clone)]
pub struct LanguageDetectionResult {
/// Language whose reference profile scored highest against the input.
pub language: Language,
/// Confidence in `language`; `detect` derives it from the margin between
/// the best and the second-best score, relative to the best score.
pub confidence: f64,
/// Runner-up languages paired with their raw similarity scores, best
/// first. `detect` keeps at most three.
pub alternatives: Vec<(Language, f64)>,
}
/// Character n-gram language detector.
///
/// Input text is reduced to a table of n-gram relative frequencies and scored
/// against a small built-in reference profile for every supported language.
pub struct LanguageDetector {
    /// Reference profile per language: n-gram -> relative frequency.
    profiles: HashMap<Language, HashMap<String, f64>>,
    /// Length, in characters, of the n-grams extracted from input text.
    n_gram_size: usize,
}

impl LanguageDetector {
    /// Creates a detector that uses trigrams and the built-in profiles.
    pub fn new() -> Self {
        let mut detector = Self {
            profiles: HashMap::new(),
            n_gram_size: 3,
        };
        detector.initialize_default_profiles();
        detector
    }

    /// Creates a detector that extracts n-grams of `n_gramsize` characters.
    ///
    /// The built-in profiles are matched by exact n-gram, so only profile
    /// entries that are exactly `n_gramsize` characters long can contribute
    /// to a score.
    ///
    /// # Errors
    ///
    /// Returns [`TextError::InvalidInput`] unless `1 <= n_gramsize <= 5`.
    pub fn with_ngram_size(n_gramsize: usize) -> Result<Self> {
        if !(1..=5).contains(&n_gramsize) {
            return Err(TextError::InvalidInput(
                "N-gram size must be between 1 and 5".to_string(),
            ));
        }
        let mut detector = Self {
            profiles: HashMap::new(),
            n_gram_size: n_gramsize,
        };
        detector.initialize_default_profiles();
        Ok(detector)
    }

    /// Stores `entries` as the reference profile for `language`, replacing
    /// any profile already registered for it.
    fn insert_profile(&mut self, language: Language, entries: &[(&str, f64)]) {
        let profile = entries
            .iter()
            .map(|&(ngram, freq)| (ngram.to_string(), freq))
            .collect();
        self.profiles.insert(language, profile);
    }

    /// Populates `profiles` with the built-in reference tables.
    ///
    /// `_` in an entry stands for a word boundary (whitespace in the input).
    ///
    /// NOTE(review): lookups are exact-match on n-grams of `n_gram_size`
    /// characters, so entries of another length (the two-character CJK
    /// entries, `pour`, `para`) never match at the default size of 3.
    fn initialize_default_profiles(&mut self) {
        self.insert_profile(
            Language::English,
            &[
                ("the", 0.05),
                ("and", 0.03),
                ("ing", 0.025),
                ("ion", 0.02),
                ("tio", 0.018),
                ("ent", 0.015),
                ("ati", 0.013),
                ("her", 0.012),
                ("for", 0.011),
                ("ter", 0.01),
                ("hat", 0.009),
                ("tha", 0.009),
                ("ere", 0.008),
                ("ate", 0.008),
                ("ver", 0.007),
                ("his", 0.007),
            ],
        );
        self.insert_profile(
            Language::Spanish,
            &[
                ("que", 0.04),
                ("de_", 0.035),
                ("la_", 0.03),
                ("el_", 0.025),
                ("es_", 0.02),
                ("los", 0.018),
                ("las", 0.015),
                ("ión", 0.013),
                ("ado", 0.012),
                ("nte", 0.011),
                ("con", 0.01),
                ("par", 0.009),
                ("ara", 0.008),
                ("una", 0.008),
                ("por", 0.007),
                ("est", 0.007),
            ],
        );
        self.insert_profile(
            Language::French,
            &[
                ("de_", 0.05),
                ("le_", 0.04),
                ("que", 0.03),
                ("les", 0.025),
                ("la_", 0.02),
                ("des", 0.018),
                ("ent", 0.015),
                ("ion", 0.013),
                ("est", 0.012),
                ("ait", 0.011),
                ("pour", 0.01),
                ("ais", 0.009),
                ("ans", 0.008),
                ("ont", 0.008),
                ("une", 0.007),
                ("qui", 0.007),
            ],
        );
        self.insert_profile(
            Language::German,
            &[
                ("der", 0.05),
                ("die", 0.04),
                ("und", 0.03),
                ("den", 0.025),
                ("das", 0.02),
                ("ein", 0.018),
                ("ich", 0.015),
                ("ist", 0.013),
                ("sch", 0.012),
                ("cht", 0.011),
                ("ung", 0.01),
                ("gen", 0.009),
                ("eit", 0.008),
                ("ver", 0.008),
                ("ber", 0.007),
                ("ten", 0.007),
            ],
        );
        self.insert_profile(
            Language::Italian,
            &[
                ("che", 0.05),
                ("la_", 0.04),
                ("il_", 0.03),
                ("di_", 0.025),
                ("del", 0.02),
                ("le_", 0.018),
                ("lla", 0.015),
                ("per", 0.013),
                ("ato", 0.012),
                ("gli", 0.011),
                ("sta", 0.01),
                ("con", 0.009),
                ("ent", 0.008),
                ("ion", 0.008),
                ("are", 0.007),
                ("una", 0.007),
            ],
        );
        self.insert_profile(
            Language::Portuguese,
            &[
                ("que", 0.05),
                ("de_", 0.04),
                ("os_", 0.03),
                ("as_", 0.025),
                ("da_", 0.02),
                ("do_", 0.018),
                ("ão_", 0.015),
                ("ent", 0.013),
                ("com", 0.012),
                ("para", 0.011),
                ("uma", 0.01),
                ("est", 0.009),
                ("nte", 0.008),
                ("ção", 0.008),
                ("por", 0.007),
                ("não", 0.007),
            ],
        );
        self.insert_profile(
            Language::Dutch,
            &[
                ("de_", 0.05),
                ("het", 0.04),
                ("een", 0.03),
                ("van", 0.025),
                ("en_", 0.02),
                ("dat", 0.018),
                ("te_", 0.015),
                ("op_", 0.013),
                ("aar", 0.012),
                ("oor", 0.011),
                ("eer", 0.01),
                ("sch", 0.009),
                ("ver", 0.008),
                ("ing", 0.008),
                ("cht", 0.007),
                ("ter", 0.007),
            ],
        );
        // The original table repeated "ого" and "ени" further down with lower
        // frequencies, which silently overwrote the values below. Each n-gram
        // is now listed once, keeping its first (rank-ordered) frequency.
        self.insert_profile(
            Language::Russian,
            &[
                ("что", 0.05),
                ("ого", 0.04),
                ("как", 0.03),
                ("это", 0.025),
                ("все", 0.02),
                ("был", 0.018),
                ("ени", 0.015),
                ("ост", 0.013),
                ("ова", 0.012),
                ("про", 0.011),
                ("сто", 0.01),
                ("при", 0.008),
                ("ать", 0.007),
                ("ный", 0.007),
            ],
        );
        self.insert_profile(
            Language::Chinese,
            &[
                ("的_", 0.06),
                ("是_", 0.045),
                ("了_", 0.035),
                ("在_", 0.03),
                ("和_", 0.025),
                ("有_", 0.022),
                ("我_", 0.02),
                ("他_", 0.018),
                ("不_", 0.016),
                ("为_", 0.014),
                ("这_", 0.013),
                ("个_", 0.012),
                ("们_", 0.011),
                ("人_", 0.01),
                ("要_", 0.009),
                ("会_", 0.008),
            ],
        );
        self.insert_profile(
            Language::Japanese,
            &[
                ("の_", 0.05),
                ("に_", 0.04),
                ("は_", 0.035),
                ("を_", 0.03),
                ("た_", 0.025),
                ("と_", 0.022),
                ("が_", 0.02),
                ("で_", 0.018),
                ("る_", 0.016),
                ("す_", 0.014),
                ("い_", 0.013),
                ("ます", 0.012),
                ("した", 0.011),
                ("して", 0.01),
                ("です", 0.009),
                ("ない", 0.008),
            ],
        );
        self.insert_profile(
            Language::Korean,
            &[
                ("의_", 0.05),
                ("이_", 0.04),
                ("가_", 0.035),
                ("을_", 0.03),
                ("는_", 0.025),
                ("에_", 0.022),
                ("하_", 0.02),
                ("고_", 0.018),
                ("다_", 0.016),
                ("지_", 0.014),
                ("한_", 0.013),
                ("로_", 0.012),
                ("서_", 0.011),
                ("도_", 0.01),
                ("와_", 0.009),
                ("니_", 0.008),
            ],
        );
        self.insert_profile(
            Language::Arabic,
            &[
                ("ال_", 0.06),
                ("في_", 0.045),
                ("من_", 0.035),
                ("على", 0.03),
                ("إلى", 0.025),
                ("ها_", 0.022),
                ("أن_", 0.02),
                ("ما_", 0.018),
                ("هو_", 0.016),
                ("كان", 0.014),
                ("هذا", 0.013),
                ("عن_", 0.012),
                ("بين", 0.011),
                ("لا_", 0.01),
                ("قد_", 0.009),
                ("كل_", 0.008),
            ],
        );
    }

    /// Detects the most likely language of `text`.
    ///
    /// Every registered profile is scored against the text; the best-scoring
    /// language is reported together with up to three runners-up. Confidence
    /// is the gap between the best and second-best score relative to the best
    /// score, clamped to `[0, 1]`.
    ///
    /// When no n-gram of the text occurs in any profile (for example, text
    /// shorter than the n-gram size) the result is [`Language::Unknown`] with
    /// confidence `0.0` and no alternatives.
    ///
    /// # Errors
    ///
    /// Returns [`TextError::InvalidInput`] if `text` is empty or contains
    /// only whitespace.
    pub fn detect(&self, text: &str) -> Result<LanguageDetectionResult> {
        if text.trim().is_empty() {
            return Err(TextError::InvalidInput(
                "Cannot detect language of empty text".to_string(),
            ));
        }

        let text_profile = self.createtext_profile(text);
        let mut scores: Vec<(Language, f64)> = self
            .profiles
            .iter()
            .map(|(lang, profile)| (*lang, self.calculate_similarity(&text_profile, profile)))
            .collect();

        // Highest score first. Equal scores are ordered by ISO code so the
        // result does not depend on HashMap iteration order.
        scores.sort_by(|a, b| {
            b.1.partial_cmp(&a.1)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.0.iso_code().cmp(b.0.iso_code()))
        });

        // A best score of zero means nothing matched: picking a "winner"
        // would be arbitrary and the relative margin below would be 0/0.
        let (best_language, best_score) = match scores.first() {
            Some(&(language, score)) if score > 0.0 => (language, score),
            _ => {
                return Ok(LanguageDetectionResult {
                    language: Language::Unknown,
                    confidence: 0.0,
                    alternatives: Vec::new(),
                })
            }
        };

        let confidence = match scores.get(1) {
            Some(&(_, second_score)) => {
                ((best_score - second_score) / best_score).clamp(0.0, 1.0)
            }
            // Only one profile registered: fall back to the raw score.
            None => best_score,
        };

        Ok(LanguageDetectionResult {
            language: best_language,
            confidence,
            alternatives: scores.into_iter().skip(1).take(3).collect(),
        })
    }

    /// Builds the n-gram relative-frequency table for `text`.
    ///
    /// The text is lower-cased and every whitespace character is replaced by
    /// `_` (the word-boundary marker used by the reference profiles) before
    /// sliding a window of `n_gram_size` characters over it. Returns an empty
    /// table when the text has fewer characters than the window.
    fn createtext_profile(&self, text: &str) -> HashMap<String, f64> {
        let chars: Vec<char> = text
            .to_lowercase()
            .chars()
            .map(|c| if c.is_whitespace() { '_' } else { c })
            .collect();

        let n = self.n_gram_size;
        // `windows` panics on a zero size; the constructors never allow it,
        // but guarding here keeps this helper total.
        if n == 0 || chars.len() < n {
            return HashMap::new();
        }
        let total_ngrams = (chars.len() - n + 1) as f64;

        let mut ngram_counts: HashMap<String, usize> = HashMap::new();
        for window in chars.windows(n) {
            *ngram_counts
                .entry(window.iter().collect::<String>())
                .or_insert(0) += 1;
        }

        ngram_counts
            .into_iter()
            .map(|(ngram, count)| (ngram, count as f64 / total_ngrams))
            .collect()
    }

    /// Scores `profile1` (the text) against `profile2` (a reference profile).
    ///
    /// The score is the dot product of the two frequency tables divided by
    /// the Euclidean norm of `profile1` only; the reference profile's norm is
    /// not factored in. Returns `0.0` for an empty `profile1`.
    fn calculate_similarity(
        &self,
        profile1: &HashMap<String, f64>,
        profile2: &HashMap<String, f64>,
    ) -> f64 {
        let mut similarity = 0.0;
        let mut total_weight = 0.0;
        for (ngram, freq1) in profile1 {
            if let Some(freq2) = profile2.get(ngram) {
                similarity += freq1 * freq2;
            }
            total_weight += freq1 * freq1;
        }
        if total_weight > 0.0 {
            similarity / total_weight.sqrt()
        } else {
            0.0
        }
    }

    /// Lists the languages that have a reference profile, in unspecified
    /// order.
    pub fn supported_languages(&self) -> Vec<Language> {
        self.profiles.keys().copied().collect()
    }
}

impl Default for LanguageDetector {
    /// Equivalent to [`LanguageDetector::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Built-in stop-word lists, keyed by language.
///
/// Lists are provided for English, Spanish and French; every other language
/// has no list.
pub struct StopWords {
    /// Lower-case stop words for each language that has a list.
    stop_words: HashMap<Language, Vec<String>>,
}

impl StopWords {
    /// Creates the collection with the built-in English, Spanish and French
    /// lists.
    pub fn new() -> Self {
        /// Converts a static word list into owned strings.
        fn owned(words: &[&str]) -> Vec<String> {
            words.iter().map(|word| word.to_string()).collect()
        }

        let mut stop_words = HashMap::new();
        stop_words.insert(
            Language::English,
            owned(&[
                "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in",
                "is", "it", "its", "of", "on", "that", "the", "to", "was", "will", "with", "you",
                "your", "this", "have", "had", "been", "but", "not", "they", "were", "what",
                "when", "where", "who", "which", "their", "them", "these", "those", "there",
                "here", "than",
            ]),
        );
        stop_words.insert(
            Language::Spanish,
            owned(&[
                "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con", "contra",
                "cual", "cuando", "de", "del", "desde", "donde", "durante", "e", "el", "ella",
                "ellas", "ellos", "en", "entre", "era", "erais", "eran", "eras", "eres", "es",
                "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este", "esto", "estos",
                "fue", "fueron", "fui", "la", "las", "lo", "los", "más", "mi", "mis", "mucho",
                "muchos", "muy", "ni", "no", "nos", "nosotras", "nosotros", "o", "otra", "otras",
                "otro", "otros", "para", "pero", "por", "porque", "que", "quien", "quienes", "se",
                "si", "sin", "sobre", "su", "sus", "también", "tanto", "te", "tu", "tus", "un",
                "una", "uno", "unos", "y", "ya", "yo",
            ]),
        );
        stop_words.insert(
            Language::French,
            owned(&[
                "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et",
                "eux", "il", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "même",
                "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas",
                "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes",
                "toi", "ton", "tu", "un", "une", "vos", "votre", "vous",
            ]),
        );
        Self { stop_words }
    }

    /// Returns the stop-word list for `language`, or `None` if there is none.
    pub fn get(&self, language: Language) -> Option<&Vec<String>> {
        self.stop_words.get(&language)
    }

    /// Reports whether `word`, compared case-insensitively, is a stop word of
    /// `language`. Always `false` for a language without a list.
    pub fn is_stop_word(&self, word: &str, language: Language) -> bool {
        match self.stop_words.get(&language) {
            Some(words) => {
                // Lower-case once, not once per list entry compared.
                let needle = word.to_lowercase();
                words.iter().any(|stop_word| *stop_word == needle)
            }
            None => false,
        }
    }

    /// Returns a copy of `tokens` without the stop words of `language`,
    /// preserving the original order and casing of the remaining tokens.
    pub fn remove_stop_words(&self, tokens: &[String], language: Language) -> Vec<String> {
        tokens
            .iter()
            .filter(|token| !self.is_stop_word(token, language))
            .cloned()
            .collect()
    }
}

impl Default for StopWords {
    /// Equivalent to [`StopWords::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Pipeline that detects a text's language and strips that language's stop
/// words from its tokens.
pub struct MultilingualProcessor {
    /// Detector used to pick the language of each input.
    detector: LanguageDetector,
    /// Stop-word lists consulted once the language is known.
    stop_words: StopWords,
}

impl MultilingualProcessor {
    /// Creates a processor backed by a default detector and the built-in
    /// stop-word lists.
    pub fn new() -> Self {
        let detector = LanguageDetector::new();
        let stop_words = StopWords::new();
        Self {
            detector,
            stop_words,
        }
    }

    /// Detects the language of `text`, splits it on whitespace and filters
    /// the resulting tokens against the detected language's stop words.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the language detector.
    pub fn process(&self, text: &str) -> Result<ProcessedText> {
        let detected = self.detector.detect(text)?;
        let language = detected.language;
        let confidence = detected.confidence;

        let tokens: Vec<String> = text.split_whitespace().map(String::from).collect();
        let filtered_tokens = self.stop_words.remove_stop_words(&tokens, language);

        Ok(ProcessedText {
            original: text.to_owned(),
            language,
            confidence,
            tokens,
            filtered_tokens,
        })
    }
}

impl Default for MultilingualProcessor {
    /// Equivalent to [`MultilingualProcessor::new`].
    fn default() -> Self {
        MultilingualProcessor::new()
    }
}
/// Result of [`MultilingualProcessor::process`].
#[derive(Debug, Clone)]
pub struct ProcessedText {
/// The input text, copied unchanged.
pub original: String,
/// Language reported by the detector for `original`.
pub language: Language,
/// Confidence reported by the detector for `language`.
pub confidence: f64,
/// Whitespace-separated tokens of `original`, in order.
pub tokens: Vec<String>,
/// `tokens` with the stop words of `language` removed.
pub filtered_tokens: Vec<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// English sample used by the detector test.
    const DETECTOR_SAMPLE: &str = "The quick brown fox jumps over the lazy dog. This is definitely an English sentence with many common words.";
    /// English sample used by the processor test.
    const PROCESSOR_SAMPLE: &str =
        "The quick brown fox jumps over the lazy dog. This sentence has many English words.";

    /// Code/name lookups and the `Unknown` fallback of `from_iso_code`.
    #[test]
    fn test_language_enum() {
        let english_code = Language::English.iso_code();
        let spanish_name = Language::Spanish.name();
        assert_eq!(english_code, "en");
        assert_eq!(spanish_name, "Spanish");

        assert_eq!(Language::from_iso_code("fr"), Language::French);
        assert_eq!(Language::from_iso_code("unknown"), Language::Unknown);
    }

    /// English text is recognised; empty text is rejected.
    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();

        let detected = detector
            .detect(DETECTOR_SAMPLE)
            .expect("Operation failed");
        assert_eq!(detected.language, Language::English);

        assert!(detector.detect("").is_err());
    }

    /// Membership checks and token filtering for English stop words.
    #[test]
    fn test_stop_words() {
        let lists = StopWords::new();

        for word in ["the", "and"] {
            assert!(lists.is_stop_word(word, Language::English));
        }
        assert!(!lists.is_stop_word("hello", Language::English));

        let tokens: Vec<String> = ["the", "cat", "is", "happy"]
            .iter()
            .map(|token| token.to_string())
            .collect();
        let kept = lists.remove_stop_words(&tokens, Language::English);
        assert_eq!(kept, vec!["cat", "happy"]);
    }

    /// The full pipeline detects English and drops at least one stop word.
    #[test]
    fn test_multilingual_processor() {
        let processed = MultilingualProcessor::new()
            .process(PROCESSOR_SAMPLE)
            .expect("Operation failed");

        assert_eq!(processed.language, Language::English);
        assert!(!processed.tokens.is_empty());
        assert!(processed.filtered_tokens.len() < processed.tokens.len());
    }

    /// A text profile contains trigrams taken from the input.
    #[test]
    fn test_createtext_profile() {
        let trigram_profile = LanguageDetector::new().createtext_profile("hello world");

        assert!(!trigram_profile.is_empty());
        let has_expected_trigram =
            trigram_profile.contains_key("hel") || trigram_profile.contains_key("llo");
        assert!(has_expected_trigram);
    }
}