use crate::common::collection;
use std::collections::HashSet;
use super::types::*;
/// Heuristic text analyzer: statistics, word-list sentiment, keyword
/// extraction and marker-based language detection.
pub struct TextAnalyzer {
/// Lowercase words that `extract_keywords` excludes from keyword candidates.
stop_words: HashSet<String>,
/// Word lists (`positive` / `negative`) that `analyze_sentiment` looks
/// lowercased words up in.
sentiment_words: SentimentWords,
}
impl TextAnalyzer {
    /// Creates an analyzer populated from `Self::default_stop_words()` and
    /// `Self::default_sentiment_words()`.
    pub fn new() -> Self {
        Self {
            stop_words: Self::default_stop_words(),
            sentiment_words: Self::default_sentiment_words(),
        }
    }

    /// Computes size, length, readability and vocabulary statistics for `text`.
    ///
    /// All lengths are measured in Unicode scalar values (`char`s), so
    /// `avg_word_length` uses the same unit as `character_count`. Every ratio
    /// falls back to `0.0` when its denominator is zero (e.g. empty input).
    pub fn analyze_stats(&self, text: &str) -> TextStats {
        // `numerator / denominator` as a float, or 0.0 when there is nothing
        // to divide by.
        fn ratio(numerator: usize, denominator: usize) -> f64 {
            if denominator == 0 {
                0.0
            } else {
                numerator as f64 / denominator as f64
            }
        }

        let words = self.extract_words(text);
        let sentences = self.extract_sentences(text);
        let paragraphs = self.extract_paragraphs(text);

        let word_count = words.len();
        let character_count = text.chars().count();
        let sentence_count = sentences.len();
        let paragraph_count = paragraphs.len();

        // Count characters, not UTF-8 bytes: `str::len` would inflate the
        // average for any word containing non-ASCII letters.
        let total_word_chars: usize = words.iter().map(|w| w.chars().count()).sum();
        let avg_word_length = ratio(total_word_chars, word_count);
        let avg_sentence_length = ratio(word_count, sentence_count);

        let readability_score = self.calculate_readability_score(&words, &sentences);

        let unique_words = collection::unique_preserve_order(&words).len();
        let lexical_diversity = ratio(unique_words, word_count);

        TextStats {
            word_count,
            character_count,
            sentence_count,
            paragraph_count,
            avg_word_length,
            avg_sentence_length,
            readability_score,
            unique_words,
            lexical_diversity,
        }
    }

    /// Classifies the overall sentiment of `text` using the analyzer's
    /// positive/negative word lists.
    ///
    /// Each extracted word is lowercased and counted as positive, negative or
    /// (by default) neutral. The three scores are the fractions of words in
    /// each class. The reported sentiment is the class whose score is strictly
    /// greater than both others, falling back to `Neutral` otherwise;
    /// `confidence` is the score of the reported class. Text without any words
    /// yields `Neutral` with a neutral score of `1.0`.
    pub fn analyze_sentiment(&self, text: &str) -> SentimentResult {
        let words = self.extract_words(text);

        let mut positive_count = 0usize;
        let mut negative_count = 0usize;
        let mut neutral_count = 0usize;
        for word in &words {
            let lowered = word.to_lowercase();
            if self.sentiment_words.positive.contains(&lowered) {
                positive_count += 1;
            } else if self.sentiment_words.negative.contains(&lowered) {
                negative_count += 1;
            } else {
                neutral_count += 1;
            }
        }

        let total = positive_count + negative_count + neutral_count;
        let (positive_score, negative_score, neutral_score) = if total == 0 {
            (0.0, 0.0, 1.0)
        } else {
            let total = total as f64;
            (
                positive_count as f64 / total,
                negative_count as f64 / total,
                neutral_count as f64 / total,
            )
        };

        let positive_wins = positive_score > negative_score && positive_score > neutral_score;
        let negative_wins = negative_score > positive_score && negative_score > neutral_score;
        let (sentiment, confidence) = if positive_wins {
            (Sentiment::Positive, positive_score)
        } else if negative_wins {
            (Sentiment::Negative, negative_score)
        } else {
            (Sentiment::Neutral, neutral_score)
        };

        SentimentResult {
            sentiment,
            confidence,
            positive_score,
            negative_score,
            neutral_score,
        }
    }

    /// Extracts up to `max_keywords` keywords from `text`, best first.
    ///
    /// Candidates are the lowercased words that are longer than two characters
    /// and not in the stop-word set. Each keyword's score is its term
    /// frequency (occurrences / total extracted words) multiplied by
    /// `1 + 0.1 * length_in_chars`. Importance is `High` when the keyword makes
    /// up at least 10% of all words, `Medium` for at least 5%, otherwise `Low`.
    ///
    /// Results are ordered by descending score, then descending frequency,
    /// then alphabetically, so the output is deterministic. `total_keywords`
    /// is the number of keywords returned (i.e. after truncation).
    pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> KeywordResult {
        let words = self.extract_words(text);
        let total_words = words.len();

        let mut word_frequencies: std::collections::HashMap<String, usize> =
            std::collections::HashMap::new();
        for word in &words {
            // Measure in characters so multi-byte one- or two-letter words
            // are filtered just like their ASCII counterparts.
            if word.chars().count() <= 2 {
                continue;
            }
            let lower_word = word.to_lowercase();
            if !self.stop_words.contains(&lower_word) {
                *word_frequencies.entry(lower_word).or_insert(0) += 1;
            }
        }

        // `total_words` is non-zero whenever the map has entries, so the
        // divisions below never see a zero denominator.
        let mut keywords: Vec<Keyword> = word_frequencies
            .into_iter()
            .map(|(word, frequency)| {
                let tf = frequency as f64 / total_words as f64;
                let score = tf * (1.0 + word.chars().count() as f64 * 0.1);
                // Cross-multiply instead of `total_words / 10`: the integer
                // division truncates to 0 for short texts, which used to make
                // every keyword `High`.
                let importance = if frequency.saturating_mul(10) >= total_words {
                    Importance::High
                } else if frequency.saturating_mul(20) >= total_words {
                    Importance::Medium
                } else {
                    Importance::Low
                };
                Keyword {
                    word,
                    score,
                    frequency,
                    importance,
                }
            })
            .collect();

        // The final alphabetical tie-break removes the dependence on the
        // HashMap's arbitrary iteration order, which otherwise decides both
        // the order of equal-scoring keywords and which ones survive
        // truncation.
        keywords.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| b.frequency.cmp(&a.frequency))
                .then_with(|| a.word.cmp(&b.word))
        });
        keywords.truncate(max_keywords);

        let total_keywords = keywords.len();
        KeywordResult {
            keywords,
            total_keywords,
        }
    }

    /// Guesses the language of `text` from a few common function words.
    ///
    /// The lowercased text is searched for marker substrings of English,
    /// Spanish and French. If any marker is found, the language with the most
    /// distinct markers present is returned with confidence `0.3`; ties go to
    /// the language listed first (English, Spanish, French). If no marker is
    /// found the result is `"English"` with confidence `0.5`.
    ///
    /// NOTE(review): `supported_languages` also lists German, Chinese and
    /// Japanese, but this function has no markers for them and therefore never
    /// reports them.
    pub fn detect_language(&self, text: &str) -> LanguageResult {
        // Marker substrings per language, in tie-break priority order.
        // `" la "` is shared by Spanish and French, which is why ties must be
        // resolved deterministically.
        const MARKERS: [(&str, [&str; 3]); 3] = [
            ("English", ["the ", " and ", " is "]),
            ("Spanish", [" el ", " la ", " de "]),
            ("French", [" le ", " la ", " et "]),
        ];
        // Confidence reported for any marker-based detection.
        const MATCH_CONFIDENCE: f64 = 0.3;
        // Confidence reported when falling back to English without evidence.
        const FALLBACK_CONFIDENCE: f64 = 0.5;

        let supported_languages = vec![
            "English".to_string(),
            "Spanish".to_string(),
            "French".to_string(),
            "German".to_string(),
            "Chinese".to_string(),
            "Japanese".to_string(),
        ];

        let lower_text = text.to_lowercase();

        let mut best_language: Option<&str> = None;
        let mut best_hits = 0usize;
        for (language, markers) in MARKERS.iter() {
            let hits = markers
                .iter()
                .filter(|marker| lower_text.contains(**marker))
                .count();
            // Strict `>` keeps the earlier language when hit counts are equal.
            if hits > best_hits {
                best_hits = hits;
                best_language = Some(*language);
            }
        }

        let (language, confidence) = match best_language {
            Some(language) => (language.to_string(), MATCH_CONFIDENCE),
            None => ("English".to_string(), FALLBACK_CONFIDENCE),
        };

        LanguageResult {
            language,
            confidence,
            supported_languages,
        }
    }
}
impl Default for TextAnalyzer {
fn default() -> Self {
Self::new()
}
}