use regex::Regex;
use std::collections::HashSet;
use super::super::types::{EngagementLevel, EntityMention, ReasoningType};
pub struct TextAnalyzer;
impl TextAnalyzer {
pub fn analyze_sentiment(content: &str) -> Option<String> {
let positive_words = [
"good",
"great",
"excellent",
"happy",
"pleased",
"wonderful",
"amazing",
"fantastic",
"brilliant",
"awesome",
"love",
"like",
"enjoy",
"appreciate",
"grateful",
"thankful",
"positive",
"optimistic",
"excited",
"thrilled",
];
let negative_words = [
"bad",
"terrible",
"awful",
"sad",
"angry",
"frustrated",
"disappointed",
"hate",
"dislike",
"horrible",
"disgusting",
"annoying",
"upset",
"worried",
"anxious",
"depressed",
"negative",
"pessimistic",
"miserable",
"furious",
];
let neutral_words = [
"okay", "fine", "alright", "average", "normal", "standard", "regular", "moderate",
"typical", "ordinary", "usual", "common",
];
let content_lower = content.to_lowercase();
let pos_count = positive_words.iter().filter(|word| content_lower.contains(*word)).count();
let neg_count = negative_words.iter().filter(|word| content_lower.contains(*word)).count();
let neu_count = neutral_words.iter().filter(|word| content_lower.contains(*word)).count();
if pos_count > neg_count && pos_count > neu_count {
Some("positive".to_string())
} else if neg_count > pos_count && neg_count > neu_count {
Some("negative".to_string())
} else {
Some("neutral".to_string())
}
}
pub fn classify_intent(content: &str) -> Option<String> {
let content_lower = content.to_lowercase();
if content.contains('?')
|| content_lower.starts_with("what")
|| content_lower.starts_with("how")
|| content_lower.starts_with("why")
|| content_lower.starts_with("when")
|| content_lower.starts_with("where")
|| content_lower.starts_with("who")
|| content_lower.starts_with("which")
{
return Some("question".to_string());
}
if [
"please",
"can you",
"could you",
"would you",
"help me",
"assist me",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some("request".to_string());
}
if ["thank", "thanks", "appreciate", "grateful"]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some("gratitude".to_string());
}
if [
"hello",
"hi",
"hey",
"good morning",
"good afternoon",
"good evening",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some("greeting".to_string());
}
if ["goodbye", "bye", "see you", "farewell", "take care"]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some("farewell".to_string());
}
if ["help", "assist", "support", "guidance", "advice"]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some("help_seeking".to_string());
}
if ["complain", "issue", "problem", "trouble", "error", "wrong"]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some("complaint".to_string());
}
if ["i think", "i believe", "in my opinion", "i feel", "i know"]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some("information_sharing".to_string());
}
Some("statement".to_string())
}
pub fn extract_topics(content: &str) -> Vec<String> {
let mut topics = Vec::new();
let content_lower = content.to_lowercase();
let topic_keywords = [
(
"technology",
&[
"computer",
"software",
"tech",
"ai",
"programming",
"code",
"algorithm",
"data",
"internet",
"web",
"mobile",
"app",
] as &[&str],
),
(
"sports",
&[
"football",
"basketball",
"soccer",
"tennis",
"game",
"match",
"team",
"player",
"score",
"league",
"championship",
],
),
(
"food",
&[
"restaurant",
"cooking",
"recipe",
"eat",
"meal",
"dish",
"cuisine",
"chef",
"ingredient",
"flavor",
"taste",
],
),
(
"travel",
&[
"trip",
"vacation",
"visit",
"country",
"hotel",
"flight",
"airport",
"tourism",
"destination",
"journey",
],
),
(
"work",
&[
"job",
"career",
"office",
"meeting",
"project",
"company",
"business",
"colleague",
"manager",
"salary",
],
),
(
"health",
&[
"doctor",
"medicine",
"exercise",
"wellness",
"fitness",
"hospital",
"treatment",
"therapy",
"diet",
"nutrition",
],
),
(
"education",
&[
"school",
"university",
"student",
"teacher",
"learn",
"study",
"course",
"degree",
"education",
"knowledge",
],
),
(
"entertainment",
&[
"movie",
"music",
"book",
"show",
"concert",
"theater",
"film",
"song",
"artist",
"performance",
],
),
(
"finance",
&[
"money",
"bank",
"investment",
"stock",
"financial",
"economy",
"budget",
"savings",
"loan",
"credit",
],
),
(
"science",
&[
"research",
"experiment",
"discovery",
"theory",
"physics",
"chemistry",
"biology",
"mathematics",
"scientific",
],
),
(
"politics",
&[
"government",
"election",
"policy",
"political",
"democracy",
"vote",
"politician",
"law",
"regulation",
],
),
(
"family",
&[
"family",
"parent",
"child",
"sibling",
"relative",
"marriage",
"relationship",
"home",
"domestic",
],
),
];
for (topic, keywords) in topic_keywords {
if keywords.iter().any(|keyword| content_lower.contains(keyword)) {
topics.push(topic.to_string());
}
}
topics
}
pub fn extract_entities(content: &str) -> Vec<EntityMention> {
let mut entities = Vec::new();
let patterns = [
(r"\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", "PERSON"),
(r"\b\d{1,2}/\d{1,2}/\d{4}\b", "DATE"),
(r"\b\d{4}-\d{2}-\d{2}\b", "DATE"),
(
r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b",
"DATE",
),
(r"\$\d+(?:\.\d{2})?\b", "MONEY"),
(
r"\b\d+(?:\.\d+)?\s*(?:dollars?|euros?|pounds?|yen)\b",
"MONEY",
),
(r"\b\d{3}-\d{3}-\d{4}\b", "PHONE"),
(
r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
"EMAIL",
),
(r"\bhttps?://[^\s]+\b", "URL"),
(
r"\b\d{1,5}\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln)\b",
"ADDRESS",
),
(
r"\b[A-Z][a-z]+\s+(?:University|College|School|Hospital|Corporation|Corp|Inc|LLC)\b",
"ORGANIZATION",
),
];
for (pattern, entity_type) in patterns {
if let Ok(regex) = Regex::new(pattern) {
for mat in regex.find_iter(content) {
entities.push(EntityMention {
text: mat.as_str().to_string(),
entity_type: entity_type.to_string(),
confidence: 0.8,
start_pos: mat.start(),
end_pos: mat.end(),
});
}
}
}
entities
}
pub fn calculate_confidence(content: &str) -> f32 {
let mut confidence: f32 = 0.7;
if content.len() > 20 {
confidence += 0.1;
}
if content.len() > 100 {
confidence += 0.1;
}
let uncertainty_words = [
"maybe", "perhaps", "might", "possibly", "probably", "seems", "appears", "could be",
];
if !uncertainty_words.iter().any(|&word| content.to_lowercase().contains(word)) {
confidence += 0.1;
}
let confidence_words = [
"definitely",
"certainly",
"absolutely",
"clearly",
"obviously",
"undoubtedly",
];
if confidence_words.iter().any(|&word| content.to_lowercase().contains(word)) {
confidence += 0.1;
}
if content.chars().any(|c| c.is_uppercase()) {
confidence += 0.05;
}
if [".", "!", "?"].iter().any(|&punct| content.contains(punct)) {
confidence += 0.05;
}
confidence.min(1.0)
}
pub fn calculate_quality_score(content: &str) -> f32 {
let mut score = 0.5;
let length = content.len();
if (10..=1000).contains(&length) {
score += 0.2;
}
if content.chars().any(|c| c.is_uppercase()) {
score += 0.1;
}
if [".", "!", "?"].iter().any(|&punct| content.contains(punct)) {
score += 0.1;
}
if !["uhh", "umm", "err", "uh", "um"]
.iter()
.any(|&filler| content.to_lowercase().contains(filler))
{
score += 0.1;
}
let words: HashSet<&str> = content.split_whitespace().collect();
let unique_ratio = words.len() as f32 / content.split_whitespace().count().max(1) as f32;
score += unique_ratio * 0.1;
let sentence_count = content.matches(['.', '!', '?']).count();
if sentence_count > 0 {
score += 0.1;
}
score.min(1.0)
}
pub fn assess_engagement(content: &str) -> EngagementLevel {
let content_lower = content.to_lowercase();
let engagement_indicators = content.matches(['!', '?']).count()
+ [
"wow",
"really",
"interesting",
"amazing",
"fantastic",
"incredible",
"awesome",
]
.iter()
.map(|&word| content_lower.matches(word).count())
.sum::<usize>()
+ if content_lower.contains("very") || content_lower.contains("extremely") {
1
} else {
0
}
+ if content.len() > 100 { 1 } else { 0 };
let emotional_words = [
"love",
"hate",
"excited",
"thrilled",
"devastated",
"overjoyed",
];
let emotional_intensity = emotional_words
.iter()
.map(|&word| content_lower.matches(word).count())
.sum::<usize>();
let total_score = engagement_indicators + emotional_intensity;
match total_score {
0..=1 => EngagementLevel::Low,
2..=3 => EngagementLevel::Medium,
4..=6 => EngagementLevel::High,
_ => EngagementLevel::VeryHigh,
}
}
pub fn detect_reasoning_type(content: &str) -> Option<ReasoningType> {
let content_lower = content.to_lowercase();
if [
"because",
"therefore",
"thus",
"consequently",
"hence",
"so",
"since",
"as a result",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some(ReasoningType::Logical);
}
if [
"causes",
"leads to",
"results in",
"due to",
"caused by",
"effect of",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some(ReasoningType::Causal);
}
if [
"like",
"similar to",
"analogous",
"comparable",
"just as",
"in the same way",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some(ReasoningType::Analogical);
}
if [
"calculate",
"equation",
"formula",
"math",
"number",
"statistics",
"probability",
"algorithm",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some(ReasoningType::Mathematical);
}
if [
"feel",
"emotion",
"heart",
"intuition",
"gut feeling",
"emotional",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some(ReasoningType::Emotional);
}
if [
"imagine",
"creative",
"innovative",
"brainstorm",
"think outside",
"original",
]
.iter()
.any(|&pattern| content_lower.contains(pattern))
{
return Some(ReasoningType::Creative);
}
None
}
pub fn detect_safety_issues(content: &str) -> Vec<String> {
let mut flags = Vec::new();
let content_lower = content.to_lowercase();
let safety_patterns = [
(
"violence",
&[
"kill", "hurt", "harm", "attack", "violence", "weapon", "fight", "murder",
] as &[&str],
),
(
"inappropriate",
&[
"inappropriate",
"offensive",
"rude",
"insulting",
"harassment",
],
),
(
"personal_info",
&[
"password",
"ssn",
"social security",
"credit card",
"bank account",
"phone number",
],
),
(
"hate_speech",
&["hate", "racist", "sexist", "discrimination", "prejudice"],
),
(
"self_harm",
&[
"suicide",
"self-harm",
"cut myself",
"kill myself",
"end it all",
],
),
(
"illegal",
&["illegal", "drugs", "steal", "fraud", "scam", "criminal"],
),
(
"adult_content",
&["sexual", "explicit", "pornographic", "adult content"],
),
];
for (flag, patterns) in safety_patterns {
if patterns.iter().any(|pattern| content_lower.contains(pattern)) {
flags.push(flag.to_string());
}
}
flags
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sentiment_positive_text() {
let sentiment = TextAnalyzer::analyze_sentiment("This is great and amazing work!");
assert_eq!(
sentiment.as_deref(),
Some("positive"),
"clearly positive text should detect positive sentiment"
);
}
#[test]
fn test_sentiment_negative_text() {
let sentiment =
TextAnalyzer::analyze_sentiment("I hate this terrible and awful experience.");
assert_eq!(
sentiment.as_deref(),
Some("negative"),
"clearly negative text should detect negative sentiment"
);
}
#[test]
fn test_sentiment_neutral_text() {
let sentiment = TextAnalyzer::analyze_sentiment("okay fine normal standard");
assert_eq!(
sentiment.as_deref(),
Some("neutral"),
"neutral filler words should detect neutral sentiment"
);
}
#[test]
fn test_sentiment_empty_text_returns_some() {
let sentiment = TextAnalyzer::analyze_sentiment("");
assert!(
sentiment.is_some(),
"empty text must still return Some variant"
);
}
#[test]
fn test_intent_question_with_question_mark() {
let intent = TextAnalyzer::classify_intent("What is the meaning of life?");
assert_eq!(
intent.as_deref(),
Some("question"),
"text with '?' should classify as question"
);
}
#[test]
fn test_intent_question_starts_with_how() {
let intent = TextAnalyzer::classify_intent("How does this work?");
assert_eq!(
intent.as_deref(),
Some("question"),
"'how' prefix should classify as question"
);
}
#[test]
fn test_intent_request_contains_please() {
let intent = TextAnalyzer::classify_intent("Please help me with this.");
assert_eq!(
intent.as_deref(),
Some("request"),
"'please' should classify as request"
);
}
#[test]
fn test_intent_gratitude_thanks() {
let intent = TextAnalyzer::classify_intent("Thanks for your help.");
assert_eq!(
intent.as_deref(),
Some("gratitude"),
"'thanks' should classify as gratitude"
);
}
#[test]
fn test_intent_greeting_hello() {
let intent = TextAnalyzer::classify_intent("Hello there!");
assert_eq!(
intent.as_deref(),
Some("greeting"),
"'hello' should classify as greeting"
);
}
#[test]
fn test_intent_farewell() {
let intent = TextAnalyzer::classify_intent("Goodbye, see you later.");
assert_eq!(
intent.as_deref(),
Some("farewell"),
"farewell keywords should classify as farewell"
);
}
#[test]
fn test_intent_complaint_with_problem() {
let intent = TextAnalyzer::classify_intent("I am experiencing a problem.");
assert_eq!(
intent.as_deref(),
Some("complaint"),
"'problem' should classify as complaint"
);
}
#[test]
fn test_intent_statement_for_generic_text() {
let intent = TextAnalyzer::classify_intent("The sky is blue.");
assert_eq!(
intent.as_deref(),
Some("statement"),
"generic sentence should classify as statement"
);
}
#[test]
fn test_extract_topics_technology() {
let topics = TextAnalyzer::extract_topics("I love programming and software development.");
assert!(
topics.contains(&"technology".to_string()),
"tech keywords should extract technology topic"
);
}
#[test]
fn test_extract_topics_health() {
let topics = TextAnalyzer::extract_topics("I exercise daily for fitness and wellness.");
assert!(
topics.contains(&"health".to_string()),
"health keywords should extract health topic"
);
}
#[test]
fn test_extract_topics_empty_text_returns_no_topics() {
let topics = TextAnalyzer::extract_topics("xyz_obscure_none");
assert!(topics.is_empty(), "obscure text should yield no topics");
}
#[test]
fn test_extract_entities_finds_email() {
let entities =
TextAnalyzer::extract_entities("Contact me at user@example.com for details.");
let emails: Vec<_> = entities.iter().filter(|e| e.entity_type == "EMAIL").collect();
assert!(
!emails.is_empty(),
"email address should be extracted as EMAIL entity"
);
}
#[test]
fn test_extract_entities_finds_money() {
let entities = TextAnalyzer::extract_entities("The price is $42.99 today.");
let money: Vec<_> = entities.iter().filter(|e| e.entity_type == "MONEY").collect();
assert!(
!money.is_empty(),
"dollar amount should be extracted as MONEY entity"
);
}
#[test]
fn test_extract_entities_finds_url() {
let entities = TextAnalyzer::extract_entities("Visit https://example.com for more info.");
let urls: Vec<_> = entities.iter().filter(|e| e.entity_type == "URL").collect();
assert!(!urls.is_empty(), "URL should be extracted as URL entity");
}
#[test]
fn test_confidence_increases_with_length() {
let short_conf = TextAnalyzer::calculate_confidence("hi");
let long_conf = TextAnalyzer::calculate_confidence(
"This is a much longer statement that contains more information.",
);
assert!(
long_conf > short_conf,
"longer text should yield higher confidence"
);
}
#[test]
fn test_quality_score_reasonable_text() {
let score = TextAnalyzer::calculate_quality_score("This is a complete sentence.");
assert!(score > 0.5, "well-formed sentence should score above 0.5");
}
#[test]
fn test_engagement_low_for_plain_text() {
let level = TextAnalyzer::assess_engagement("hello");
assert_eq!(
level,
EngagementLevel::Low,
"minimal text should have low engagement"
);
}
#[test]
fn test_engagement_high_with_exclamations_and_keywords() {
let level = TextAnalyzer::assess_engagement(
"Wow! This is absolutely amazing and incredible!! I love it so much!",
);
assert!(
matches!(level, EngagementLevel::High | EngagementLevel::VeryHigh),
"enthusiastic text should have high engagement"
);
}
#[test]
fn test_detect_reasoning_logical() {
let rt = TextAnalyzer::detect_reasoning_type("Because it rained, the ground is wet.");
assert_eq!(
rt,
Some(ReasoningType::Logical),
"'because' should detect logical reasoning"
);
}
#[test]
fn test_detect_reasoning_causal() {
let rt = TextAnalyzer::detect_reasoning_type("Stress causes health problems.");
assert_eq!(
rt,
Some(ReasoningType::Causal),
"'causes' should detect causal reasoning"
);
}
#[test]
fn test_detect_reasoning_mathematical() {
let rt = TextAnalyzer::detect_reasoning_type("Please calculate the equation.");
assert_eq!(
rt,
Some(ReasoningType::Mathematical),
"math keywords should detect mathematical reasoning"
);
}
#[test]
fn test_safety_detects_violence_keywords() {
let flags = TextAnalyzer::detect_safety_issues("Someone wants to hurt others.");
assert!(
flags.contains(&"violence".to_string()),
"'hurt' should trigger violence flag"
);
}
#[test]
fn test_safety_clean_text_returns_empty() {
let flags = TextAnalyzer::detect_safety_issues("I love sunny days and good food.");
assert!(flags.is_empty(), "clean text should return no safety flags");
}
}