use ahash::AHashMap;
use std::cmp::Ordering;
#[derive(Debug, Clone)]
struct ScoredToken {
token: String,
position: usize,
importance_score: f32,
#[allow(dead_code)]
context_boost: f32,
#[allow(dead_code)]
frequency_score: f32,
}
impl PartialEq for ScoredToken {
    fn eq(&self, other: &Self) -> bool {
        // Mirror `Ord::cmp` below (total_cmp) so PartialEq, Eq, and Ord agree
        // even when a score is NaN; a plain `==` on f32 would break that contract.
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for ScoredToken {}
impl PartialOrd for ScoredToken {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for ScoredToken {
fn cmp(&self, other: &Self) -> Ordering {
self.importance_score.total_cmp(&other.importance_score)
}
}
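/// Heuristic semantic analyzer for lossy text compression, backed by three
/// small built-in lookup tables: per-word importance weights, word-to-hypernym
/// generalizations, and clusters of semantically related terms.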
pub struct SemanticAnalyzer {
importance_weights: AHashMap<String, f32>,
hypernyms: AHashMap<String, String>,
semantic_clusters: AHashMap<String, Vec<String>>,
}
impl SemanticAnalyzer {
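    /// Creates an analyzer preloaded with the built-in English tables.
    /// The `_language` parameter is currently unused.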
pub fn new(_language: &str) -> Self {
let mut analyzer = Self {
importance_weights: AHashMap::new(),
hypernyms: AHashMap::new(),
semantic_clusters: AHashMap::new(),
};
analyzer.initialize_importance_weights();
analyzer.initialize_hypernyms();
analyzer.initialize_semantic_clusters();
analyzer
}
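    /// Drops tokens whose combined importance score falls below `threshold`,
    /// preserving the original word order. Scores sum a base weight (capped
    /// at 1.0), a context boost (capped at 0.3), and a frequency component
    /// that can dip slightly negative for rare words, so useful thresholds
    /// sit roughly in the 0.2..0.8 range.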
pub fn apply_semantic_filtering(&self, text: &str, threshold: f32) -> String {
let tokens = self.tokenize_and_score(text);
let filtered_tokens = self.filter_by_importance(tokens, threshold);
self.reconstruct_text(filtered_tokens)
}
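    /// Compresses text by keeping only the highest-scoring tokens and
    /// generalizing low-importance survivors to hypernyms ("car" -> "vehicle").
    /// `Some(0.5)` keeps roughly half the tokens; `None` drops nothing and
    /// only generalizes tokens scoring below 0.5.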
pub fn apply_hypernym_compression(&self, text: &str, target_reduction: Option<f32>) -> String {
let tokens = self.tokenize_and_score(text);
let compressed_tokens = self.compress_with_hypernyms(tokens, target_reduction);
self.reconstruct_text(compressed_tokens)
}
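    // Whitespace-tokenizes `text` and scores every token as
    // base importance + context boost + frequency component.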
    fn tokenize_and_score(&self, text: &str) -> Vec<ScoredToken> {
        let words: Vec<&str> = text.split_whitespace().collect();
        let mut scored_tokens = Vec::with_capacity(words.len());
        // Case-insensitive frequency counts over punctuation-stripped words.
        let mut word_freq = AHashMap::new();
        for word in &words {
            let clean_word = self.clean_word(word);
            *word_freq.entry(clean_word).or_insert(0) += 1;
        }
        for (position, word) in words.iter().enumerate() {
            let clean_word = self.clean_word(word);
            // Strip punctuation but keep the original casing so the
            // capitalization bonus in `calculate_base_importance` can apply;
            // `clean_word` lowercases, which would silence it.
            let cased: String = word.chars().filter(|c| c.is_alphanumeric()).collect();
            let base_importance = self.calculate_base_importance(&cased);
            let context_boost = self.calculate_context_boost(&clean_word, position, &words);
            let frequency_score = self.calculate_frequency_score(&clean_word, &word_freq, words.len());
            let total_score = base_importance + context_boost + frequency_score;
            scored_tokens.push(ScoredToken {
                token: word.to_string(),
                position,
                importance_score: total_score,
                context_boost,
                frequency_score,
            });
        }
        scored_tokens
    }
fn filter_by_importance(&self, tokens: Vec<ScoredToken>, threshold: f32) -> Vec<ScoredToken> {
tokens
.into_iter()
.filter(|token| token.importance_score >= threshold)
.collect()
}
    fn compress_with_hypernyms(&self, tokens: Vec<ScoredToken>, target_reduction: Option<f32>) -> Vec<ScoredToken> {
        let mut result = tokens;
        if let Some(target) = target_reduction {
            // Keep only the highest-scoring tokens, sized to the reduction target.
            let target_count = ((1.0 - target) * result.len() as f32) as usize;
            result.sort_by(|a, b| b.importance_score.total_cmp(&a.importance_score));
            result.truncate(target_count.max(1));
            // Generalize the low-importance survivors. (Doing this before the
            // truncation would only rewrite tokens that are about to be dropped.)
            for token in &mut result {
                if token.importance_score < 0.5
                    && let Some(hypernym) = self.get_hypernym(&token.token)
                {
                    token.token = hypernym;
                    token.importance_score *= 0.8;
                }
            }
        } else {
            // No size target: generalize low-importance tokens, drop nothing.
            for token in &mut result {
                if token.importance_score < 0.5
                    && let Some(hypernym) = self.get_hypernym(&token.token)
                {
                    token.token = hypernym;
                }
            }
        }
        // Restore the original word order.
        result.sort_by_key(|token| token.position);
        result
    }
fn reconstruct_text(&self, tokens: Vec<ScoredToken>) -> String {
tokens
.into_iter()
.map(|token| token.token)
.collect::<Vec<_>>()
.join(" ")
}
    fn calculate_base_importance(&self, word: &str) -> f32 {
        // Dictionary weights are stored lowercase; normalize for the lookup
        // but keep the caller's casing for the capitalization bonus below.
        if let Some(&weight) = self.importance_weights.get(word.to_lowercase().as_str()) {
            return weight;
        }
        let mut score = 0.3;
        // Longer words tend to carry more content (bonus capped at +0.2).
        score += (word.len() as f32 * 0.02).min(0.2);
        if word.chars().next().map(|c| c.is_uppercase()).unwrap_or(false) {
            score += 0.2;
        }
        if word.chars().any(|c| c.is_numeric()) {
            score += 0.15;
        }
        if self.is_technical_term(word) {
            score += 0.25;
        }
        score.min(1.0)
    }
    fn calculate_context_boost(&self, word: &str, position: usize, words: &[&str]) -> f32 {
        let mut boost = 0.0;
        // First and last tokens often carry topic or conclusion words.
        if position == 0 || position == words.len() - 1 {
            boost += 0.1;
        }
        // Examine a +/-2 window around the token. Skip by position rather than
        // by string equality, so a token cannot boost itself just because its
        // raw form ("The") differs from its cleaned form ("the").
        let window = 2;
        let start = position.saturating_sub(window);
        let end = (position + window + 1).min(words.len());
        for (offset, &context_word) in words[start..end].iter().enumerate() {
            if start + offset != position {
                boost += self.calculate_contextual_weight(word, context_word);
            }
        }
        boost.min(0.3)
    }
    fn calculate_frequency_score(&self, word: &str, word_freq: &AHashMap<String, i32>, total_words: usize) -> f32 {
        if let Some(&freq) = word_freq.get(word) {
            // Sublinear TF scaling, 0.1 * (1 + ln(tf)). Since tf <= 1, ln(tf)
            // is non-positive, so rare words get a small negative adjustment.
            let tf = freq as f32 / total_words as f32;
            (tf.ln() + 1.0) * 0.1
        } else {
            0.0
        }
    }
    fn calculate_contextual_weight(&self, word: &str, context_word: &str) -> f32 {
        // Technical terms reinforce each other; a capitalized neighbor gives a
        // smaller nudge.
        if self.is_technical_term(word) && self.is_technical_term(context_word) {
            0.05
        } else if context_word.chars().next().map(|c| c.is_uppercase()).unwrap_or(false) {
            0.02
        } else {
            0.0
        }
    }
    fn is_technical_term(&self, word: &str) -> bool {
        // Cheap heuristic: long words with snake_case, mixed capitals, or a
        // common technical suffix.
        word.len() > 6
            && (word.contains('_')
                || word.chars().filter(|&c| c.is_uppercase()).count() > 1
                || word.ends_with("tion")
                || word.ends_with("ment")
                || word.ends_with("ing"))
    }
    fn get_hypernym(&self, word: &str) -> Option<String> {
        // `clean_word` already lowercases, so this lookup is case-insensitive.
        let clean_word = self.clean_word(word);
        self.hypernyms.get(&clean_word).cloned()
    }
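    // Lowercases and strips all non-alphanumeric characters ("Hello!" -> "hello").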
fn clean_word(&self, word: &str) -> String {
word.chars()
.filter(|c| c.is_alphanumeric())
.collect::<String>()
.to_lowercase()
}
fn initialize_importance_weights(&mut self) {
let high_importance = [
("result", 0.8),
("conclusion", 0.8),
("important", 0.7),
("significant", 0.7),
("analysis", 0.7),
("method", 0.6),
("data", 0.6),
("system", 0.6),
("performance", 0.6),
("improvement", 0.6),
];
for (word, score) in &high_importance {
self.importance_weights.insert(word.to_string(), *score);
}
let medium_importance = [
("process", 0.5),
("algorithm", 0.5),
("function", 0.5),
("model", 0.5),
("implementation", 0.5),
];
for (word, score) in &medium_importance {
self.importance_weights.insert(word.to_string(), *score);
}
}
fn initialize_hypernyms(&mut self) {
let hypernym_pairs = [
("car", "vehicle"),
("dog", "animal"),
("apple", "fruit"),
("chair", "furniture"),
("book", "publication"),
("computer", "device"),
("algorithm", "method"),
("implementation", "approach"),
("optimization", "improvement"),
("analysis", "study"),
];
for (word, hypernym) in &hypernym_pairs {
self.hypernyms.insert(word.to_string(), hypernym.to_string());
}
}
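    // Note: the clusters below are only built and exposed; the filtering and
    // compression paths do not consult them yet.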
fn initialize_semantic_clusters(&mut self) {
self.semantic_clusters.insert(
"computing".to_string(),
vec![
"computer".to_string(),
"algorithm".to_string(),
"software".to_string(),
"programming".to_string(),
"code".to_string(),
],
);
self.semantic_clusters.insert(
"analysis".to_string(),
vec![
"analysis".to_string(),
"study".to_string(),
"research".to_string(),
"investigation".to_string(),
"examination".to_string(),
],
);
self.semantic_clusters.insert(
"performance".to_string(),
vec![
"performance".to_string(),
"speed".to_string(),
"efficiency".to_string(),
"optimization".to_string(),
"improvement".to_string(),
],
);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_semantic_filtering() {
let analyzer = SemanticAnalyzer::new("en");
let input = "The quick brown fox jumps over the lazy dog with great performance";
let result = analyzer.apply_semantic_filtering(input, 0.4);
assert!(result.contains("performance") || result.contains("fox") || result.contains("dog"));
assert!(result.len() < input.len());
}
#[test]
fn test_hypernym_compression() {
let analyzer = SemanticAnalyzer::new("en");
let input = "The car drove past the dog near the apple tree";
let result = analyzer.apply_hypernym_compression(input, Some(0.5));
let original_words = input.split_whitespace().count();
let result_words = result.split_whitespace().count();
assert!(result_words <= (original_words as f32 * 0.5) as usize + 1);
}
#[test]
fn test_importance_scoring() {
let analyzer = SemanticAnalyzer::new("en");
let tokens = analyzer.tokenize_and_score("The important analysis shows significant results");
let important_token = tokens.iter().find(|t| t.token == "important").unwrap();
let analysis_token = tokens.iter().find(|t| t.token == "analysis").unwrap();
let the_token = tokens.iter().find(|t| t.token == "The").unwrap();
assert!(important_token.importance_score > the_token.importance_score);
assert!(analysis_token.importance_score > the_token.importance_score);
}
#[test]
fn test_semantic_filtering_empty_text() {
let analyzer = SemanticAnalyzer::new("en");
let result = analyzer.apply_semantic_filtering("", 0.5);
assert_eq!(result, "");
}
#[test]
fn test_semantic_filtering_high_threshold() {
let analyzer = SemanticAnalyzer::new("en");
let input = "The quick brown fox";
let result = analyzer.apply_semantic_filtering(input, 0.9);
assert!(result.len() <= input.len());
}
#[test]
fn test_hypernym_compression_without_target() {
let analyzer = SemanticAnalyzer::new("en");
let input = "The car drove past the dog";
let result = analyzer.apply_hypernym_compression(input, None);
assert!(!result.is_empty());
}
#[test]
fn test_technical_term_detection() {
let analyzer = SemanticAnalyzer::new("en");
assert!(analyzer.is_technical_term("implementation"));
assert!(analyzer.is_technical_term("optimization"));
assert!(analyzer.is_technical_term("processing"));
assert!(analyzer.is_technical_term("HTTP_SERVER"));
assert!(!analyzer.is_technical_term("cat"));
assert!(!analyzer.is_technical_term("dog"));
}
#[test]
fn test_clean_word() {
let analyzer = SemanticAnalyzer::new("en");
assert_eq!(analyzer.clean_word("Hello!"), "hello");
assert_eq!(analyzer.clean_word("test123"), "test123");
assert_eq!(analyzer.clean_word("word,"), "word");
assert_eq!(analyzer.clean_word("(test)"), "test");
}
#[test]
fn test_calculate_base_importance() {
let analyzer = SemanticAnalyzer::new("en");
let result_score = analyzer.calculate_base_importance("result");
let conclusion_score = analyzer.calculate_base_importance("conclusion");
assert!(result_score > 0.5);
assert!(conclusion_score > 0.5);
let process_score = analyzer.calculate_base_importance("process");
assert!(process_score >= 0.4);
let regular_score = analyzer.calculate_base_importance("cat");
assert!(regular_score < result_score);
}
#[test]
fn test_calculate_base_importance_uppercase() {
let analyzer = SemanticAnalyzer::new("en");
let uppercase_score = analyzer.calculate_base_importance("Test");
let lowercase_score = analyzer.calculate_base_importance("test");
assert!(uppercase_score > lowercase_score);
}
#[test]
fn test_calculate_base_importance_with_numbers() {
let analyzer = SemanticAnalyzer::new("en");
let with_number = analyzer.calculate_base_importance("test123");
let without_number = analyzer.calculate_base_importance("test");
assert!(with_number > without_number);
}
#[test]
fn test_calculate_base_importance_length_bonus() {
let analyzer = SemanticAnalyzer::new("en");
let long_word = analyzer.calculate_base_importance("verylongword");
let short_word = analyzer.calculate_base_importance("cat");
assert!(long_word > short_word);
}
#[test]
fn test_get_hypernym() {
let analyzer = SemanticAnalyzer::new("en");
assert_eq!(analyzer.get_hypernym("car"), Some("vehicle".to_string()));
assert_eq!(analyzer.get_hypernym("dog"), Some("animal".to_string()));
assert_eq!(analyzer.get_hypernym("apple"), Some("fruit".to_string()));
assert_eq!(analyzer.get_hypernym("unknown"), None);
}
#[test]
fn test_get_hypernym_case_insensitive() {
let analyzer = SemanticAnalyzer::new("en");
assert_eq!(analyzer.get_hypernym("CAR"), Some("vehicle".to_string()));
assert_eq!(analyzer.get_hypernym("Dog"), Some("animal".to_string()));
}
#[test]
fn test_tokenize_and_score_positions() {
let analyzer = SemanticAnalyzer::new("en");
let tokens = analyzer.tokenize_and_score("first middle last");
assert_eq!(tokens[0].position, 0);
assert_eq!(tokens[1].position, 1);
assert_eq!(tokens[2].position, 2);
}
#[test]
fn test_context_boost_for_edge_positions() {
let analyzer = SemanticAnalyzer::new("en");
let tokens = analyzer.tokenize_and_score("first middle last");
assert!(tokens[0].importance_score > 0.0);
assert!(tokens[2].importance_score > 0.0);
}
#[test]
fn test_frequency_score() {
let analyzer = SemanticAnalyzer::new("en");
let tokens = analyzer.tokenize_and_score("test test test other");
let test_token = tokens.iter().find(|t| t.token == "test").unwrap();
let other_token = tokens.iter().find(|t| t.token == "other").unwrap();
assert!(test_token.frequency_score > other_token.frequency_score);
}
#[test]
fn test_scored_token_ordering() {
let token1 = ScoredToken {
token: "a".to_string(),
position: 0,
importance_score: 0.5,
context_boost: 0.0,
frequency_score: 0.0,
};
let token2 = ScoredToken {
token: "b".to_string(),
position: 1,
importance_score: 0.7,
context_boost: 0.0,
frequency_score: 0.0,
};
assert!(token2 > token1);
assert_eq!(token1, token1.clone());
}
#[test]
fn test_reconstruct_text() {
let analyzer = SemanticAnalyzer::new("en");
let tokens = vec![
ScoredToken {
token: "Hello".to_string(),
position: 0,
importance_score: 0.5,
context_boost: 0.0,
frequency_score: 0.0,
},
ScoredToken {
token: "world".to_string(),
position: 1,
importance_score: 0.5,
context_boost: 0.0,
frequency_score: 0.0,
},
];
let result = analyzer.reconstruct_text(tokens);
assert_eq!(result, "Hello world");
}
#[test]
fn test_compress_with_hypernyms_respects_target() {
let analyzer = SemanticAnalyzer::new("en");
let tokens = vec![
ScoredToken {
token: "car".to_string(),
position: 0,
importance_score: 0.3,
context_boost: 0.0,
frequency_score: 0.0,
},
ScoredToken {
token: "dog".to_string(),
position: 1,
importance_score: 0.3,
context_boost: 0.0,
frequency_score: 0.0,
},
ScoredToken {
token: "test".to_string(),
position: 2,
importance_score: 0.8,
context_boost: 0.0,
frequency_score: 0.0,
},
];
let result = analyzer.compress_with_hypernyms(tokens, Some(0.5));
assert!(result.len() <= 2);
}
#[test]
fn test_initialize_importance_weights() {
let analyzer = SemanticAnalyzer::new("en");
assert!(analyzer.importance_weights.contains_key("result"));
assert!(analyzer.importance_weights.contains_key("conclusion"));
assert!(analyzer.importance_weights.contains_key("important"));
assert!(analyzer.importance_weights.contains_key("process"));
}
#[test]
fn test_initialize_hypernyms() {
let analyzer = SemanticAnalyzer::new("en");
assert!(analyzer.hypernyms.contains_key("car"));
assert!(analyzer.hypernyms.contains_key("dog"));
assert!(analyzer.hypernyms.contains_key("apple"));
}
#[test]
fn test_initialize_semantic_clusters() {
let analyzer = SemanticAnalyzer::new("en");
assert!(analyzer.semantic_clusters.contains_key("computing"));
assert!(analyzer.semantic_clusters.contains_key("analysis"));
assert!(analyzer.semantic_clusters.contains_key("performance"));
}
#[test]
fn test_contextual_weight_technical_terms() {
let analyzer = SemanticAnalyzer::new("en");
let weight = analyzer.calculate_contextual_weight("implementation", "optimization");
assert!(weight > 0.0);
}
#[test]
fn test_hypernym_compression_zero_target() {
let analyzer = SemanticAnalyzer::new("en");
let input = "The car drove fast";
let result = analyzer.apply_hypernym_compression(input, Some(0.0));
assert!(!result.is_empty());
}
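    // End-to-end sketch: filter low-importance tokens first, then generalize
    // the survivors with hypernyms; the combined pass can only shrink the text.
    #[test]
    fn test_filter_then_compress_pipeline() {
        let analyzer = SemanticAnalyzer::new("en");
        let input = "The important analysis of the car showed significant improvement";
        let filtered = analyzer.apply_semantic_filtering(input, 0.3);
        let compressed = analyzer.apply_hypernym_compression(&filtered, None);
        assert!(compressed.split_whitespace().count() <= input.split_whitespace().count());
    }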
}