use super::{NameCandidate, NamingStrategy};
use std::collections::HashMap;
pub struct DomainTermExtractor {
common_verbs: Vec<&'static str>,
stop_words: Vec<&'static str>,
}
impl DomainTermExtractor {
pub fn new() -> Self {
Self {
common_verbs: vec![
"get", "set", "is", "has", "can", "should", "with", "to", "from", "into", "as",
"new", "default", "clone", "eq", "ne", "cmp", "hash", "manage",
],
stop_words: vec![
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with",
"by", "from", "as", "is", "was", "are", "were", "be", "been", "being", "have",
"has", "had", "do", "does", "did", "will", "would", "should", "could", "may",
"might", "must", "can", "this", "that", "these", "those", "data",
],
}
}
pub fn extract_from_methods(&self, methods: &[String]) -> Option<NameCandidate> {
if methods.is_empty() {
return None;
}
let mut verb_counts: HashMap<String, usize> = HashMap::new();
let mut noun_counts: HashMap<String, usize> = HashMap::new();
let action_verbs = [
"infer",
"classify",
"extract",
"detect",
"recommend",
"suggest",
"calculate",
"compute",
"analyze",
"validate",
"format",
"parse",
"transform",
"convert",
"generate",
"build",
"create",
"update",
"serialize",
"deserialize",
"encode",
"decode",
"evaluate",
"render",
"display",
"group",
"cluster",
"merge",
"split",
];
for method in methods {
let tokens = self.tokenize_method_name(method);
if let Some(first_token) = tokens.first() {
if action_verbs.contains(&first_token.as_str()) {
*verb_counts.entry(first_token.clone()).or_insert(0) += 1;
}
}
for (i, token) in tokens.iter().enumerate() {
if i > 0 && !self.is_stop_word(token) && !action_verbs.contains(&token.as_str()) {
let specificity = self.calculate_term_specificity(token);
if specificity > 0.5 {
*noun_counts.entry(token.clone()).or_insert(0) += 1;
}
}
}
}
let dominant_verb = verb_counts
.into_iter()
.filter(|(_, count)| (*count as f64 / methods.len() as f64) > 0.3)
.max_by_key(|(_, count)| *count)
.map(|(verb, _)| verb);
let dominant_noun = noun_counts
.into_iter()
.filter(|(_, count)| (*count as f64 / methods.len() as f64) > 0.3)
.max_by_key(|(_, count)| *count)
.map(|(noun, _)| noun);
match (dominant_noun, dominant_verb) {
(Some(noun), Some(verb)) => {
let module_name = format!("{}_{}", noun, verb);
Some(NameCandidate {
module_name,
confidence: 0.90,
specificity_score: 0.85,
reasoning: format!(
"Extracted verb '{}' and noun '{}' from method names",
verb, noun
),
strategy: NamingStrategy::DomainTerms,
})
}
(Some(noun), None) => {
Some(NameCandidate {
module_name: noun.clone(),
confidence: 0.75,
specificity_score: self.calculate_term_specificity(&noun),
reasoning: format!("Extracted dominant noun '{}' from method names", noun),
strategy: NamingStrategy::DomainTerms,
})
}
(None, Some(verb)) => {
let gerund = if verb.ends_with('e') {
format!("{}ing", &verb[..verb.len() - 1])
} else {
format!("{}ing", verb)
};
Some(NameCandidate {
module_name: gerund.clone(),
confidence: 0.70,
specificity_score: 0.65,
reasoning: format!("Extracted dominant verb '{}' from method names", verb),
strategy: NamingStrategy::BehavioralPattern,
})
}
(None, None) => None,
}
}
pub fn generate_domain_name(&self, methods: &[String]) -> Option<NameCandidate> {
if methods.is_empty() {
return None;
}
let terms = self.extract_domain_terms(methods);
if terms.is_empty() {
return None;
}
if let Some(name) = self.find_verb_noun_pair(&terms) {
return Some(name);
}
let (primary_term, primary_freq) = &terms[0];
if *primary_freq < 0.3 {
return None;
}
Some(NameCandidate {
module_name: primary_term.clone(),
confidence: (primary_freq * 0.9).min(0.85), specificity_score: self.calculate_term_specificity(primary_term),
reasoning: format!(
"Dominant term '{}' appears in {:.0}% of methods",
primary_term,
primary_freq * 100.0
),
strategy: NamingStrategy::DomainTerms,
})
}
pub fn extract_from_description(&self, description: &str) -> Option<NameCandidate> {
let tokens = self.tokenize_text(description);
let significant_tokens: Vec<String> = tokens
.into_iter()
.filter(|t| !self.is_stop_word(t))
.filter(|t| !self.is_common_verb(t))
.filter(|t| t.len() > 3)
.collect();
if significant_tokens.is_empty() {
return None;
}
let mut freq_map: HashMap<String, usize> = HashMap::new();
for token in &significant_tokens {
*freq_map.entry(token.clone()).or_insert(0) += 1;
}
let mut terms: Vec<_> = freq_map.into_iter().collect();
terms.sort_by(|a, b| {
let freq_cmp = b.1.cmp(&a.1);
if freq_cmp != std::cmp::Ordering::Equal {
return freq_cmp;
}
let a_specificity = self.calculate_term_specificity(&a.0);
let b_specificity = self.calculate_term_specificity(&b.0);
let specificity_cmp = b_specificity
.partial_cmp(&a_specificity)
.unwrap_or(std::cmp::Ordering::Equal);
if specificity_cmp != std::cmp::Ordering::Equal {
return specificity_cmp;
}
let a_is_gerund = a.0.ends_with("ing");
let b_is_gerund = b.0.ends_with("ing");
match (a_is_gerund, b_is_gerund) {
(false, true) => std::cmp::Ordering::Less, (true, false) => std::cmp::Ordering::Greater, _ => std::cmp::Ordering::Equal,
}
});
let (term, count) = &terms[0];
let freq = *count as f64 / significant_tokens.len() as f64;
if freq < 0.2 {
return None;
}
Some(NameCandidate {
module_name: term.clone(),
confidence: 0.6, specificity_score: self.calculate_term_specificity(term),
reasoning: format!("Extracted from description: '{}'", description),
strategy: NamingStrategy::DomainTerms,
})
}
fn extract_domain_terms(&self, methods: &[String]) -> Vec<(String, f64)> {
let all_tokens: Vec<String> = methods
.iter()
.flat_map(|m| self.tokenize_method_name(m))
.collect();
if all_tokens.is_empty() {
return vec![];
}
let mut freq_map: HashMap<String, usize> = HashMap::new();
for token in &all_tokens {
*freq_map.entry(token.clone()).or_insert(0) += 1;
}
let total_tokens = all_tokens.len() as f64;
let mut terms: Vec<(String, f64)> = freq_map
.into_iter()
.map(|(term, count)| (term, count as f64 / total_tokens))
.filter(|(term, _)| !self.is_stop_word(term))
.filter(|(term, _)| !self.is_common_verb(term))
.filter(|(_term, freq)| *freq >= 0.15) .collect();
terms.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
terms
}
fn tokenize_method_name(&self, method: &str) -> Vec<String> {
let mut tokens = Vec::new();
for part in method.split('_') {
tokens.extend(self.split_camel_case(part));
}
tokens
.into_iter()
.map(|s| s.to_lowercase())
.filter(|s| s.len() > 2) .collect()
}
fn tokenize_text(&self, text: &str) -> Vec<String> {
text.to_lowercase()
.split(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty())
.map(String::from)
.collect()
}
fn split_camel_case(&self, s: &str) -> Vec<String> {
if s.is_empty() {
return vec![];
}
let mut result = Vec::new();
let mut current = String::new();
let chars: Vec<char> = s.chars().collect();
for i in 0..chars.len() {
let ch = chars[i];
let is_upper = ch.is_uppercase();
let prev_was_lower = i > 0 && chars[i - 1].is_lowercase();
let next_is_lower = i + 1 < chars.len() && chars[i + 1].is_lowercase();
if is_upper && (!current.is_empty() && (prev_was_lower || next_is_lower)) {
#[allow(clippy::if_same_then_else, clippy::len_zero)]
if !prev_was_lower && next_is_lower && current.len() > 0 {
result.push(current.clone());
current.clear();
} else if prev_was_lower {
result.push(current.clone());
current.clear();
}
}
current.push(ch);
}
if !current.is_empty() {
result.push(current);
}
result
}
fn find_verb_noun_pair(&self, terms: &[(String, f64)]) -> Option<NameCandidate> {
if terms.len() < 2 {
return None;
}
let action_verbs = [
"format",
"parse",
"validate",
"calculate",
"compute",
"analyze",
"process",
"transform",
"convert",
"serialize",
"deserialize",
"encode",
"decode",
"render",
"display",
"print",
"write",
"read",
"load",
"save",
"create",
"build",
"generate",
];
for (verb_candidate, verb_freq) in terms {
if action_verbs.contains(&verb_candidate.as_str()) {
for (noun_candidate, noun_freq) in terms {
if noun_candidate != verb_candidate
&& !action_verbs.contains(&noun_candidate.as_str())
{
let combined_freq = (verb_freq + noun_freq) / 2.0;
if combined_freq > 0.25 {
let module_name = format!("{}_{}", verb_candidate, noun_candidate);
return Some(NameCandidate {
module_name,
confidence: 0.85, specificity_score: 0.8,
reasoning: format!(
"Identified verb-noun pattern: '{}' + '{}' (avg frequency: {:.0}%)",
verb_candidate, noun_candidate, combined_freq * 100.0
),
strategy: NamingStrategy::DomainTerms,
});
}
}
}
}
}
None
}
fn calculate_term_specificity(&self, term: &str) -> f64 {
let generic_terms = [
"data", "value", "item", "object", "type", "info", "element", "thing",
];
if generic_terms.contains(&term) {
return 0.3;
}
let domain_terms = [
("coverage", 0.9),
("metrics", 0.85),
("complexity", 0.9),
("analysis", 0.8),
("validation", 0.85),
("formatting", 0.85),
("parsing", 0.9),
("serialization", 0.9),
("computation", 0.85),
("optimization", 0.9),
];
for (domain_term, score) in &domain_terms {
if term.contains(domain_term) {
return *score;
}
}
0.6
}
fn is_stop_word(&self, word: &str) -> bool {
self.stop_words.contains(&word)
}
fn is_common_verb(&self, word: &str) -> bool {
self.common_verbs.contains(&word)
}
}
pub fn extract_dominant_verb(methods: &[String]) -> Option<String> {
use std::collections::HashMap;
let verbs = [
"validate",
"parse",
"format",
"calculate",
"transform",
"convert",
"generate",
"analyze",
"detect",
"classify",
"build",
"create",
"update",
"delete",
"query",
"serialize",
"deserialize",
"encode",
"decode",
"evaluate",
"render",
"display",
];
let mut verb_counts: HashMap<&str, usize> = HashMap::new();
for method in methods {
let method_lower = method.to_lowercase();
for verb in &verbs {
if method_lower.starts_with(verb) {
*verb_counts.entry(verb).or_default() += 1;
break; }
}
}
verb_counts
.into_iter()
.filter(|(_, count)| (*count as f64 / methods.len() as f64) > 0.3)
.max_by_key(|(_, count)| *count)
.map(|(verb, _)| capitalize_first(verb))
}
fn capitalize_first(s: &str) -> String {
let mut chars = s.chars();
match chars.next() {
None => String::new(),
Some(first) => first.to_uppercase().chain(chars).collect(),
}
}
impl Default for DomainTermExtractor {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_tokenize_snake_case() {
let extractor = DomainTermExtractor::new();
let tokens = extractor.tokenize_method_name("format_coverage_status");
assert_eq!(tokens, vec!["format", "coverage", "status"]);
}
#[test]
fn test_tokenize_camel_case() {
let extractor = DomainTermExtractor::new();
let tokens = extractor.tokenize_method_name("calculateMetrics");
assert_eq!(tokens, vec!["calculate", "metrics"]);
}
#[test]
fn test_tokenize_mixed_case() {
let extractor = DomainTermExtractor::new();
let tokens = extractor.tokenize_method_name("get_APIKey");
assert!(tokens.contains(&"api".to_string()));
assert!(tokens.contains(&"key".to_string()));
}
#[test]
fn test_extract_domain_terms() {
let extractor = DomainTermExtractor::new();
let methods = vec![
"format_coverage_status".to_string(),
"format_coverage_factor".to_string(),
"calculate_coverage_percentage".to_string(),
];
let terms = extractor.extract_domain_terms(&methods);
assert!(!terms.is_empty());
let coverage_term = terms.iter().find(|(term, _)| term == "coverage");
assert!(coverage_term.is_some());
let (_, freq) = coverage_term.unwrap();
assert!(*freq > 0.3); }
#[test]
fn test_filters_common_verbs() {
let extractor = DomainTermExtractor::new();
let methods = vec![
"get_value".to_string(),
"set_value".to_string(),
"is_valid".to_string(),
];
let terms = extractor.extract_domain_terms(&methods);
assert!(!terms.iter().any(|(term, _)| term == "get"));
assert!(!terms.iter().any(|(term, _)| term == "set"));
}
#[test]
fn test_verb_noun_extraction() {
let extractor = DomainTermExtractor::new();
let methods = vec![
"format_coverage".to_string(),
"format_status".to_string(),
"parse_coverage".to_string(),
];
let name = extractor.generate_domain_name(&methods);
assert!(name.is_some());
let candidate = name.unwrap();
assert!(
candidate.module_name.contains("format") || candidate.module_name.contains("coverage")
);
}
#[test]
fn test_extract_from_description() {
let extractor = DomainTermExtractor::new();
let description = "Manage coverage data and its transformations";
let name = extractor.extract_from_description(description);
assert!(name.is_some());
let candidate = name.unwrap();
assert_eq!(candidate.module_name, "coverage");
}
#[test]
fn test_specificity_scoring() {
let extractor = DomainTermExtractor::new();
assert!(extractor.calculate_term_specificity("coverage") > 0.8);
assert!(extractor.calculate_term_specificity("complexity") > 0.8);
assert!(extractor.calculate_term_specificity("data") < 0.5);
assert!(extractor.calculate_term_specificity("value") < 0.5);
}
#[test]
fn test_extract_validation_verb() {
let methods = vec![
"validate_input".to_string(),
"validate_output".to_string(),
"validate_schema".to_string(),
];
assert_eq!(
extract_dominant_verb(&methods),
Some("Validate".to_string())
);
}
#[test]
fn test_extract_parsing_verb() {
let methods = vec![
"parse_file".to_string(),
"parse_tokens".to_string(),
"parse_expression".to_string(),
"other_method".to_string(),
];
assert_eq!(extract_dominant_verb(&methods), Some("Parse".to_string()));
}
#[test]
fn test_no_dominant_verb() {
let methods = vec![
"process_a".to_string(),
"handle_b".to_string(),
"compute_c".to_string(),
];
assert_eq!(extract_dominant_verb(&methods), None);
}
#[test]
fn test_extract_format_verb() {
let methods = vec![
"format_output".to_string(),
"format_error".to_string(),
"format_status".to_string(),
"other_method".to_string(),
];
assert_eq!(extract_dominant_verb(&methods), Some("Format".to_string()));
}
#[test]
fn test_verb_case_insensitive() {
let methods = vec![
"Validate_Input".to_string(),
"VALIDATE_OUTPUT".to_string(),
"validate_schema".to_string(),
];
assert_eq!(
extract_dominant_verb(&methods),
Some("Validate".to_string())
);
}
#[test]
fn test_capitalize_first() {
assert_eq!(capitalize_first("hello"), "Hello");
assert_eq!(capitalize_first("WORLD"), "WORLD");
assert_eq!(capitalize_first(""), "");
}
}