impl PatternExtractor {
    /// Scores how much the surrounding text varies across regex matches.
    ///
    /// Samples up to 5 matches, takes ~20 bytes of context on each side of
    /// every match (snapped to UTF-8 char boundaries so slicing cannot
    /// panic), then returns `1.0 - mean pairwise Jaccard similarity` of
    /// those context strings. Returns `0.0` when there are fewer than two
    /// matches. Result is in `[0.0, 1.0]`; higher means more variation.
    fn calculate_variation_score(&self, matches: &[regex::Match], content: &str) -> f64 {
        if matches.len() <= 1 {
            return 0.0;
        }
        let contexts: Vec<String> = matches
            .iter()
            .take(5)
            .map(|m| {
                // Raw byte window around the match; these offsets may land
                // in the middle of a multi-byte character.
                let start = m.start().saturating_sub(20);
                let end = (m.end() + 20).min(content.len());
                // First char boundary at or after `start`.
                let start_char = content
                    .char_indices()
                    .find(|(i, _)| *i >= start)
                    .map_or(start, |(i, _)| i);
                // End of the last char starting at or before `end`; when
                // `end == content.len()` this resolves to `content.len()`.
                let end_char = content
                    .char_indices()
                    .rev()
                    .find(|(i, _)| *i <= end)
                    .map_or(end, |(i, c)| i + c.len_utf8());
                // `get` guards the degenerate start > end case with "".
                content
                    .get(start_char..end_char)
                    .unwrap_or_default()
                    .to_string()
            })
            .collect();
        // Average similarity over every unordered pair of contexts.
        let mut total_similarity = 0.0;
        // u32: comparison count is never negative; also keeps the lossless
        // `f64::from` conversion below.
        let mut comparisons: u32 = 0;
        for i in 0..contexts.len() {
            for j in (i + 1)..contexts.len() {
                total_similarity +=
                    self.calculate_string_similarity(&contexts[i], &contexts[j]);
                comparisons += 1;
            }
        }
        if comparisons > 0 {
            1.0 - (total_similarity / f64::from(comparisons))
        } else {
            0.0
        }
    }

    /// Jaccard similarity of the whitespace-separated word sets of `s1` and
    /// `s2`: |intersection| / |union|, or `0.0` when both are empty.
    fn calculate_string_similarity(&self, s1: &str, s2: &str) -> f64 {
        let words1: std::collections::HashSet<&str> = s1.split_whitespace().collect();
        let words2: std::collections::HashSet<&str> = s2.split_whitespace().collect();
        let intersection = words1.intersection(&words2).count();
        let union = words1.union(&words2).count();
        if union == 0 {
            // Both strings empty/whitespace-only: treat as no similarity.
            0.0
        } else {
            intersection as f64 / union as f64
        }
    }

    /// Intentional no-op placeholder; variation scoring is currently done
    /// per-pattern elsewhere. Kept so callers compile unchanged.
    fn calculate_pattern_variations(&self, _collection: &mut PatternCollection) {}

    /// Hashes normalized AST/snippet text to a hex string used as a
    /// structural-grouping key.
    ///
    /// NOTE(review): `DefaultHasher`'s algorithm is unspecified and may
    /// change between Rust releases — fine for in-process grouping, but do
    /// not persist these hashes across builds.
    fn hash_pattern(&self, ast_data: &str) -> String {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        ast_data.hash(&mut hasher);
        format!("{:x}", hasher.finish())
    }

    /// Normalizes a code snippet so structurally identical code hashes the
    /// same: string literals -> `"STR"`, integer literals -> `N`,
    /// non-keyword identifiers -> `IDENT`, runs of whitespace -> one space.
    ///
    /// NOTE(review): the string-literal regex does not handle escaped
    /// quotes (`"a\"b"`), and the four regexes are recompiled on every
    /// call — acceptable while callers cap match counts, but worth caching
    /// (e.g. `LazyLock`) if this becomes hot.
    fn normalize_code_snippet(snippet: &str) -> String {
        use regex::Regex;
        let trimmed = snippet.trim();
        let re_string = Regex::new(r#""[^"]*""#).expect("valid regex");
        let normalized = re_string.replace_all(trimmed, "\"STR\"");
        let re_num = Regex::new(r"\b\d+\b").expect("valid regex");
        let normalized = re_num.replace_all(&normalized, "N");
        let re_ident = Regex::new(r"\b[a-zA-Z_]\w*\b").expect("valid regex");
        // Words preserved verbatim: Rust keywords, common std names, and
        // the sentinel tokens introduced by the two passes above.
        let keywords: std::collections::HashSet<&str> = [
            "if", "else", "match", "for", "while", "let", "mut", "fn", "return", "true", "false",
            "self", "Ok", "Err", "Some", "None", "Result", "Option", "Vec", "String", "impl",
            "pub", "struct", "enum", "async", "await", "unsafe", "use", "mod", "const", "static",
            "type", "where", "trait", "loop", "break", "continue", "ref", "in", "as", "crate",
            "super", "dyn", "move", "extern", "STR", "N",
        ]
        .into_iter()
        .collect();
        // BUGFIX: `®ex::Captures` was mojibake for `&regex::Captures`
        // (`&reg` swallowed into the `®` entity) — this did not compile.
        let normalized = re_ident.replace_all(&normalized, |caps: &regex::Captures| {
            let word = caps.get(0).expect("group 0").as_str();
            if keywords.contains(word) {
                word.to_string()
            } else {
                "IDENT".to_string()
            }
        });
        let re_ws = Regex::new(r"\s+").expect("valid regex");
        re_ws.replace_all(&normalized, " ").to_string()
    }

    /// Returns the full source line(s) containing the match `m`, without
    /// the trailing newline.
    ///
    /// BUGFIX: parameter type was mojibake `®ex::Match`; restored to
    /// `&regex::Match`. Indexing `content[..m.start()]` is safe because
    /// regex match offsets are always valid char boundaries.
    fn extract_match_context(content: &str, m: &regex::Match) -> String {
        let line_start = content[..m.start()].rfind('\n').map_or(0, |p| p + 1);
        let line_end = content[m.end()..]
            .find('\n')
            .map_or(content.len(), |p| m.end() + p);
        content
            .get(line_start..line_end)
            .unwrap_or_default()
            .to_string()
    }

    /// Groups matches by the structural hash of their normalized line
    /// context and records every group of at least `min_group_size` as an
    /// `AstPattern` in `collection`.
    ///
    /// Caps: at most 20 matches examined, 10 locations stored per pattern,
    /// `frequency` capped at 10, and the example snippet truncated to 100
    /// chars. `estimated_loc` uses the *uncapped* group size.
    /// NOTE(review): iteration over the HashMap makes the order patterns
    /// are added to `collection` nondeterministic — confirm callers don't
    /// rely on ordering.
    #[allow(clippy::too_many_arguments)]
    fn group_by_structural_hash(
        &self,
        matches: &[regex::Match],
        content: &str,
        file_path: &Path,
        pattern_type: PatternType,
        min_group_size: usize,
        loc_per_match: usize,
        collection: &mut PatternCollection,
    ) {
        let mut groups: HashMap<String, Vec<(usize, String)>> = HashMap::new();
        for m in matches.iter().take(20) {
            let context = Self::extract_match_context(content, m);
            let normalized = Self::normalize_code_snippet(&context);
            let structural_hash = self.hash_pattern(&normalized);
            // 1-based line number of the match start.
            let line_num = content.get(..m.start()).unwrap_or_default().lines().count() + 1;
            groups
                .entry(structural_hash)
                .or_default()
                .push((line_num, context));
        }
        for (hash, group) in &groups {
            if group.len() >= min_group_size {
                let locations: Vec<Location> = group
                    .iter()
                    .take(10)
                    .map(|(line, _)| Location {
                        file: file_path.to_owned(),
                        line: *line,
                        column: 1,
                    })
                    .collect();
                let example_code = group
                    .first()
                    .map(|(_, ctx)| ctx.chars().take(100).collect::<String>())
                    .unwrap_or_default();
                let pattern = AstPattern {
                    pattern_type,
                    pattern_hash: hash.clone(),
                    frequency: group.len().min(10),
                    locations,
                    variation_score: 0.0,
                    example_code,
                    estimated_loc: group.len() * loc_per_match,
                };
                collection.add_pattern(pattern);
            }
        }
    }
}