use crate::ast_fingerprint::AstFingerprint;
use crate::compare_functions;
use crate::function_extractor::{extract_functions, FunctionDefinition, SimilarityResult};
use crate::tsed::TSEDOptions;
#[derive(Debug, Clone)]
pub struct FastSimilarityOptions {
pub fingerprint_threshold: f64,
pub similarity_threshold: f64,
pub tsed_options: TSEDOptions,
pub debug_stats: bool,
}
impl Default for FastSimilarityOptions {
fn default() -> Self {
Self {
fingerprint_threshold: 0.5,
similarity_threshold: 0.7,
tsed_options: TSEDOptions::default(),
debug_stats: false,
}
}
}
#[derive(Debug)]
struct FingerprintedFunction {
function: FunctionDefinition,
fingerprint: AstFingerprint,
}
pub fn find_similar_functions_fast(
filename: &str,
source_text: &str,
options: &FastSimilarityOptions,
) -> Result<Vec<SimilarityResult>, String> {
let functions = extract_functions(filename, source_text)?;
let mut fingerprinted = Vec::new();
for func in functions {
if let Some(min_tokens) = options.tsed_options.min_tokens {
let tokens = func.node_count.unwrap_or(0);
if tokens < min_tokens {
continue;
}
} else {
if func.line_count() < options.tsed_options.min_lines {
continue;
}
}
let start = func.body_span.start as usize;
let end = func.body_span.end as usize;
let body = &source_text[start..end];
let fingerprint = match AstFingerprint::from_source(body) {
Ok(fp) => fp,
Err(_) => continue, };
fingerprinted.push(FingerprintedFunction { function: func, fingerprint });
}
let mut similar_pairs = Vec::new();
let mut comparisons_made = 0;
let mut comparisons_skipped = 0;
for i in 0..fingerprinted.len() {
for j in (i + 1)..fingerprinted.len() {
let func1 = &fingerprinted[i];
let func2 = &fingerprinted[j];
if !func1
.fingerprint
.might_be_similar(&func2.fingerprint, options.fingerprint_threshold)
{
comparisons_skipped += 1;
continue;
}
let fp_similarity = func1.fingerprint.similarity(&func2.fingerprint);
if fp_similarity < options.fingerprint_threshold {
comparisons_skipped += 1;
continue;
}
comparisons_made += 1;
let similarity = compare_functions(
&func1.function,
&func2.function,
source_text,
source_text,
&options.tsed_options,
)?;
if similarity >= options.similarity_threshold {
similar_pairs.push(SimilarityResult::new(
func1.function.clone(),
func2.function.clone(),
similarity,
));
}
}
}
if options.debug_stats {
let total = comparisons_made + comparisons_skipped;
if total > 0 {
let skip_rate = (comparisons_skipped as f64 / total as f64) * 100.0;
eprintln!(
"Fast comparison: {} detailed, {} skipped ({:.1}% skip rate)",
comparisons_made, comparisons_skipped, skip_rate
);
}
}
similar_pairs.sort_by(|a, b| {
b.impact
.cmp(&a.impact)
.then(b.similarity.partial_cmp(&a.similarity).unwrap_or(std::cmp::Ordering::Equal))
});
Ok(similar_pairs)
}
pub fn find_similar_functions_across_files_fast(
files: &[(String, String)],
options: &FastSimilarityOptions,
) -> Result<Vec<(String, SimilarityResult, String)>, String> {
let mut all_functions = Vec::new();
for (filename, source) in files {
let functions = extract_functions(filename, source)?;
for func in functions {
if let Some(min_tokens) = options.tsed_options.min_tokens {
let tokens = func.node_count.unwrap_or(0);
if tokens < min_tokens {
continue;
}
} else {
if func.line_count() < options.tsed_options.min_lines {
continue;
}
}
let start = func.body_span.start as usize;
let end = func.body_span.end as usize;
let body = &source[start..end];
let fingerprint = match AstFingerprint::from_source(body) {
Ok(fp) => fp,
Err(_) => continue, };
all_functions.push((
filename.clone(),
source.clone(),
FingerprintedFunction { function: func, fingerprint },
));
}
}
let mut similar_pairs = Vec::new();
let mut comparisons_made = 0;
let mut comparisons_skipped = 0;
for i in 0..all_functions.len() {
for j in (i + 1)..all_functions.len() {
let (file1, source1, func1) = &all_functions[i];
let (file2, source2, func2) = &all_functions[j];
if file1 == file2 {
continue;
}
if !func1
.fingerprint
.might_be_similar(&func2.fingerprint, options.fingerprint_threshold)
{
comparisons_skipped += 1;
continue;
}
let fp_similarity = func1.fingerprint.similarity(&func2.fingerprint);
if fp_similarity < options.fingerprint_threshold {
comparisons_skipped += 1;
continue;
}
comparisons_made += 1;
let similarity = compare_functions(
&func1.function,
&func2.function,
source1,
source2,
&options.tsed_options,
)?;
if similarity >= options.similarity_threshold {
similar_pairs.push((
file1.clone(),
SimilarityResult::new(
func1.function.clone(),
func2.function.clone(),
similarity,
),
file2.clone(),
));
}
}
}
if options.debug_stats {
let total = comparisons_made + comparisons_skipped;
if total > 0 {
let skip_rate = (comparisons_skipped as f64 / total as f64) * 100.0;
eprintln!(
"Fast cross-file comparison: {} detailed, {} skipped ({:.1}% skip rate)",
comparisons_made, comparisons_skipped, skip_rate
);
}
}
similar_pairs.sort_by(|(_, a, _), (_, b, _)| {
b.impact
.cmp(&a.impact)
.then(b.similarity.partial_cmp(&a.similarity).unwrap_or(std::cmp::Ordering::Equal))
});
Ok(similar_pairs)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fast_similarity() {
let code = r#"
export function add(a: number, b: number): number {
const result = a + b;
return result;
}
export function sum(x: number, y: number): number {
const result = x + y;
return result;
}
export function multiply(a: number, b: number): number {
const result = a * b;
return result;
}
"#;
let options = FastSimilarityOptions {
fingerprint_threshold: 0.2,
similarity_threshold: 0.25,
tsed_options: TSEDOptions { min_lines: 1, ..Default::default() },
debug_stats: true,
};
let result = find_similar_functions_fast("test.ts", code, &options);
assert!(result.is_ok());
let pairs = result.unwrap();
if pairs.is_empty() {
let debug_options = FastSimilarityOptions {
fingerprint_threshold: 0.0,
similarity_threshold: 0.0,
tsed_options: TSEDOptions { min_lines: 1, ..Default::default() },
debug_stats: false,
};
let debug_result =
find_similar_functions_fast("test.ts", code, &debug_options).unwrap();
for pair in &debug_result {
println!(
"{} ~ {}: {:.2}%",
pair.func1.name,
pair.func2.name,
pair.similarity * 100.0
);
}
}
assert!(!pairs.is_empty(), "No similar pairs found");
let found_add_sum = pairs.iter().any(|p| {
(p.func1.name == "add" && p.func2.name == "sum")
|| (p.func1.name == "sum" && p.func2.name == "add")
});
assert!(found_add_sum);
}
#[test]
fn test_identical_functions_always_pass_fingerprint() {
let code1 = "function test(a: number, b: number): number { return a + b; }";
let code2 = "function test(a: number, b: number): number { return a + b; }";
let fp1 = AstFingerprint::from_source(code1).expect("Failed to create fingerprint");
let fp2 = AstFingerprint::from_source(code2).expect("Failed to create fingerprint");
assert!(fp1.might_be_similar(&fp2, 0.9));
assert!(fp1.might_be_similar(&fp2, 0.5));
assert!(fp1.might_be_similar(&fp2, 0.1));
assert_eq!(fp1.similarity(&fp2), 1.0);
}
}