similarity_core/
cli_parallel.rs

1use crate::TSEDOptions;
2use rayon::prelude::*;
3use std::fs;
4use std::path::PathBuf;
5
/// A source file bundled with the functions that were extracted from it.
///
/// The function representation `F` is left generic so the same container can
/// be shared by extractors for different languages.
#[derive(Debug)]
pub struct FileData<F> {
    /// Location of the file on disk.
    pub path: PathBuf,
    /// Text of the file as it was read.
    pub content: String,
    /// Functions found in `content`.
    pub functions: Vec<F>,
}
13
/// Strategy for pulling function definitions out of a piece of source text.
///
/// Implementors decide what a "function" is for their language and how it is
/// represented.
pub trait FunctionExtractor {
    /// Representation of a single extracted function.
    ///
    /// It must be cloneable and safe to send/share between threads.
    type Function: Clone + Send + Sync;

    /// Extracts every function found in `content`.
    ///
    /// `filename` names the file that `content` was read from; how it is used
    /// is up to the implementor.
    ///
    /// # Errors
    ///
    /// Returns an error when the implementor cannot process `content`.
    fn extract_functions(
        &self,
        filename: &str,
        content: &str,
    ) -> Result<Vec<Self::Function>, Box<dyn std::error::Error>>;
}
24
/// A pair of functions together with their similarity score.
///
/// Generic over the function representation `F`.
#[derive(Debug, Clone)]
pub struct SimilarityResult<F> {
    /// First function of the pair.
    pub func1: F,
    /// Second function of the pair.
    pub func2: F,
    /// Similarity score computed for the pair.
    pub similarity: f64,
}

impl<F> SimilarityResult<F> {
    /// Builds a result from two functions and the score computed for them.
    pub fn new(func1: F, func2: F, similarity: f64) -> Self {
        SimilarityResult {
            func1,
            func2,
            similarity,
        }
    }
}
38
39/// Trait for finding similar functions
40pub trait SimilarityChecker {
41    type Function: Clone + Send + Sync;
42
43    fn find_similar_in_file(
44        &self,
45        filename: &str,
46        content: &str,
47        threshold: f64,
48        options: &TSEDOptions,
49        fast_mode: bool,
50    ) -> Result<Vec<SimilarityResult<Self::Function>>, Box<dyn std::error::Error>>;
51
52    fn compare_functions(
53        &self,
54        func1: &Self::Function,
55        func2: &Self::Function,
56        content1: &str,
57        content2: &str,
58        options: &TSEDOptions,
59    ) -> Result<f64, Box<dyn std::error::Error>>;
60}
61
62/// Load and parse files in parallel using a generic extractor
63pub fn load_files_parallel<E>(files: &[PathBuf], extractor: &E) -> Vec<FileData<E::Function>>
64where
65    E: FunctionExtractor + Sync,
66    E::Function: Send,
67{
68    files
69        .par_iter()
70        .filter_map(|file| {
71            match fs::read_to_string(file) {
72                Ok(content) => {
73                    let filename = file.to_string_lossy();
74                    // Extract functions, skip if parse error
75                    match extractor.extract_functions(&filename, &content) {
76                        Ok(functions) => Some(FileData { path: file.clone(), content, functions }),
77                        Err(_) => None, // Skip files with parse errors
78                    }
79                }
80                Err(e) => {
81                    eprintln!("Error reading {}: {}", file.display(), e);
82                    None
83                }
84            }
85        })
86        .collect()
87}
88
89/// Check for duplicates within files in parallel
90pub fn check_within_file_duplicates_parallel<S>(
91    files: &[PathBuf],
92    threshold: f64,
93    options: &TSEDOptions,
94    fast_mode: bool,
95    checker: &S,
96) -> Vec<(PathBuf, Vec<SimilarityResult<S::Function>>)>
97where
98    S: SimilarityChecker + Sync,
99{
100    files
101        .par_iter()
102        .filter_map(|file| match fs::read_to_string(file) {
103            Ok(code) => {
104                let file_str = file.to_string_lossy();
105
106                match checker.find_similar_in_file(&file_str, &code, threshold, options, fast_mode)
107                {
108                    Ok(pairs) if !pairs.is_empty() => Some((file.clone(), pairs)),
109                    _ => None,
110                }
111            }
112            Err(_) => None,
113        })
114        .collect()
115}
116
117/// Check for duplicates across files using parallel processing
118pub fn check_cross_file_duplicates_parallel<S>(
119    file_data: &[FileData<S::Function>],
120    threshold: f64,
121    options: &TSEDOptions,
122    checker: &S,
123) -> Vec<(String, SimilarityResult<S::Function>, String)>
124where
125    S: SimilarityChecker + Sync,
126    S::Function: Clone + Send + Sync,
127{
128    // Prepare all function pairs with file information
129    let mut all_functions = Vec::new();
130    for data in file_data {
131        let filename = data.path.to_string_lossy().to_string();
132        for func in &data.functions {
133            all_functions.push((filename.clone(), data.content.clone(), func.clone()));
134        }
135    }
136
137    // Generate all cross-file pairs
138    let mut pairs_to_check = Vec::new();
139    for i in 0..all_functions.len() {
140        for j in (i + 1)..all_functions.len() {
141            let (file1, _, _) = &all_functions[i];
142            let (file2, _, _) = &all_functions[j];
143
144            // Only check across different files
145            if file1 != file2 {
146                pairs_to_check.push((i, j));
147            }
148        }
149    }
150
151    // Process pairs in parallel
152    pairs_to_check
153        .into_par_iter()
154        .filter_map(|(i, j)| {
155            let (file1, content1, func1) = &all_functions[i];
156            let (file2, content2, func2) = &all_functions[j];
157
158            // Use checker's compare_functions
159            match checker.compare_functions(func1, func2, content1, content2, options) {
160                Ok(similarity) => {
161                    if similarity >= threshold {
162                        Some((
163                            file1.clone(),
164                            SimilarityResult::new(func1.clone(), func2.clone(), similarity),
165                            file2.clone(),
166                        ))
167                    } else {
168                        None
169                    }
170                }
171                Err(_) => None,
172            }
173        })
174        .collect()
175}