/// Collect candidate code blocks from `lines` of the file at `path`, using
/// the extraction strategy selected by `detection_type`.
///
/// Returns tuples of (content hash, file path, 1-based start line,
/// 1-based end line, normalized content).
fn extract_blocks(
    lines: &[&str],
    path: &Path,
    min_lines: usize,
    max_tokens: usize,
    detection_type: crate::cli::DuplicateType,
) -> Vec<(String, String, usize, usize, String)> {
    let file_str = path.to_string_lossy().to_string();
    let mut blocks = Vec::new();
    // Dispatch to the strategy-specific extractor; any other variant
    // produces no candidate blocks.
    match detection_type {
        crate::cli::DuplicateType::Exact => {
            extract_exact_blocks(&mut blocks, lines, &file_str, min_lines, max_tokens)
        }
        crate::cli::DuplicateType::Fuzzy => {
            extract_fuzzy_blocks(&mut blocks, lines, &file_str, min_lines, max_tokens)
        }
        _ => {}
    }
    blocks
}
/// Slide a fixed-size window of `min_lines` lines over the file and record
/// every window whose normalized content fits within `max_tokens`.
///
/// Each entry pushed onto `blocks` is (content hash, file path, 1-based
/// start line, 1-based end line, normalized content).
fn extract_exact_blocks(
    blocks: &mut Vec<(String, String, usize, usize, String)>,
    lines: &[&str],
    file_str: &str,
    min_lines: usize,
    max_tokens: usize,
) {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    // A zero-sized window would emit an empty block at every line (and
    // `slice::windows` panics on 0), so treat it as "nothing to extract".
    if min_lines == 0 {
        return;
    }
    // `windows` yields all `len - min_lines + 1` windows. The previous
    // `0..len.saturating_sub(min_lines)` loop dropped the final window
    // (off-by-one); `windows` also yields nothing when the file is shorter
    // than the window, avoiding any out-of-bounds slice.
    for (i, window) in lines.windows(min_lines).enumerate() {
        let content = normalize_block(window);
        if count_tokens(&content) <= max_tokens {
            let mut hasher = DefaultHasher::new();
            content.hash(&mut hasher);
            let hash = format!("{:x}", hasher.finish());
            blocks.push((hash, file_str.to_string(), i + 1, i + min_lines, content));
        }
    }
}
/// Extract blocks anchored at heuristic block-start lines (function/type
/// declarations, `{` openers), extending each to its matching closing brace.
///
/// Blocks shorter than `min_lines` or wider than `max_tokens` are skipped.
/// Each entry pushed onto `blocks` is (content hash, file path, 1-based
/// start line, 1-based end line, normalized content).
fn extract_fuzzy_blocks(
    blocks: &mut Vec<(String, String, usize, usize, String)>,
    lines: &[&str],
    file_str: &str,
    min_lines: usize,
    max_tokens: usize,
) {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut i = 0;
    while i < lines.len() {
        if is_block_start(lines[i]) {
            // Fall back to a `min_lines`-long span when no closing brace is
            // found, clamped to the file length: the unclamped end previously
            // made `&lines[i..end]` panic on an unterminated block near EOF.
            let end = (i + find_block_end(&lines[i..]).unwrap_or(min_lines)).min(lines.len());
            if end - i >= min_lines {
                let block_lines = &lines[i..end];
                let content = normalize_block(block_lines);
                if count_tokens(&content) <= max_tokens {
                    let mut hasher = DefaultHasher::new();
                    content.hash(&mut hasher);
                    let hash = format!("{:x}", hasher.finish());
                    blocks.push((hash, file_str.to_string(), i + 1, end, content));
                }
            }
            // Always advance at least one line so a zero-length span
            // (min_lines == 0 with no closing brace) cannot loop forever.
            i = end.max(i + 1);
        } else {
            i += 1;
        }
    }
}
/// Trim every line, drop blank lines and `//` / `#` comment lines, and join
/// the survivors with newlines, yielding a whitespace-insensitive block body.
fn normalize_block(lines: &[&str]) -> String {
    let mut kept: Vec<&str> = Vec::with_capacity(lines.len());
    for raw in lines {
        let line = raw.trim();
        if line.is_empty() || line.starts_with("//") || line.starts_with('#') {
            continue;
        }
        kept.push(line);
    }
    kept.join("\n")
}
/// Number of whitespace-separated tokens in `content`.
fn count_tokens(content: &str) -> usize {
    content.split_whitespace().fold(0, |n, _| n + 1)
}
/// True when `line` (after trimming) looks like the first line of a code
/// block: a function declaration, a type declaration, or a `{`-opening line.
fn is_block_start(line: &str) -> bool {
    let trimmed = line.trim();
    is_function_declaration(trimmed) || is_type_declaration(trimmed) || is_block_opening(trimmed)
}
/// Heuristic: the line mentions a Rust/JavaScript/Python function keyword.
fn is_function_declaration(line: &str) -> bool {
    ["fn ", "function", "def "].iter().any(|kw| line.contains(kw))
}
/// Heuristic: the line mentions a class/struct/impl type-declaration keyword.
fn is_type_declaration(line: &str) -> bool {
    ["class ", "struct ", "impl "].iter().any(|kw| line.contains(kw))
}
/// True for a line that ends with `{` without being a bare `{`-led line
/// (a lone brace carries no block-start context of its own).
fn is_block_opening(line: &str) -> bool {
    if line.starts_with('{') {
        return false;
    }
    line.ends_with('{')
}
/// Scan `lines` counting `{` and `}` characters; once at least one `{` has
/// been seen, return the number of lines consumed when the braces balance
/// back to zero. Returns `None` if the block never closes within `lines`.
///
/// Note: braces inside string literals or comments are counted too — this
/// is a heuristic, not a parser.
fn find_block_end(lines: &[&str]) -> Option<usize> {
    let mut depth: i32 = 0;
    let mut opened = false;
    for (idx, line) in lines.iter().enumerate() {
        for ch in line.chars() {
            if ch == '{' {
                depth += 1;
                opened = true;
            } else if ch == '}' {
                depth -= 1;
                // Only report balance after an opening brace has appeared,
                // so a stray leading `}` does not terminate the scan.
                if opened && depth == 0 {
                    return Some(idx + 1);
                }
            }
        }
    }
    None
}
/// Group extracted blocks by content hash and report every hash that occurs
/// in more than one location, sorted by block line count, largest first.
///
/// `_threshold` is unused: hash grouping only finds exact matches, so the
/// reported similarity is always 1.0.
fn find_duplicate_blocks(
    all_blocks: Vec<(String, String, usize, usize, String)>,
    _threshold: f32,
) -> Vec<DuplicateBlock> {
    // Bucket locations by their content hash.
    let mut hash_groups: HashMap<String, Vec<(String, usize, usize, String)>> = HashMap::new();
    for (hash, file, start, end, content) in all_blocks {
        hash_groups
            .entry(hash)
            .or_default()
            .push((file, start, end, content));
    }
    let mut duplicates: Vec<DuplicateBlock> = hash_groups
        .into_iter()
        .filter(|(_, locations)| locations.len() > 1)
        .map(|(hash, locations)| {
            // Every location in a group has identical content, so measure
            // size and token count from the first one.
            let (_, first_start, first_end, first_content) = &locations[0];
            let lines = first_end - first_start + 1;
            let tokens = count_tokens(first_content);
            let duplicate_locations: Vec<DuplicateLocation> = locations
                .into_iter()
                .map(|(file, start, end, content)| {
                    // Preview is capped at three lines, with a trailing
                    // ellipsis when content was truncated.
                    let mut preview = content.lines().take(3).collect::<Vec<_>>().join("\n");
                    if content.lines().count() > 3 {
                        preview.push_str("...");
                    }
                    DuplicateLocation {
                        file,
                        start_line: start,
                        end_line: end,
                        content_preview: preview,
                    }
                })
                .collect();
            DuplicateBlock {
                hash,
                locations: duplicate_locations,
                lines,
                tokens,
                similarity: 1.0,
            }
        })
        .collect();
    duplicates.sort_by(|a, b| b.lines.cmp(&a.lines));
    duplicates
}
/// Decide whether `path` passes the optional substring filters:
/// an `exclude` match always rejects; otherwise an `include` filter,
/// when present, must match; with no filters, everything passes.
fn should_process_file(path: &Path, include: &Option<String>, exclude: &Option<String>) -> bool {
    let path_str = path.to_string_lossy();
    let excluded = exclude
        .as_ref()
        .map_or(false, |excl| path_str.contains(excl.as_str()));
    if excluded {
        return false;
    }
    match include {
        Some(incl) => path_str.contains(incl.as_str()),
        None => true,
    }
}
/// True when `path` carries one of the recognized source-code extensions.
fn is_source_file(path: &Path) -> bool {
    const SOURCE_EXTENSIONS: [&str; 9] = ["rs", "js", "ts", "py", "java", "cpp", "c", "kt", "kts"];
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => SOURCE_EXTENSIONS.contains(&ext),
        None => false,
    }
}