use std::collections::HashMap;
#[cfg(test)]
use std::collections::HashSet;
use serde::{Deserialize, Serialize};
use xxhash_rust::xxh3::xxh3_64;
use crate::graph::ControlFlowGraph;
use super::minhash::MinHashDetector;
use super::types::CloneType;
/// A contiguous span of source lines treated as a candidate for clone detection.
///
/// Produced by [`BlockCloneDetector::extract_blocks`] (textual heuristics) or
/// [`BlockCloneDetector::extract_blocks_from_cfg`] (CFG basic blocks).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Path of the file the block was extracted from.
    pub file: String,
    /// Name of the enclosing function, when one could be identified.
    pub function_name: Option<String>,
    /// 1-based first line of the block (inclusive).
    pub start_line: usize,
    /// 1-based last line of the block (inclusive).
    pub end_line: usize,
    /// Raw text of the block, lines joined with '\n'.
    pub content: String,
    /// xxh3 hash of the normalized content (trimmed, lowercased, whitespace-collapsed);
    /// equal hashes identify exact (Type-1) clones up to whitespace/case.
    pub block_hash: u64,
}
impl CodeBlock {
    /// Number of source lines the block spans, counting both endpoints.
    pub fn lines(&self) -> usize {
        // saturating_sub guards against an inverted start/end pair underflowing.
        1 + self.end_line.saturating_sub(self.start_line)
    }
}
/// A set of [`CodeBlock`]s judged to be clones of one another.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlockCloneGroup {
    /// Type-1 when every instance shares the same normalized hash, else Type-3.
    pub block_type: CloneType,
    /// The blocks participating in the group (at least two).
    pub instances: Vec<CodeBlock>,
    /// Estimated similarity reported by the MinHash grouping; merging groups
    /// keeps the minimum across merged parts.
    pub similarity: f64,
}
/// Keywords that can open a control-flow or definition block across the
/// languages this detector targets (Python, JS/TS, Rust, Go, Ruby, ...).
const BLOCK_START_KEYWORDS: &[&str] = &[
    "if", "else", "elif", "else if", "for", "while", "do", "try", "catch", "except", "finally",
    "match", "switch", "case", "def", "fn", "func", "function",
];

/// Returns true when `trimmed` begins with one of [`BLOCK_START_KEYWORDS`]
/// followed by a non-identifier character (or nothing), so that e.g.
/// `format(...)` or `ifx` do not count as block starts.
fn starts_with_block_keyword(trimmed: &str) -> bool {
    BLOCK_START_KEYWORDS.iter().any(|kw| {
        trimmed.strip_prefix(kw).map_or(false, |rest| {
            rest.is_empty() || rest.starts_with(|c: char| !c.is_alphanumeric() && c != '_')
        })
    })
}
/// Detects duplicated sub-function code blocks across a set of files.
pub struct BlockCloneDetector {
    // Blocks spanning fewer lines than this are ignored.
    min_block_lines: usize,
    // Jaccard-style similarity threshold forwarded to the MinHash detector.
    similarity_threshold: f64,
    // Shingle (token n-gram) size used when building MinHash signatures.
    shingle_size: usize,
}
impl BlockCloneDetector {
/// Create a detector with explicit minimum block size and similarity threshold.
pub fn new(min_block_lines: usize, similarity_threshold: f64) -> Self {
    // Shingle size of 3 is a fixed tuning choice for block-level matching.
    let shingle_size = 3;
    Self {
        min_block_lines,
        similarity_threshold,
        shingle_size,
    }
}

/// Default configuration: blocks of at least 4 lines, 0.6 similarity.
pub fn with_defaults() -> Self {
    Self::new(4, 0.6)
}
/// Extract candidate code blocks from `source` using purely textual heuristics.
///
/// Every line whose trimmed text starts with a block keyword opens a candidate;
/// its extent is resolved by `find_block_end`. Nested blocks are extracted
/// independently because scanning resumes on the very next line.
pub fn extract_blocks(&self, source: &str, file_path: &str) -> Vec<CodeBlock> {
    let lines: Vec<&str> = source.lines().collect();
    let mut blocks = Vec::new();
    for start_idx in 0..lines.len() {
        let trimmed = lines[start_idx].trim();
        if trimmed.is_empty() || !starts_with_block_keyword(trimmed) {
            continue;
        }
        // end_idx is exclusive as a 0-based index, i.e. the 1-based last line.
        let end_idx = self.find_block_end(&lines, start_idx);
        if end_idx - start_idx < self.min_block_lines {
            continue;
        }
        let content = lines[start_idx..end_idx].join("\n");
        let normalized = normalize_block_content(&content);
        blocks.push(CodeBlock {
            file: file_path.to_string(),
            function_name: self.find_enclosing_function(&lines, start_idx),
            start_line: start_idx + 1,
            end_line: end_idx,
            block_hash: xxh3_64(normalized.as_bytes()),
            content,
        });
    }
    blocks
}
/// Extract code blocks from the basic blocks of a control-flow graph.
///
/// Falls back to textual extraction when no CFG is available. Each non-entry,
/// non-exit basic block with at least one statement becomes a candidate whose
/// line range is the union of its statement spans.
pub fn extract_blocks_from_cfg(
    &self,
    source: &str,
    file_path: &str,
    cfg: Option<&ControlFlowGraph>,
) -> Vec<CodeBlock> {
    // Without a CFG, use the purely textual heuristics instead.
    let cfg = match cfg {
        Some(c) => c,
        None => return self.extract_blocks(source, file_path),
    };
    let mut blocks = Vec::new();
    let lines: Vec<&str> = source.lines().collect();
    for (_node_id, basic_block) in cfg.blocks() {
        // Synthetic entry/exit nodes and empty blocks carry no source text.
        if basic_block.is_entry || basic_block.is_exit || basic_block.statements.is_empty() {
            continue;
        }
        // Compute the 1-based line range covered by all statement spans.
        // NOTE(review): assumes spans are byte offsets into `source` — this
        // matches byte_offset_to_line below; confirm against the CFG builder.
        let mut min_line = usize::MAX;
        let mut max_line = 0usize;
        for span in &basic_block.statements {
            let start_line = byte_offset_to_line(source, span.start);
            // span.end looks exclusive: step back one byte, but never before
            // span.start (guards zero-length spans).
            let end_line =
                byte_offset_to_line(source, span.end.saturating_sub(1).max(span.start));
            min_line = min_line.min(start_line);
            max_line = max_line.max(end_line);
        }
        // No span contributed a line — nothing to extract.
        if min_line == usize::MAX || max_line == 0 {
            continue;
        }
        let line_count = max_line - min_line + 1;
        if line_count < self.min_block_lines {
            continue;
        }
        // Convert to 0-based slice bounds, clamping the end to the file length.
        let start_idx = min_line.saturating_sub(1);
        let end_idx = max_line.min(lines.len());
        let content: String = lines[start_idx..end_idx].join("\n");
        let normalized = normalize_block_content(&content);
        let block_hash = xxh3_64(normalized.as_bytes());
        blocks.push(CodeBlock {
            file: file_path.to_string(),
            // CFG extraction knows the owning function directly.
            function_name: Some(cfg.function_name.clone()),
            start_line: min_line,
            end_line: max_line,
            content,
            block_hash,
        });
    }
    blocks
}
/// Determine where the block starting at `start_idx` ends, returning an
/// exclusive 0-based index (equivalently, the 1-based last line).
///
/// Three heuristics, tried in order:
/// 1. Brace-delimited (C/JS/Rust/Go): if a '{' appears within the first three
///    lines, match brace depth and include the closing-brace line.
/// 2. Ruby-style `def`/`do`/`begin` ... `end`: include the matching `end` line.
/// 3. Indentation-delimited (Python): the block ends just before the first
///    non-blank, non-comment line indented at or below the start line.
///
/// NOTE(review): brace counting does not skip braces inside string literals or
/// comments — acceptable as a heuristic, but worth confirming for sources with
/// brace-heavy strings.
fn find_block_end(&self, lines: &[&str], start_idx: usize) -> usize {
    let start_line = lines[start_idx];
    let start_indent = start_line.len() - start_line.trim_start().len();
    // Opening brace may sit on the start line or be pushed to the next
    // couple of lines (Allman style), hence the 3-line window.
    let has_brace = lines[start_idx..].iter().take(3).any(|l| l.contains('{'));
    if has_brace {
        let mut depth = 0i32;
        for (offset, line) in lines[start_idx..].iter().enumerate() {
            for ch in line.chars() {
                if ch == '{' {
                    depth += 1;
                } else if ch == '}' {
                    depth -= 1;
                    // Depth back to zero: this line closes the block.
                    if depth == 0 {
                        return start_idx + offset + 1;
                    }
                }
            }
        }
        // Unbalanced braces — consume to end of file.
        lines.len()
    } else {
        let trimmed_start = start_line.trim();
        // `def`/`do`/`begin` without ':' or '{' suggests an end-delimited
        // language (Ruby, Lua-style).
        let is_ruby_end_style = (trimmed_start.starts_with("def ")
            || trimmed_start.starts_with("do")
            || trimmed_start.starts_with("begin"))
            && !trimmed_start.ends_with(':')
            && !trimmed_start.contains('{');
        if is_ruby_end_style {
            for (idx, line) in lines.iter().enumerate().skip(start_idx + 1) {
                let trimmed = line.trim();
                if trimmed == "end" {
                    let indent = line.len() - line.trim_start().len();
                    // Only an `end` at (or above) the block's own indent
                    // level closes it; deeper `end`s belong to nested blocks.
                    if indent <= start_indent {
                        return idx + 1;
                    }
                }
            }
            lines.len()
        } else {
            // Indentation-based: scan until dedent. Blank and '#'-comment
            // lines never terminate a block.
            for (idx, line) in lines.iter().enumerate().skip(start_idx + 1) {
                let trimmed = line.trim();
                if trimmed.is_empty() || trimmed.starts_with('#') {
                    continue;
                }
                let indent = line.len() - line.trim_start().len();
                // Dedent line is NOT part of the block (exclusive end).
                if indent <= start_indent {
                    return idx;
                }
            }
            lines.len()
        }
    }
}
/// Scan upward from `block_start` for the nearest less-indented line that
/// looks like a function definition; returns its name if found.
fn find_enclosing_function(&self, lines: &[&str], block_start: usize) -> Option<String> {
    fn indent_of(line: &str) -> usize {
        line.len() - line.trim_start().len()
    }
    let block_indent = indent_of(lines[block_start]);
    // Walk backwards, keeping only non-blank lines that are indented less
    // than the block itself, and take the first that parses as a definition.
    lines[..block_start]
        .iter()
        .rev()
        .copied()
        .filter(|line| !line.trim().is_empty())
        .filter(|line| indent_of(line) < block_indent)
        .find_map(|line| extract_function_name(line.trim()))
}
/// Detect clone groups among blocks extracted from `files`
/// (`(path, source)` pairs).
///
/// Builds one MinHash signature per extracted block, lets the MinHash
/// detector form raw groups, then maps those back to [`CodeBlock`]s and
/// classifies each group as Type-1 (all normalized hashes equal) or Type-3.
pub fn detect_block_clones(&self, files: &[(String, String)]) -> Vec<BlockCloneGroup> {
    use crate::clones::minhash::FunctionSignature;

    let minhash = MinHashDetector::new(128, self.shingle_size, self.similarity_threshold);

    // Gather candidate blocks from every file.
    let mut all_blocks: Vec<CodeBlock> = Vec::new();
    for (path, source) in files {
        all_blocks.extend(self.extract_blocks(source, path));
    }
    if all_blocks.len() < 2 {
        return Vec::new();
    }

    // One signature per block. A block with no shingles simply produces an
    // empty sketch, so no special case is needed (the original duplicated
    // this entire push for the empty-shingle arm with identical fields).
    let signatures: Vec<FunctionSignature> = all_blocks
        .iter()
        .map(|block| {
            let shingle_hashes = minhash.compute_shingles(&block.content);
            let sketch = minhash.build_sketch(&shingle_hashes);
            FunctionSignature {
                file: block.file.clone(),
                name: format!(
                    "block@{}:{}-{}",
                    block.file, block.start_line, block.end_line
                ),
                start_line: block.start_line,
                end_line: block.end_line,
                sketch,
                shingle_hashes,
            }
        })
        .collect();

    let raw_groups = minhash.detect_clones(&signatures);

    // (file, start_line) uniquely identifies a block within this run; use it
    // to map raw group instances back to their owning CodeBlock.
    let block_index: HashMap<(&str, usize), &CodeBlock> = all_blocks
        .iter()
        .map(|b| ((b.file.as_str(), b.start_line), b))
        .collect();

    let mut block_groups: Vec<BlockCloneGroup> = Vec::new();
    for group in &raw_groups {
        let instances: Vec<CodeBlock> = group
            .instances
            .iter()
            .filter_map(|inst| block_index.get(&(inst.file.as_str(), inst.start_line)))
            .map(|b| (*b).clone())
            .collect();
        // A clone group needs at least two surviving instances.
        if instances.len() < 2 {
            continue;
        }
        // Identical normalized hashes across all instances → exact clone.
        let all_same_hash = instances
            .windows(2)
            .all(|w| w[0].block_hash == w[1].block_hash);
        block_groups.push(BlockCloneGroup {
            block_type: if all_same_hash {
                CloneType::Type1
            } else {
                CloneType::Type3
            },
            instances,
            similarity: group.similarity,
        });
    }
    block_groups
}
/// Repeatedly merge clone groups whose instances are adjacent (or nearly so)
/// in the same files, until no further merges are possible.
pub fn merge_adjacent_blocks(&self, groups: &mut Vec<BlockCloneGroup>) {
    if groups.len() < 2 {
        return;
    }
    // Fixed-point iteration: each pass may create new adjacencies, so keep
    // sweeping until a full pass performs no merge.
    loop {
        let mut any_merged = false;
        let mut i = 0;
        while i < groups.len() {
            let mut j = i + 1;
            while j < groups.len() {
                if Self::groups_are_adjacent(&groups[i], &groups[j]) {
                    // Absorb group j into group i; j now indexes the next
                    // candidate, so it is not advanced.
                    let absorbed = groups.remove(j);
                    Self::merge_into(&mut groups[i], &absorbed);
                    any_merged = true;
                } else {
                    j += 1;
                }
            }
            i += 1;
        }
        if !any_merged {
            break;
        }
    }
}
/// Two groups are adjacent when any pair of same-file instances overlaps or
/// sits within `GAP` lines of each other.
fn groups_are_adjacent(a: &BlockCloneGroup, b: &BlockCloneGroup) -> bool {
    const GAP: usize = 2;
    a.instances.iter().any(|x| {
        b.instances.iter().any(|y| {
            if x.file != y.file {
                return false;
            }
            let (a_start, a_end) = (x.start_line, x.end_line);
            let (b_start, b_end) = (y.start_line, y.end_line);
            // Overlap or near-adjacency, checked from both directions.
            (a_end >= b_start && a_end <= b_end + GAP)
                || (b_end >= a_start && b_end <= a_end + GAP)
                || (b_start <= a_end + GAP && b_start >= a_start)
                || (a_start <= b_end + GAP && a_start >= b_start)
        })
    })
}
/// Fold `source`'s instances into `target`: same-file instances are widened
/// to the union of both line ranges (content concatenated, hash recomputed);
/// instances from new files are appended. Similarity is the minimum of both.
fn merge_into(target: &mut BlockCloneGroup, source: &BlockCloneGroup) {
    for incoming in &source.instances {
        match target
            .instances
            .iter_mut()
            .find(|existing| existing.file == incoming.file)
        {
            Some(existing) => {
                existing.start_line = existing.start_line.min(incoming.start_line);
                existing.end_line = existing.end_line.max(incoming.end_line);
                existing.content.push('\n');
                existing.content.push_str(&incoming.content);
                // The merged content is new text, so rehash it.
                existing.block_hash = xxh3_64(existing.content.as_bytes());
            }
            None => target.instances.push(incoming.clone()),
        }
    }
    // Conservative: a merged group is only as similar as its weakest part.
    target.similarity = target.similarity.min(source.similarity);
}
}
/// Convert a byte offset into `source` to a 1-based line number.
///
/// Offsets past the end of `source` are clamped to its length. Newlines are
/// counted over raw bytes, so an offset that lands in the middle of a
/// multi-byte UTF-8 character cannot panic — the original `source[..clamped]`
/// string slice panicked on non-char-boundary offsets, which CFG spans over
/// non-ASCII sources can produce.
fn byte_offset_to_line(source: &str, offset: usize) -> usize {
    let clamped = offset.min(source.len());
    source.as_bytes()[..clamped]
        .iter()
        .filter(|&&b| b == b'\n')
        .count()
        + 1
}
/// Normalize block text for hashing: trim each line, drop blank lines, join
/// with single spaces, and lowercase — so Type-1 clones match regardless of
/// whitespace or letter case.
fn normalize_block_content(content: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    for line in content.lines() {
        let trimmed = line.trim();
        if !trimmed.is_empty() {
            kept.push(trimmed);
        }
    }
    kept.join(" ").to_lowercase()
}
/// Try to parse a function name out of a (trimmed) definition line for the
/// supported languages. Each `(keyword, delimiter)` pair is tried in order;
/// the name is the text between the keyword and the delimiter, accepted only
/// if it is a plain identifier. The `("fn ", "<")` entry handles Rust
/// generics like `fn foo<T>(..)`, where the `(`-delimited attempt fails.
fn extract_function_name(trimmed: &str) -> Option<String> {
    const PATTERNS: &[(&str, &str)] = &[
        ("def ", "("),
        ("fn ", "("),
        ("fn ", "<"),
        ("func ", "("),
        ("function ", "("),
    ];
    PATTERNS.iter().find_map(|&(keyword, delimiter)| {
        let rest = find_after_keyword(trimmed, keyword)?;
        let candidate = rest[..rest.find(delimiter)?].trim();
        let is_identifier = !candidate.is_empty()
            && candidate.chars().all(|c| c.is_alphanumeric() || c == '_');
        if is_identifier {
            Some(candidate.to_string())
        } else {
            None
        }
    })
}

/// Return the text following the first occurrence of `keyword` in `line`,
/// or `None` when it does not occur. Searches anywhere in the line (not just
/// the start) so modifiers like `pub fn` / `async def` still match.
fn find_after_keyword<'a>(line: &'a str, keyword: &str) -> Option<&'a str> {
    line.find(keyword).map(|pos| &line[pos + keyword.len()..])
}
#[cfg(test)]
mod tests {
use super::*;
// NOTE(review): the Python/JS fixture strings below appear to rely on source
// indentation for the indentation-based heuristics — verify the raw-string
// contents' leading whitespace survived any reformatting.
// Indentation-delimited extraction: for/if/while blocks inside a Python
// function should all be found, including nested ones.
#[test]
fn test_extract_blocks_python_if_for_while() {
let source = r#"
def process(items):
total = 0
for item in items:
if item > 0:
total += item
count += 1
log(item)
else:
skip(item)
log_skip(item)
count_skip += 1
while total > 100:
total -= 10
adjust(total)
log(total)
record(total)
return total
"#;
let detector = BlockCloneDetector::with_defaults();
let blocks = detector.extract_blocks(source, "test.py");
let block_starts: Vec<usize> = blocks.iter().map(|b| b.start_line).collect();
assert!(
!blocks.is_empty(),
"Should extract at least some blocks from Python source"
);
let has_for = blocks
.iter()
.any(|b| b.content.contains("for item in items"));
assert!(
has_for,
"Should extract the for-loop block, got starts: {block_starts:?}"
);
let has_if = blocks.iter().any(|b| {
b.content.starts_with("if item > 0")
|| b.content.trim_start().starts_with("if item > 0")
});
assert!(has_if, "Should extract the if block");
let has_while = blocks
.iter()
.any(|b| b.content.contains("while total > 100"));
assert!(has_while, "Should extract the while block");
}
// Brace-delimited extraction: function/for/try blocks in JavaScript.
#[test]
fn test_extract_blocks_javascript_brace_delimited() {
let source = r#"function validate(input) {
if (input === null) {
console.log("null input");
throw new Error("null");
return false;
}
for (let i = 0; i < input.length; i++) {
if (input[i] < 0) {
console.log("negative");
input[i] = 0;
count++;
flag = true;
}
}
try {
process(input);
save(input);
log(input);
notify();
} catch (e) {
handleError(e);
log(e);
cleanup();
retry();
}
return true;
}"#;
let detector = BlockCloneDetector::with_defaults();
let blocks = detector.extract_blocks(source, "test.js");
assert!(
!blocks.is_empty(),
"Should extract blocks from JavaScript source"
);
let has_function = blocks
.iter()
.any(|b| b.content.contains("function validate"));
assert!(has_function, "Should extract the function block");
let has_for = blocks.iter().any(|b| b.content.contains("for (let i"));
assert!(has_for, "Should extract the for-loop block");
let has_try = blocks.iter().any(|b| b.content.contains("try {"));
assert!(has_try, "Should extract the try block");
}
// End-to-end: near-identical if-blocks in different files should land in a
// cross-file clone group at a permissive threshold.
#[test]
fn test_detect_duplicate_if_blocks() {
let source_a = r#"
function checkA(val) {
if (val > 0) {
let result = val * 2;
console.log(result);
save(result);
return result;
}
}
"#;
let source_b = r#"
function checkB(value) {
if (value > 0) {
let output = value * 2;
console.log(output);
save(output);
return output;
}
}
"#;
let detector = BlockCloneDetector::new(4, 0.3);
let groups = detector.detect_block_clones(&[
("a.js".to_string(), source_a.to_string()),
("b.js".to_string(), source_b.to_string()),
]);
assert!(
!groups.is_empty(),
"Should detect duplicate if-blocks across files"
);
let cross_file = groups.iter().any(|g| {
let files: HashSet<&str> = g.instances.iter().map(|i| i.file.as_str()).collect();
files.len() >= 2
});
assert!(
cross_file,
"Should have at least one cross-file clone group"
);
}
// Two groups adjacent per-file (lines 5-10 and 11-16) should merge into one
// group spanning 5-16, keeping the minimum similarity.
#[test]
fn test_merge_adjacent_blocks() {
let detector = BlockCloneDetector::with_defaults();
let block_a1 = CodeBlock {
file: "a.py".to_string(),
function_name: Some("foo".to_string()),
start_line: 5,
end_line: 10,
content: "if x > 0:\n do_a()\n do_b()\n do_c()\n do_d()\n do_e()"
.to_string(),
block_hash: 111,
};
let block_a2 = CodeBlock {
file: "b.py".to_string(),
function_name: Some("bar".to_string()),
start_line: 5,
end_line: 10,
content: "if y > 0:\n do_a()\n do_b()\n do_c()\n do_d()\n do_e()"
.to_string(),
block_hash: 222,
};
let block_b1 = CodeBlock {
file: "a.py".to_string(),
function_name: Some("foo".to_string()),
start_line: 11,
end_line: 16,
content: "for i in range(10):\n process(i)\n log(i)\n save(i)\n check(i)\n done(i)".to_string(),
block_hash: 333,
};
let block_b2 = CodeBlock {
file: "b.py".to_string(),
function_name: Some("bar".to_string()),
start_line: 11,
end_line: 16,
content: "for j in range(10):\n process(j)\n log(j)\n save(j)\n check(j)\n done(j)".to_string(),
block_hash: 444,
};
let mut groups = vec![
BlockCloneGroup {
block_type: CloneType::Type3,
instances: vec![block_a1, block_a2],
similarity: 0.8,
},
BlockCloneGroup {
block_type: CloneType::Type3,
instances: vec![block_b1, block_b2],
similarity: 0.7,
},
];
detector.merge_adjacent_blocks(&mut groups);
assert_eq!(
groups.len(),
1,
"Adjacent clone groups should be merged into one, got {}",
groups.len()
);
let merged = &groups[0];
for inst in &merged.instances {
assert_eq!(inst.start_line, 5, "Merged start should be 5");
assert_eq!(inst.end_line, 16, "Merged end should be 16");
}
assert!(
(merged.similarity - 0.7).abs() < f64::EPSILON,
"Merged similarity should be min(0.8, 0.7) = 0.7, got {}",
merged.similarity
);
}
// Blocks shorter than min_block_lines must be dropped.
#[test]
fn test_blocks_below_min_lines_filtered() {
let source = "if x:\n pass\n";
let detector = BlockCloneDetector::with_defaults(); let blocks = detector.extract_blocks(source, "test.py");
assert!(
blocks.is_empty(),
"2-line if block should be filtered out (min is 4)"
);
}
// Hashing is a pure function of content: two extractions agree.
#[test]
fn test_block_hash_deterministic() {
let source = "for i in range(10):\n process(i)\n log(i)\n save(i)\n done(i)\n";
let detector = BlockCloneDetector::with_defaults();
let blocks_a = detector.extract_blocks(source, "test.py");
let blocks_b = detector.extract_blocks(source, "test.py");
assert!(!blocks_a.is_empty());
assert_eq!(
blocks_a[0].block_hash, blocks_b[0].block_hash,
"Same content should produce same hash"
);
}
// The upward scan should attribute a nested block to its enclosing def.
#[test]
fn test_enclosing_function_detected() {
let source = r#"def outer():
x = 1
if x > 0:
do_something()
do_more()
finish()
cleanup()
"#;
let detector = BlockCloneDetector::with_defaults();
let blocks = detector.extract_blocks(source, "test.py");
let if_block = blocks
.iter()
.find(|b| b.content.trim_start().starts_with("if x > 0"));
assert!(if_block.is_some(), "Should find the if block");
assert_eq!(
if_block.unwrap().function_name.as_deref(),
Some("outer"),
"Should detect 'outer' as the enclosing function"
);
}
// Negative case: dissimilar blocks must not group at a strict threshold.
#[test]
fn test_no_clones_in_dissimilar_blocks() {
let source_a = r#"
function alpha() {
if (true) {
doAlpha();
alphaStep2();
alphaStep3();
alphaStep4();
}
}
"#;
let source_b = r#"
function beta() {
for (let i = 0; i < 100; i++) {
doBeta(i);
betaStep2(i);
betaStep3(i);
betaStep4(i);
}
}
"#;
let detector = BlockCloneDetector::new(4, 0.8);
let groups = detector.detect_block_clones(&[
("a.js".to_string(), source_a.to_string()),
("b.js".to_string(), source_b.to_string()),
]);
let cross_file = groups.iter().any(|g| {
let files: HashSet<&str> = g.instances.iter().map(|i| i.file.as_str()).collect();
files.len() >= 2
});
assert!(
!cross_file,
"Dissimilar blocks should not be detected as clones at 0.8 threshold"
);
}
// With cfg=None, the CFG entry point must be a pure pass-through to the
// textual extractor.
#[test]
fn test_extract_blocks_from_cfg_none_falls_back() {
let source = r#"
def process(items):
total = 0
for item in items:
if item > 0:
total += item
count += 1
log(item)
else:
skip(item)
log_skip(item)
count_skip += 1
while total > 100:
total -= 10
adjust(total)
log(total)
record(total)
return total
"#;
let detector = BlockCloneDetector::with_defaults();
let blocks_text = detector.extract_blocks(source, "test.py");
let blocks_cfg = detector.extract_blocks_from_cfg(source, "test.py", None);
assert_eq!(
blocks_text.len(),
blocks_cfg.len(),
"CFG=None should produce identical results to text-based extraction"
);
for (a, b) in blocks_text.iter().zip(blocks_cfg.iter()) {
assert_eq!(a.start_line, b.start_line);
assert_eq!(a.end_line, b.end_line);
assert_eq!(a.block_hash, b.block_hash);
}
}
// A hand-built CFG with one body block: extraction should yield a block
// carrying the CFG's function name.
#[test]
fn test_extract_blocks_from_cfg_with_cfg() {
use crate::core::SourceSpan;
use crate::graph::ControlFlowGraph;
let mut cfg = ControlFlowGraph::new("test_func");
let entry = cfg.create_entry();
let block_a = cfg.create_block("body");
let exit = cfg.create_exit();
cfg.add_edge(entry, block_a, crate::graph::CfgEdgeKind::FallThrough);
cfg.add_edge(block_a, exit, crate::graph::CfgEdgeKind::Return);
let source = "def test_func():\n x = 1\n y = 2\n z = 3\n w = 4\n return x + y + z + w\n";
// Span 17..74 covers the function body lines (byte offsets into `source`).
cfg.add_statement(block_a, SourceSpan::new(17, 74));
let detector = BlockCloneDetector::new(3, 0.6);
let blocks = detector.extract_blocks_from_cfg(source, "test.py", Some(&cfg));
assert!(
!blocks.is_empty(),
"Should extract at least one block from CFG"
);
let block = &blocks[0];
assert_eq!(
block.function_name.as_deref(),
Some("test_func"),
"CFG-extracted block should carry function name"
);
}
// CFG path must also honor the min_block_lines filter.
#[test]
fn test_extract_blocks_from_cfg_filters_small_blocks() {
use crate::core::SourceSpan;
use crate::graph::ControlFlowGraph;
let mut cfg = ControlFlowGraph::new("small_func");
let entry = cfg.create_entry();
let block_a = cfg.create_block("tiny");
let exit = cfg.create_exit();
cfg.add_edge(entry, block_a, crate::graph::CfgEdgeKind::FallThrough);
cfg.add_edge(block_a, exit, crate::graph::CfgEdgeKind::Return);
let source = "def small():\n return 1\n";
cfg.add_statement(block_a, SourceSpan::new(13, 26));
let detector = BlockCloneDetector::with_defaults(); let blocks = detector.extract_blocks_from_cfg(source, "test.py", Some(&cfg));
assert!(
blocks.is_empty(),
"Blocks smaller than min_block_lines should be filtered out"
);
}
// Spot-checks for the byte-offset → 1-based line conversion.
#[test]
fn test_byte_offset_to_line() {
let source = "line1\nline2\nline3\n";
assert_eq!(byte_offset_to_line(source, 0), 1); assert_eq!(byte_offset_to_line(source, 5), 1); assert_eq!(byte_offset_to_line(source, 6), 2); assert_eq!(byte_offset_to_line(source, 12), 3); }
}