kiss/duplication/
extraction.rs1use crate::minhash::normalize_code;
2use crate::parsing::ParsedFile;
3use crate::rust_parsing::ParsedRustFile;
4use crate::units::get_child_by_field;
5use rayon::prelude::*;
6use std::path::{Path, PathBuf};
7use syn::{ImplItem, Item};
8use tree_sitter::Node;
9
/// Minimum number of whitespace-separated tokens a normalized chunk must
/// contain to be kept.
const MIN_CHUNK_TOKENS: usize = 10;
/// Minimum number of source lines a chunk must span to be kept.
const MIN_CHUNK_LINES: usize = 5;

/// A function-sized piece of source code considered for duplicate detection.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Path of the file the chunk was extracted from.
    pub file: PathBuf,
    /// Identifier of the function or method the chunk belongs to.
    pub name: String,
    /// First line of the chunk (1-based).
    pub start_line: usize,
    /// Last line of the chunk (1-based, inclusive).
    pub end_line: usize,
    /// Chunk text after normalization.
    pub normalized: String,
}

/// Returns `true` when a chunk is large enough to be worth comparing.
///
/// A chunk qualifies only if it spans at least `MIN_CHUNK_LINES` lines *and*
/// its normalized text holds at least `MIN_CHUNK_TOKENS` whitespace-separated
/// tokens.
pub(crate) fn is_nontrivial_chunk(normalized: &str, line_count: usize) -> bool {
    if line_count < MIN_CHUNK_LINES {
        return false;
    }
    let token_count = normalized.split_whitespace().count();
    token_count >= MIN_CHUNK_TOKENS
}
25
26#[must_use]
27pub fn extract_chunks_for_duplication(parsed_files: &[&ParsedFile]) -> Vec<CodeChunk> {
28 let mut per_file: Vec<(usize, Vec<CodeChunk>)> = parsed_files
32 .par_iter()
33 .enumerate()
34 .map(|(idx, parsed)| {
35 let mut chunks = Vec::new();
36 extract_function_chunks(
37 parsed.tree.root_node(),
38 &parsed.source,
39 &parsed.path,
40 &mut chunks,
41 );
42 (idx, chunks)
43 })
44 .collect();
45 per_file.sort_by_key(|(idx, _)| *idx);
46 let total: usize = per_file.iter().map(|(_, v)| v.len()).sum();
47 let mut out = Vec::with_capacity(total);
48 for (_, mut v) in per_file {
49 out.append(&mut v);
50 }
51 out
52}
53
54#[must_use]
55pub fn extract_rust_chunks_for_duplication(parsed_files: &[&ParsedRustFile]) -> Vec<CodeChunk> {
56 let mut chunks = Vec::new();
59 for parsed in parsed_files {
60 extract_rust_function_chunks(&parsed.ast, &parsed.source, &parsed.path, &mut chunks);
61 }
62 chunks
63}
64
65pub(super) fn extract_rust_function_chunks(
66 ast: &syn::File,
67 source: &str,
68 file: &Path,
69 chunks: &mut Vec<CodeChunk>,
70) {
71 for item in &ast.items {
72 extract_chunks_from_item(item, source, file, chunks);
73 }
74}
75
76pub(super) fn extract_chunks_from_item(
77 item: &Item,
78 source: &str,
79 file: &Path,
80 chunks: &mut Vec<CodeChunk>,
81) {
82 match item {
83 Item::Fn(func) => {
84 let start = func.sig.fn_token.span.start().line;
85 let end = func.block.brace_token.span.close().end().line;
86 add_rust_function_chunk(
87 &func.sig.ident.to_string(),
88 start,
89 end,
90 source,
91 file,
92 chunks,
93 );
94 }
95 Item::Impl(impl_block) => {
96 for impl_item in &impl_block.items {
97 if let ImplItem::Fn(method) = impl_item {
98 let start = method.sig.fn_token.span.start().line;
99 let end = method.block.brace_token.span.close().end().line;
100 add_rust_function_chunk(
101 &method.sig.ident.to_string(),
102 start,
103 end,
104 source,
105 file,
106 chunks,
107 );
108 }
109 }
110 }
111 Item::Mod(m) => {
112 if let Some((_, items)) = &m.content {
113 for item in items {
114 extract_chunks_from_item(item, source, file, chunks);
115 }
116 }
117 }
118 _ => {}
119 }
120}
121
122pub(super) fn add_rust_function_chunk(
123 name: &str,
124 start_line: usize,
125 end_line: usize,
126 source: &str,
127 file: &Path,
128 chunks: &mut Vec<CodeChunk>,
129) {
130 let line_count = end_line.saturating_sub(start_line) + 1;
131 let lines: Vec<&str> = source.lines().collect();
132 if start_line > 0 && end_line <= lines.len() {
133 let body_text: String = lines[start_line - 1..end_line].join("\n");
134 let normalized = normalize_code(&body_text);
135 if is_nontrivial_chunk(&normalized, line_count) {
136 chunks.push(CodeChunk {
137 file: file.to_path_buf(),
138 name: name.to_string(),
139 start_line,
140 end_line,
141 normalized,
142 });
143 }
144 }
145}
146
147pub(super) fn extract_function_chunks(
148 node: Node,
149 source: &str,
150 file: &Path,
151 chunks: &mut Vec<CodeChunk>,
152) {
153 match node.kind() {
154 "function_definition" | "async_function_definition" => {
155 let name = get_child_by_field(node, "name", source).unwrap_or_default();
156 let (start_line, end_line) =
157 (node.start_position().row + 1, node.end_position().row + 1);
158 let line_count = end_line.saturating_sub(start_line) + 1;
159 if let Some(body) = node.child_by_field_name("body") {
160 let normalized = normalize_code(&source[body.start_byte()..body.end_byte()]);
161 if is_nontrivial_chunk(&normalized, line_count) {
162 chunks.push(CodeChunk {
163 file: file.to_path_buf(),
164 name,
165 start_line,
166 end_line,
167 normalized,
168 });
169 }
170 }
171 let mut cursor = node.walk();
172 for child in node.children(&mut cursor) {
173 extract_function_chunks(child, source, file, chunks);
174 }
175 }
176 _ => {
177 let mut cursor = node.walk();
178 for child in node.children(&mut cursor) {
179 extract_function_chunks(child, source, file, chunks);
180 }
181 }
182 }
183}