Skip to main content

kiss/duplication/
extraction.rs

1use crate::minhash::normalize_code;
2use crate::parsing::ParsedFile;
3use crate::rust_parsing::ParsedRustFile;
4use crate::units::get_child_by_field;
5use rayon::prelude::*;
6use std::path::{Path, PathBuf};
7use syn::{ImplItem, Item};
8use tree_sitter::Node;
9
/// Minimum number of whitespace-separated tokens a normalized chunk must contain
/// for `is_nontrivial_chunk` to accept it.
const MIN_CHUNK_TOKENS: usize = 10;
/// Minimum number of source lines a chunk must span for `is_nontrivial_chunk`
/// to accept it.
const MIN_CHUNK_LINES: usize = 5;
12
/// A function-sized piece of source code collected for duplication analysis.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Path of the file the chunk was extracted from.
    pub file: PathBuf,
    /// Name of the function or method the chunk belongs to.
    pub name: String,
    /// 1-based line on which the chunk starts.
    pub start_line: usize,
    /// 1-based line on which the chunk ends (inclusive).
    pub end_line: usize,
    /// Chunk text after being passed through `normalize_code`.
    pub normalized: String,
}
21
22pub(crate) fn is_nontrivial_chunk(normalized: &str, line_count: usize) -> bool {
23    line_count >= MIN_CHUNK_LINES && normalized.split_whitespace().count() >= MIN_CHUNK_TOKENS
24}
25
26#[must_use]
27pub fn extract_chunks_for_duplication(parsed_files: &[&ParsedFile]) -> Vec<CodeChunk> {
28    // Parallelize per-file extraction but preserve deterministic ordering:
29    // - files in input order
30    // - within each file, traversal order from `extract_function_chunks`
31    let mut per_file: Vec<(usize, Vec<CodeChunk>)> = parsed_files
32        .par_iter()
33        .enumerate()
34        .map(|(idx, parsed)| {
35            let mut chunks = Vec::new();
36            extract_function_chunks(
37                parsed.tree.root_node(),
38                &parsed.source,
39                &parsed.path,
40                &mut chunks,
41            );
42            (idx, chunks)
43        })
44        .collect();
45    per_file.sort_by_key(|(idx, _)| *idx);
46    let total: usize = per_file.iter().map(|(_, v)| v.len()).sum();
47    let mut out = Vec::with_capacity(total);
48    for (_, mut v) in per_file {
49        out.append(&mut v);
50    }
51    out
52}
53
54#[must_use]
55pub fn extract_rust_chunks_for_duplication(parsed_files: &[&ParsedRustFile]) -> Vec<CodeChunk> {
56    // Rust AST (`syn::File`) is not Send/Sync, so we keep this sequential.
57    // Ordering is naturally stable by input order.
58    let mut chunks = Vec::new();
59    for parsed in parsed_files {
60        extract_rust_function_chunks(&parsed.ast, &parsed.source, &parsed.path, &mut chunks);
61    }
62    chunks
63}
64
65pub(super) fn extract_rust_function_chunks(
66    ast: &syn::File,
67    source: &str,
68    file: &Path,
69    chunks: &mut Vec<CodeChunk>,
70) {
71    for item in &ast.items {
72        extract_chunks_from_item(item, source, file, chunks);
73    }
74}
75
76pub(super) fn extract_chunks_from_item(
77    item: &Item,
78    source: &str,
79    file: &Path,
80    chunks: &mut Vec<CodeChunk>,
81) {
82    match item {
83        Item::Fn(func) => {
84            let start = func.sig.fn_token.span.start().line;
85            let end = func.block.brace_token.span.close().end().line;
86            add_rust_function_chunk(
87                &func.sig.ident.to_string(),
88                start,
89                end,
90                source,
91                file,
92                chunks,
93            );
94        }
95        Item::Impl(impl_block) => {
96            for impl_item in &impl_block.items {
97                if let ImplItem::Fn(method) = impl_item {
98                    let start = method.sig.fn_token.span.start().line;
99                    let end = method.block.brace_token.span.close().end().line;
100                    add_rust_function_chunk(
101                        &method.sig.ident.to_string(),
102                        start,
103                        end,
104                        source,
105                        file,
106                        chunks,
107                    );
108                }
109            }
110        }
111        Item::Mod(m) => {
112            if let Some((_, items)) = &m.content {
113                for item in items {
114                    extract_chunks_from_item(item, source, file, chunks);
115                }
116            }
117        }
118        _ => {}
119    }
120}
121
122pub(super) fn add_rust_function_chunk(
123    name: &str,
124    start_line: usize,
125    end_line: usize,
126    source: &str,
127    file: &Path,
128    chunks: &mut Vec<CodeChunk>,
129) {
130    let line_count = end_line.saturating_sub(start_line) + 1;
131    let lines: Vec<&str> = source.lines().collect();
132    if start_line > 0 && end_line <= lines.len() {
133        let body_text: String = lines[start_line - 1..end_line].join("\n");
134        let normalized = normalize_code(&body_text);
135        if is_nontrivial_chunk(&normalized, line_count) {
136            chunks.push(CodeChunk {
137                file: file.to_path_buf(),
138                name: name.to_string(),
139                start_line,
140                end_line,
141                normalized,
142            });
143        }
144    }
145}
146
147pub(super) fn extract_function_chunks(
148    node: Node,
149    source: &str,
150    file: &Path,
151    chunks: &mut Vec<CodeChunk>,
152) {
153    match node.kind() {
154        "function_definition" | "async_function_definition" => {
155            let name = get_child_by_field(node, "name", source).unwrap_or_default();
156            let (start_line, end_line) =
157                (node.start_position().row + 1, node.end_position().row + 1);
158            let line_count = end_line.saturating_sub(start_line) + 1;
159            if let Some(body) = node.child_by_field_name("body") {
160                let normalized = normalize_code(&source[body.start_byte()..body.end_byte()]);
161                if is_nontrivial_chunk(&normalized, line_count) {
162                    chunks.push(CodeChunk {
163                        file: file.to_path_buf(),
164                        name,
165                        start_line,
166                        end_line,
167                        normalized,
168                    });
169                }
170            }
171            let mut cursor = node.walk();
172            for child in node.children(&mut cursor) {
173                extract_function_chunks(child, source, file, chunks);
174            }
175        }
176        _ => {
177            let mut cursor = node.walk();
178            for child in node.children(&mut cursor) {
179                extract_function_chunks(child, source, file, chunks);
180            }
181        }
182    }
183}