1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::{
6 DuplicateGroup, DuplicateReport, DuplicationDensityReport, ImportEdge, ImportReport,
7 ModuleDuplicationDensityRow, TodoReport, TodoTagRow,
8};
9use tokmd_types::{ExportData, FileKind, FileRow};
10
11use tokmd_analysis_util::normalize_path;
12use tokmd_math::round_f64;
13
/// Per-file read cap (128 KiB) used whenever `ContentLimits::max_file_bytes` is unset.
const DEFAULT_MAX_FILE_BYTES: u64 = 128 * 1024;
/// Maximum number of leading lines scanned per file when parsing imports.
const IMPORT_MAX_LINES: usize = 200;
16
/// Level at which import edges are aggregated in [`build_import_report`].
#[derive(Debug, Clone, Copy)]
pub enum ImportGranularity {
    /// Edge sources are module names; imports from files in the same module collapse.
    Module,
    /// Edge sources are individual file paths.
    File,
}
22
/// Byte budgets bounding how much file content the report builders read.
#[derive(Debug, Clone, Copy, Default)]
pub struct ContentLimits {
    /// Total byte budget across all scanned files; scanning stops once
    /// this many bytes have been consumed. `None` means unlimited.
    pub max_bytes: Option<u64>,
    /// Per-file byte cap. Used as a read limit by the TODO/import scans
    /// (falling back to `DEFAULT_MAX_FILE_BYTES` when `None`) and as a
    /// skip-if-larger size bound by duplicate detection.
    pub max_file_bytes: Option<u64>,
}
28
29pub fn build_todo_report(
30 root: &Path,
31 files: &[PathBuf],
32 limits: &ContentLimits,
33 total_code: usize,
34) -> Result<TodoReport> {
35 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
36 let tags = ["TODO", "FIXME", "HACK", "XXX"];
37 let mut total_bytes = 0u64;
38 let max_total = limits.max_bytes;
39 let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;
40
41 for rel in files {
42 if max_total.is_some_and(|limit| total_bytes >= limit) {
43 break;
44 }
45 let path = root.join(rel);
46 let bytes = tokmd_content::read_head(&path, per_file_limit)?;
47 total_bytes += bytes.len() as u64;
48 if !tokmd_content::is_text_like(&bytes) {
49 continue;
50 }
51 let text = String::from_utf8_lossy(&bytes);
52 for (tag, count) in tokmd_content::count_tags(&text, &tags) {
53 *counts.entry(tag).or_insert(0) += count;
54 }
55 }
56
57 let total: usize = counts.values().sum();
58 let kloc = if total_code == 0 {
59 0.0
60 } else {
61 total_code as f64 / 1000.0
62 };
63 let density = if kloc == 0.0 {
64 0.0
65 } else {
66 round_f64(total as f64 / kloc, 2)
67 };
68
69 let tags = counts
70 .into_iter()
71 .map(|(tag, count)| TodoTagRow { tag, count })
72 .collect();
73
74 Ok(TodoReport {
75 total,
76 density_per_kloc: density,
77 tags,
78 })
79}
80
81pub fn build_duplicate_report(
82 root: &Path,
83 files: &[PathBuf],
84 export: &ExportData,
85 limits: &ContentLimits,
86) -> Result<DuplicateReport> {
87 let mut by_size: BTreeMap<u64, Vec<PathBuf>> = BTreeMap::new();
88 let size_limit = limits.max_file_bytes;
89
90 for rel in files {
91 let size = std::fs::metadata(root.join(rel))
92 .map(|m| m.len())
93 .unwrap_or(0);
94 if size_limit.is_some_and(|limit| size > limit) {
95 continue;
96 }
97 by_size.entry(size).or_default().push(rel.clone());
98 }
99
100 let mut path_to_module: BTreeMap<String, String> = BTreeMap::new();
101 let mut module_bytes: BTreeMap<String, u64> = BTreeMap::new();
102 for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
103 let normalized = normalize_path(&row.path, root);
104 path_to_module.insert(normalized, row.module.clone());
105 if let Some(val) = module_bytes.get_mut(&row.module) {
106 *val += row.bytes as u64;
107 } else {
108 module_bytes.insert(row.module.clone(), row.bytes as u64);
109 }
110 }
111
112 let mut groups: Vec<DuplicateGroup> = Vec::new();
113 let mut wasted_bytes = 0u64;
114 let mut duplicate_files = 0usize;
115 let mut duplicated_bytes = 0u64;
116
117 let mut module_duplicate_files: BTreeMap<String, usize> = BTreeMap::new();
118 let mut module_wasted_files: BTreeMap<String, usize> = BTreeMap::new();
119 let mut module_duplicated_bytes: BTreeMap<String, u64> = BTreeMap::new();
120 let mut module_wasted_bytes: BTreeMap<String, u64> = BTreeMap::new();
121
122 for (size, paths) in by_size {
123 if paths.len() < 2 || size == 0 {
124 continue;
125 }
126 let mut by_hash: BTreeMap<String, Vec<String>> = BTreeMap::new();
127 for rel in paths {
128 let path = root.join(&rel);
129 if let Ok(hash) = hash_file_full(&path) {
130 by_hash
131 .entry(hash)
132 .or_default()
133 .push(rel.to_string_lossy().replace('\\', "/"));
134 }
135 }
136 for (hash, mut files) in by_hash {
137 if files.len() < 2 {
138 continue;
139 }
140 files.sort();
141 wasted_bytes += (files.len() as u64 - 1) * size;
142
143 for (idx, file) in files.iter().enumerate() {
144 let module = path_to_module
145 .get(file)
146 .cloned()
147 .unwrap_or_else(|| "(unknown)".to_string());
148 if let Some(val) = module_duplicate_files.get_mut(&module) {
149 *val += 1;
150 } else {
151 module_duplicate_files.insert(module.clone(), 1);
152 }
153 if let Some(val) = module_duplicated_bytes.get_mut(&module) {
154 *val += size;
155 } else {
156 module_duplicated_bytes.insert(module.clone(), size);
157 }
158 duplicate_files += 1;
159 duplicated_bytes += size;
160
161 if idx > 0 {
162 if let Some(val) = module_wasted_files.get_mut(&module) {
163 *val += 1;
164 } else {
165 module_wasted_files.insert(module.clone(), 1);
166 }
167 if let Some(val) = module_wasted_bytes.get_mut(&module) {
168 *val += size;
169 } else {
170 module_wasted_bytes.insert(module.clone(), size);
171 }
172 }
173 }
174
175 groups.push(DuplicateGroup {
176 hash,
177 bytes: size,
178 files,
179 });
180 }
181 }
182
183 groups.sort_by(|a, b| b.bytes.cmp(&a.bytes).then_with(|| a.hash.cmp(&b.hash)));
184
185 let mut modules: BTreeSet<String> = BTreeSet::new();
186 modules.extend(module_duplicate_files.keys().cloned());
187 modules.extend(module_wasted_files.keys().cloned());
188
189 let mut by_module: Vec<ModuleDuplicationDensityRow> = modules
190 .into_iter()
191 .map(|module| {
192 let duplicate_files = module_duplicate_files.get(&module).copied().unwrap_or(0);
193 let wasted_files = module_wasted_files.get(&module).copied().unwrap_or(0);
194 let duplicated_bytes = module_duplicated_bytes.get(&module).copied().unwrap_or(0);
195 let wasted_bytes = module_wasted_bytes.get(&module).copied().unwrap_or(0);
196 let module_total = module_bytes.get(&module).copied().unwrap_or(0);
197 let density = if module_total == 0 {
198 0.0
199 } else {
200 round_f64(wasted_bytes as f64 / module_total as f64, 4)
201 };
202 ModuleDuplicationDensityRow {
203 module,
204 duplicate_files,
205 wasted_files,
206 duplicated_bytes,
207 wasted_bytes,
208 module_bytes: module_total,
209 density,
210 }
211 })
212 .collect();
213 by_module.sort_by(|a, b| {
214 b.wasted_bytes
215 .cmp(&a.wasted_bytes)
216 .then_with(|| a.module.cmp(&b.module))
217 });
218
219 let total_codebase_bytes: u64 = module_bytes.values().sum();
220 let wasted_pct_of_codebase = if total_codebase_bytes == 0 {
221 0.0
222 } else {
223 round_f64(wasted_bytes as f64 / total_codebase_bytes as f64, 4)
224 };
225 let density = DuplicationDensityReport {
226 duplicate_groups: groups.len(),
227 duplicate_files,
228 duplicated_bytes,
229 wasted_bytes,
230 wasted_pct_of_codebase,
231 by_module,
232 };
233
234 Ok(DuplicateReport {
235 groups,
236 wasted_bytes,
237 strategy: "exact-blake3".to_string(),
238 density: Some(density),
239 near: None,
240 })
241}
242
243pub fn build_import_report(
244 root: &Path,
245 files: &[PathBuf],
246 export: &ExportData,
247 granularity: ImportGranularity,
248 limits: &ContentLimits,
249) -> Result<ImportReport> {
250 let mut map: BTreeMap<String, &FileRow> = BTreeMap::new();
251 for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
252 let key = normalize_path(&row.path, root);
253 map.insert(key, row);
254 }
255
256 let mut edges: BTreeMap<(&str, String), usize> = BTreeMap::new();
257 let mut total_bytes = 0u64;
258 let max_total = limits.max_bytes;
259 let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;
260
261 for rel in files {
262 if max_total.is_some_and(|limit| total_bytes >= limit) {
263 break;
264 }
265 let rel_str = rel.to_string_lossy().replace('\\', "/");
266 let row = match map.get(&rel_str) {
267 Some(r) => *r,
268 None => continue,
269 };
270 if !tokmd_analysis_imports::supports_language(&row.lang) {
271 continue;
272 }
273 let path = root.join(rel);
274 let lines = match tokmd_content::read_lines(&path, IMPORT_MAX_LINES, per_file_limit) {
275 Ok(lines) => lines,
276 Err(_) => continue,
277 };
278 total_bytes += lines.iter().map(|l| l.len() as u64).sum::<u64>();
279 let imports = tokmd_analysis_imports::parse_imports(&row.lang, &lines);
280 if imports.is_empty() {
281 continue;
282 }
283 let source = match granularity {
284 ImportGranularity::Module => row.module.as_str(),
285 ImportGranularity::File => row.path.as_str(),
286 };
287 for import in imports {
288 let target = tokmd_analysis_imports::normalize_import_target(&import);
289 let key = (source, target);
290 *edges.entry(key).or_insert(0) += 1;
291 }
292 }
293
294 let mut edge_rows: Vec<ImportEdge> = edges
295 .into_iter()
296 .map(|((from, to), count)| ImportEdge {
297 from: from.to_string(),
298 to,
299 count,
300 })
301 .collect();
302 edge_rows.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.from.cmp(&b.from)));
303
304 Ok(ImportReport {
305 granularity: match granularity {
306 ImportGranularity::Module => "module".to_string(),
307 ImportGranularity::File => "file".to_string(),
308 },
309 edges: edge_rows,
310 })
311}
312
313fn hash_file_full(path: &Path) -> Result<String> {
314 use std::io::Read;
315 let mut file = std::fs::File::open(path)?;
316 let mut hasher = blake3::Hasher::new();
317 let mut buf = [0u8; 8192];
318 loop {
319 let read = file.read(&mut buf)?;
320 if read == 0 {
321 break;
322 }
323 hasher.update(&buf[..read]);
324 }
325 Ok(hasher.finalize().to_hex().to_string())
326}