//! tokmd_analysis_content — content.rs
//!
//! Content-based analysis reports: TODO-tag counting, exact-duplicate
//! detection (BLAKE3), and import-graph extraction.
1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::{
6    DuplicateGroup, DuplicateReport, DuplicationDensityReport, ImportEdge, ImportReport,
7    ModuleDuplicationDensityRow, TodoReport, TodoTagRow,
8};
9use tokmd_types::{ExportData, FileKind, FileRow};
10
11use tokmd_analysis_util::normalize_path;
12use tokmd_math::round_f64;
13
/// Per-file read cap (128 KiB) used when `ContentLimits::max_file_bytes` is unset.
const DEFAULT_MAX_FILE_BYTES: u64 = 128 * 1024;
/// Maximum number of leading lines scanned per file when parsing imports.
const IMPORT_MAX_LINES: usize = 200;
16
/// Aggregation level for import edges in [`build_import_report`].
#[derive(Debug, Clone, Copy)]
pub enum ImportGranularity {
    /// One edge per (module, import target) pair.
    Module,
    /// One edge per (file path, import target) pair.
    File,
}
22
/// Byte budgets bounding how much file content the analyses read.
#[derive(Debug, Clone, Copy, Default)]
pub struct ContentLimits {
    /// Total read budget across all scanned files; `None` means unlimited.
    pub max_bytes: Option<u64>,
    /// Per-file cap. Readers fall back to `DEFAULT_MAX_FILE_BYTES` when
    /// `None`; duplicate detection instead skips files larger than this.
    pub max_file_bytes: Option<u64>,
}
28
29pub fn build_todo_report(
30    root: &Path,
31    files: &[PathBuf],
32    limits: &ContentLimits,
33    total_code: usize,
34) -> Result<TodoReport> {
35    let mut counts: BTreeMap<String, usize> = BTreeMap::new();
36    let tags = ["TODO", "FIXME", "HACK", "XXX"];
37    let mut total_bytes = 0u64;
38    let max_total = limits.max_bytes;
39    let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;
40
41    for rel in files {
42        if max_total.is_some_and(|limit| total_bytes >= limit) {
43            break;
44        }
45        let path = root.join(rel);
46        let bytes = tokmd_content::read_head(&path, per_file_limit)?;
47        total_bytes += bytes.len() as u64;
48        if !tokmd_content::is_text_like(&bytes) {
49            continue;
50        }
51        let text = String::from_utf8_lossy(&bytes);
52        for (tag, count) in tokmd_content::count_tags(&text, &tags) {
53            *counts.entry(tag).or_insert(0) += count;
54        }
55    }
56
57    let total: usize = counts.values().sum();
58    let kloc = if total_code == 0 {
59        0.0
60    } else {
61        total_code as f64 / 1000.0
62    };
63    let density = if kloc == 0.0 {
64        0.0
65    } else {
66        round_f64(total as f64 / kloc, 2)
67    };
68
69    let tags = counts
70        .into_iter()
71        .map(|(tag, count)| TodoTagRow { tag, count })
72        .collect();
73
74    Ok(TodoReport {
75        total,
76        density_per_kloc: density,
77        tags,
78    })
79}
80
81pub fn build_duplicate_report(
82    root: &Path,
83    files: &[PathBuf],
84    export: &ExportData,
85    limits: &ContentLimits,
86) -> Result<DuplicateReport> {
87    let mut by_size: BTreeMap<u64, Vec<PathBuf>> = BTreeMap::new();
88    let size_limit = limits.max_file_bytes;
89
90    for rel in files {
91        let size = std::fs::metadata(root.join(rel))
92            .map(|m| m.len())
93            .unwrap_or(0);
94        if size_limit.is_some_and(|limit| size > limit) {
95            continue;
96        }
97        by_size.entry(size).or_default().push(rel.clone());
98    }
99
100    let mut path_to_module: BTreeMap<String, String> = BTreeMap::new();
101    let mut module_bytes: BTreeMap<String, u64> = BTreeMap::new();
102    for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
103        let normalized = normalize_path(&row.path, root);
104        path_to_module.insert(normalized, row.module.clone());
105        if let Some(val) = module_bytes.get_mut(&row.module) {
106            *val += row.bytes as u64;
107        } else {
108            module_bytes.insert(row.module.clone(), row.bytes as u64);
109        }
110    }
111
112    let mut groups: Vec<DuplicateGroup> = Vec::new();
113    let mut wasted_bytes = 0u64;
114    let mut duplicate_files = 0usize;
115    let mut duplicated_bytes = 0u64;
116
117    let mut module_duplicate_files: BTreeMap<String, usize> = BTreeMap::new();
118    let mut module_wasted_files: BTreeMap<String, usize> = BTreeMap::new();
119    let mut module_duplicated_bytes: BTreeMap<String, u64> = BTreeMap::new();
120    let mut module_wasted_bytes: BTreeMap<String, u64> = BTreeMap::new();
121
122    for (size, paths) in by_size {
123        if paths.len() < 2 || size == 0 {
124            continue;
125        }
126        let mut by_hash: BTreeMap<String, Vec<String>> = BTreeMap::new();
127        for rel in paths {
128            let path = root.join(&rel);
129            if let Ok(hash) = hash_file_full(&path) {
130                by_hash
131                    .entry(hash)
132                    .or_default()
133                    .push(rel.to_string_lossy().replace('\\', "/"));
134            }
135        }
136        for (hash, mut files) in by_hash {
137            if files.len() < 2 {
138                continue;
139            }
140            files.sort();
141            wasted_bytes += (files.len() as u64 - 1) * size;
142
143            for (idx, file) in files.iter().enumerate() {
144                let module = path_to_module
145                    .get(file)
146                    .cloned()
147                    .unwrap_or_else(|| "(unknown)".to_string());
148                if let Some(val) = module_duplicate_files.get_mut(&module) {
149                    *val += 1;
150                } else {
151                    module_duplicate_files.insert(module.clone(), 1);
152                }
153                if let Some(val) = module_duplicated_bytes.get_mut(&module) {
154                    *val += size;
155                } else {
156                    module_duplicated_bytes.insert(module.clone(), size);
157                }
158                duplicate_files += 1;
159                duplicated_bytes += size;
160
161                if idx > 0 {
162                    if let Some(val) = module_wasted_files.get_mut(&module) {
163                        *val += 1;
164                    } else {
165                        module_wasted_files.insert(module.clone(), 1);
166                    }
167                    if let Some(val) = module_wasted_bytes.get_mut(&module) {
168                        *val += size;
169                    } else {
170                        module_wasted_bytes.insert(module.clone(), size);
171                    }
172                }
173            }
174
175            groups.push(DuplicateGroup {
176                hash,
177                bytes: size,
178                files,
179            });
180        }
181    }
182
183    groups.sort_by(|a, b| b.bytes.cmp(&a.bytes).then_with(|| a.hash.cmp(&b.hash)));
184
185    let mut modules: BTreeSet<String> = BTreeSet::new();
186    modules.extend(module_duplicate_files.keys().cloned());
187    modules.extend(module_wasted_files.keys().cloned());
188
189    let mut by_module: Vec<ModuleDuplicationDensityRow> = modules
190        .into_iter()
191        .map(|module| {
192            let duplicate_files = module_duplicate_files.get(&module).copied().unwrap_or(0);
193            let wasted_files = module_wasted_files.get(&module).copied().unwrap_or(0);
194            let duplicated_bytes = module_duplicated_bytes.get(&module).copied().unwrap_or(0);
195            let wasted_bytes = module_wasted_bytes.get(&module).copied().unwrap_or(0);
196            let module_total = module_bytes.get(&module).copied().unwrap_or(0);
197            let density = if module_total == 0 {
198                0.0
199            } else {
200                round_f64(wasted_bytes as f64 / module_total as f64, 4)
201            };
202            ModuleDuplicationDensityRow {
203                module,
204                duplicate_files,
205                wasted_files,
206                duplicated_bytes,
207                wasted_bytes,
208                module_bytes: module_total,
209                density,
210            }
211        })
212        .collect();
213    by_module.sort_by(|a, b| {
214        b.wasted_bytes
215            .cmp(&a.wasted_bytes)
216            .then_with(|| a.module.cmp(&b.module))
217    });
218
219    let total_codebase_bytes: u64 = module_bytes.values().sum();
220    let wasted_pct_of_codebase = if total_codebase_bytes == 0 {
221        0.0
222    } else {
223        round_f64(wasted_bytes as f64 / total_codebase_bytes as f64, 4)
224    };
225    let density = DuplicationDensityReport {
226        duplicate_groups: groups.len(),
227        duplicate_files,
228        duplicated_bytes,
229        wasted_bytes,
230        wasted_pct_of_codebase,
231        by_module,
232    };
233
234    Ok(DuplicateReport {
235        groups,
236        wasted_bytes,
237        strategy: "exact-blake3".to_string(),
238        density: Some(density),
239        near: None,
240    })
241}
242
243pub fn build_import_report(
244    root: &Path,
245    files: &[PathBuf],
246    export: &ExportData,
247    granularity: ImportGranularity,
248    limits: &ContentLimits,
249) -> Result<ImportReport> {
250    let mut map: BTreeMap<String, &FileRow> = BTreeMap::new();
251    for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
252        let key = normalize_path(&row.path, root);
253        map.insert(key, row);
254    }
255
256    let mut edges: BTreeMap<(&str, String), usize> = BTreeMap::new();
257    let mut total_bytes = 0u64;
258    let max_total = limits.max_bytes;
259    let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;
260
261    for rel in files {
262        if max_total.is_some_and(|limit| total_bytes >= limit) {
263            break;
264        }
265        let rel_str = rel.to_string_lossy().replace('\\', "/");
266        let row = match map.get(&rel_str) {
267            Some(r) => *r,
268            None => continue,
269        };
270        if !tokmd_analysis_imports::supports_language(&row.lang) {
271            continue;
272        }
273        let path = root.join(rel);
274        let lines = match tokmd_content::read_lines(&path, IMPORT_MAX_LINES, per_file_limit) {
275            Ok(lines) => lines,
276            Err(_) => continue,
277        };
278        total_bytes += lines.iter().map(|l| l.len() as u64).sum::<u64>();
279        let imports = tokmd_analysis_imports::parse_imports(&row.lang, &lines);
280        if imports.is_empty() {
281            continue;
282        }
283        let source = match granularity {
284            ImportGranularity::Module => row.module.as_str(),
285            ImportGranularity::File => row.path.as_str(),
286        };
287        for import in imports {
288            let target = tokmd_analysis_imports::normalize_import_target(&import);
289            let key = (source, target);
290            *edges.entry(key).or_insert(0) += 1;
291        }
292    }
293
294    let mut edge_rows: Vec<ImportEdge> = edges
295        .into_iter()
296        .map(|((from, to), count)| ImportEdge {
297            from: from.to_string(),
298            to,
299            count,
300        })
301        .collect();
302    edge_rows.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.from.cmp(&b.from)));
303
304    Ok(ImportReport {
305        granularity: match granularity {
306            ImportGranularity::Module => "module".to_string(),
307            ImportGranularity::File => "file".to_string(),
308        },
309        edges: edge_rows,
310    })
311}
312
313fn hash_file_full(path: &Path) -> Result<String> {
314    use std::io::Read;
315    let mut file = std::fs::File::open(path)?;
316    let mut hasher = blake3::Hasher::new();
317    let mut buf = [0u8; 8192];
318    loop {
319        let read = file.read(&mut buf)?;
320        if read == 0 {
321            break;
322        }
323        hasher.update(&buf[..read]);
324    }
325    Ok(hasher.finalize().to_hex().to_string())
326}