tokmd-analysis 1.10.0

Analysis logic and enrichers for tokmd receipts.
Documentation
use std::collections::{BTreeMap, BTreeSet};
use std::path::{Path, PathBuf};

use anyhow::Result;
use tokmd_analysis_types::{
    DuplicateGroup, DuplicateReport, DuplicationDensityReport, ImportEdge, ImportReport,
    ModuleDuplicationDensityRow, TodoReport, TodoTagRow,
};
use tokmd_types::{ExportData, FileKind, FileRow};

use tokmd_analysis_types::normalize_path;
use tokmd_scan::round_f64;

pub(crate) mod complexity;
pub(crate) mod io;

const DEFAULT_MAX_FILE_BYTES: u64 = 128 * 1024;
const IMPORT_MAX_LINES: usize = 200;

#[derive(Debug, Clone, Copy)]
pub(crate) enum ImportGranularity {
    Module,
    File,
}

#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct ContentLimits {
    pub max_bytes: Option<u64>,
    pub max_file_bytes: Option<u64>,
}

pub(crate) fn build_todo_report(
    root: &Path,
    files: &[PathBuf],
    limits: &ContentLimits,
    total_code: usize,
) -> Result<TodoReport> {
    let mut counts: BTreeMap<String, usize> = BTreeMap::new();
    let tags = ["TODO", "FIXME", "HACK", "XXX"];
    let mut total_bytes = 0u64;
    let max_total = limits.max_bytes;
    let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;

    for rel in files {
        if max_total.is_some_and(|limit| total_bytes >= limit) {
            break;
        }
        let path = root.join(rel);
        let bytes = crate::content::io::read_head(&path, per_file_limit)?;
        total_bytes += bytes.len() as u64;
        if !crate::content::io::is_text_like(&bytes) {
            continue;
        }
        let text = String::from_utf8_lossy(&bytes);
        for (tag, count) in crate::content::io::count_tags(&text, &tags) {
            *counts.entry(tag).or_insert(0) += count;
        }
    }

    let total: usize = counts.values().sum();
    let kloc = if total_code == 0 {
        0.0
    } else {
        total_code as f64 / 1000.0
    };
    let density = if kloc == 0.0 {
        0.0
    } else {
        round_f64(total as f64 / kloc, 2)
    };

    let mut tags: Vec<TodoTagRow> = counts
        .into_iter()
        .map(|(tag, count)| TodoTagRow { tag, count })
        .collect();
    tags.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.tag.cmp(&b.tag)));

    Ok(TodoReport {
        total,
        density_per_kloc: density,
        tags,
    })
}

pub(crate) fn build_duplicate_report(
    root: &Path,
    files: &[PathBuf],
    export: &ExportData,
    limits: &ContentLimits,
) -> Result<DuplicateReport> {
    let mut by_size: BTreeMap<u64, Vec<&PathBuf>> = BTreeMap::new();
    let size_limit = limits.max_file_bytes;

    for rel in files {
        let size = std::fs::metadata(root.join(rel))
            .map(|m| m.len())
            .unwrap_or(0);
        if size_limit.is_some_and(|limit| size > limit) {
            continue;
        }
        by_size.entry(size).or_default().push(rel);
    }

    let mut path_to_module: BTreeMap<String, &str> = BTreeMap::new();
    let mut module_bytes: BTreeMap<&str, u64> = BTreeMap::new();
    for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
        let normalized = normalize_path(&row.path, root);
        path_to_module.insert(normalized, row.module.as_str());
        *module_bytes.entry(row.module.as_str()).or_insert(0) += row.bytes as u64;
    }

    let mut groups: Vec<DuplicateGroup> = Vec::new();
    let mut wasted_bytes = 0u64;
    let mut duplicate_files = 0usize;
    let mut duplicated_bytes = 0u64;

    let mut module_duplicate_files: BTreeMap<String, usize> = BTreeMap::new();
    let mut module_wasted_files: BTreeMap<String, usize> = BTreeMap::new();
    let mut module_duplicated_bytes: BTreeMap<String, u64> = BTreeMap::new();
    let mut module_wasted_bytes: BTreeMap<String, u64> = BTreeMap::new();

    for (size, paths) in by_size {
        if paths.len() < 2 || size == 0 {
            continue;
        }
        let mut by_hash: BTreeMap<String, Vec<String>> = BTreeMap::new();
        for rel in paths {
            let path = root.join(rel);
            if let Ok(hash) = hash_file_full(&path) {
                by_hash
                    .entry(hash)
                    .or_default()
                    .push(rel.to_string_lossy().replace('\\', "/"));
            }
        }
        for (hash, mut files) in by_hash {
            if files.len() < 2 {
                continue;
            }
            files.sort();
            wasted_bytes += (files.len() as u64 - 1) * size;

            for (idx, file) in files.iter().enumerate() {
                let module = path_to_module.get(file).copied().unwrap_or("(unknown)");
                if let Some(val) = module_duplicate_files.get_mut(module) {
                    *val += 1;
                } else {
                    module_duplicate_files.insert(module.to_string(), 1);
                }
                if let Some(val) = module_duplicated_bytes.get_mut(module) {
                    *val += size;
                } else {
                    module_duplicated_bytes.insert(module.to_string(), size);
                }
                duplicate_files += 1;
                duplicated_bytes += size;

                if idx > 0 {
                    if let Some(val) = module_wasted_files.get_mut(module) {
                        *val += 1;
                    } else {
                        module_wasted_files.insert(module.to_string(), 1);
                    }
                    if let Some(val) = module_wasted_bytes.get_mut(module) {
                        *val += size;
                    } else {
                        module_wasted_bytes.insert(module.to_string(), size);
                    }
                }
            }

            groups.push(DuplicateGroup {
                hash,
                bytes: size,
                files,
            });
        }
    }

    groups.sort_by(|a, b| b.bytes.cmp(&a.bytes).then_with(|| a.hash.cmp(&b.hash)));

    let mut modules: BTreeSet<String> = BTreeSet::new();
    modules.extend(module_duplicate_files.keys().cloned());
    modules.extend(module_wasted_files.keys().cloned());

    let mut by_module: Vec<ModuleDuplicationDensityRow> = modules
        .into_iter()
        .map(|module| {
            let duplicate_files = module_duplicate_files.get(&module).copied().unwrap_or(0);
            let wasted_files = module_wasted_files.get(&module).copied().unwrap_or(0);
            let duplicated_bytes = module_duplicated_bytes.get(&module).copied().unwrap_or(0);
            let wasted_bytes = module_wasted_bytes.get(&module).copied().unwrap_or(0);
            let module_total = module_bytes.get(module.as_str()).copied().unwrap_or(0);
            let density = if module_total == 0 {
                0.0
            } else {
                round_f64(wasted_bytes as f64 / module_total as f64, 4)
            };
            ModuleDuplicationDensityRow {
                module,
                duplicate_files,
                wasted_files,
                duplicated_bytes,
                wasted_bytes,
                module_bytes: module_total,
                density,
            }
        })
        .collect();
    by_module.sort_by(|a, b| {
        b.wasted_bytes
            .cmp(&a.wasted_bytes)
            .then_with(|| a.module.cmp(&b.module))
    });

    let total_codebase_bytes: u64 = module_bytes.values().sum();
    let wasted_pct_of_codebase = if total_codebase_bytes == 0 {
        0.0
    } else {
        round_f64(wasted_bytes as f64 / total_codebase_bytes as f64, 4)
    };
    let density = DuplicationDensityReport {
        duplicate_groups: groups.len(),
        duplicate_files,
        duplicated_bytes,
        wasted_bytes,
        wasted_pct_of_codebase,
        by_module,
    };

    Ok(DuplicateReport {
        groups,
        wasted_bytes,
        strategy: "exact-blake3".to_string(),
        density: Some(density),
        near: None,
    })
}

pub(crate) fn build_import_report(
    root: &Path,
    files: &[PathBuf],
    export: &ExportData,
    granularity: ImportGranularity,
    limits: &ContentLimits,
) -> Result<ImportReport> {
    let mut map: BTreeMap<String, &FileRow> = BTreeMap::new();
    for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
        let key = normalize_path(&row.path, root);
        map.insert(key, row);
    }

    let mut edges: BTreeMap<(&str, String), usize> = BTreeMap::new();
    let mut total_bytes = 0u64;
    let max_total = limits.max_bytes;
    let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;

    for rel in files {
        if max_total.is_some_and(|limit| total_bytes >= limit) {
            break;
        }
        let rel_str = rel.to_string_lossy().replace('\\', "/");
        let row = match map.get(&rel_str) {
            Some(r) => *r,
            None => continue,
        };
        if !crate::imports::supports_language(&row.lang) {
            continue;
        }
        let path = root.join(rel);
        let lines = match crate::content::io::read_lines(&path, IMPORT_MAX_LINES, per_file_limit) {
            Ok(lines) => lines,
            Err(_) => continue,
        };
        total_bytes += lines.iter().map(|l| l.len() as u64).sum::<u64>();
        let imports = crate::imports::parse_imports(&row.lang, &lines);
        if imports.is_empty() {
            continue;
        }
        let source = match granularity {
            ImportGranularity::Module => row.module.as_str(),
            ImportGranularity::File => row.path.as_str(),
        };
        for import in imports {
            let target = crate::imports::normalize_import_target(&import);
            let key = (source, target);
            *edges.entry(key).or_insert(0) += 1;
        }
    }

    let mut edge_rows: Vec<ImportEdge> = edges
        .into_iter()
        .map(|((from, to), count)| ImportEdge {
            from: from.to_string(),
            to,
            count,
        })
        .collect();
    edge_rows.sort_by(|a, b| {
        b.count
            .cmp(&a.count)
            .then_with(|| a.from.cmp(&b.from))
            .then_with(|| a.to.cmp(&b.to))
    });

    Ok(ImportReport {
        granularity: match granularity {
            ImportGranularity::Module => "module".to_string(),
            ImportGranularity::File => "file".to_string(),
        },
        edges: edge_rows,
    })
}

fn hash_file_full(path: &Path) -> Result<String> {
    use std::io::Read;
    let mut file = std::fs::File::open(path)?;
    let mut hasher = blake3::Hasher::new();
    let mut buf = [0u8; 8192];
    loop {
        let read = file.read(&mut buf)?;
        if read == 0 {
            break;
        }
        hasher.update(&buf[..read]);
    }
    Ok(hasher.finalize().to_hex().to_string())
}

#[cfg(test)]
#[path = "tests.rs"]
mod moved_tests;