tokmd-analysis 1.10.0

Analysis logic and enrichers for tokmd receipts.
Documentation
//! Topic-cloud extraction for analysis receipts.
//!
//! This module preserves the former `analysis topics module` seam inside the
//! `tokmd-analysis` owner crate.

use std::collections::{BTreeMap, BTreeSet};

use tokmd_analysis_types::{TopicClouds, TopicTerm};
use tokmd_types::{ExportData, FileKind, FileRow};

const TOP_K: usize = 8;

pub(crate) fn build_topic_clouds(export: &ExportData) -> TopicClouds {
    let parents: Vec<&FileRow> = export
        .rows
        .iter()
        .filter(|r| r.kind == FileKind::Parent)
        .collect();

    let stopwords = build_stopwords(export);
    let mut terms_by_module: BTreeMap<&str, BTreeMap<String, u32>> = BTreeMap::new();
    let mut df_map: BTreeMap<String, u32> = BTreeMap::new();

    for row in parents {
        let mut terms = tokenize_path(&row.path, &stopwords);
        if terms.is_empty() {
            continue;
        }
        let weight = weight_for_row(row);
        let module_terms = terms_by_module.entry(row.module.as_str()).or_default();
        terms.sort_unstable();

        for term in &terms {
            match module_terms.get_mut(term) {
                Some(count) => *count += weight,
                None => {
                    module_terms.insert(term.clone(), weight);
                }
            }
        }

        terms.dedup();
        for term in terms {
            match df_map.get_mut(&term) {
                Some(count) => *count += 1,
                None => {
                    df_map.insert(term, 1);
                }
            }
        }
    }

    let module_count = terms_by_module.len() as f64;
    let mut per_module: BTreeMap<String, Vec<TopicTerm>> = BTreeMap::new();
    let mut overall_tf: BTreeMap<String, u32> = BTreeMap::new();

    for (module, tf_map) in &terms_by_module {
        let mut rows: Vec<TopicTerm> = tf_map
            .iter()
            .map(|(term, tf)| {
                let df = *df_map.get(term).unwrap_or(&0);
                let score = score_term(*tf, df, module_count);
                TopicTerm {
                    term: term.clone(),
                    score,
                    tf: *tf,
                    df,
                }
            })
            .collect();
        rows.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.term.cmp(&b.term))
        });
        rows.truncate(TOP_K);
        per_module.insert(module.to_string(), rows);

        for (term, tf) in tf_map {
            *overall_tf.entry(term.clone()).or_insert(0) += *tf;
        }
    }

    let mut overall: Vec<TopicTerm> = overall_tf
        .iter()
        .map(|(term, tf)| {
            let df = *df_map.get(term).unwrap_or(&0);
            let score = score_term(*tf, df, module_count);
            TopicTerm {
                term: term.clone(),
                score,
                tf: *tf,
                df,
            }
        })
        .collect();
    overall.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.term.cmp(&b.term))
    });
    overall.truncate(TOP_K);

    TopicClouds {
        per_module,
        overall,
    }
}

fn score_term(tf: u32, df: u32, module_count: f64) -> f64 {
    let tf = tf as f64;
    let df = df as f64;
    let idf = ((module_count + 1.0) / (df + 1.0)).ln() + 1.0;
    tf * idf
}

fn weight_for_row(row: &FileRow) -> u32 {
    let weight = u32::try_from(row.tokens).unwrap_or(u32::MAX);
    weight.max(1)
}

fn tokenize_path(path: &str, stopwords: &BTreeSet<String>) -> Vec<String> {
    let mut out = Vec::new();
    for part in path.replace('\\', "/").split('/') {
        if part.is_empty() {
            continue;
        }
        for token in part.split(['_', '-', '.']).filter(|t| !t.is_empty()) {
            let term = token.to_lowercase();
            if stopwords.contains(&term) {
                continue;
            }
            out.push(term);
        }
    }
    out
}

fn build_stopwords(export: &ExportData) -> BTreeSet<String> {
    let mut stop = BTreeSet::new();
    let base = [
        "src",
        "lib",
        "mod",
        "index",
        "test",
        "tests",
        "impl",
        "main",
        "bin",
        "pkg",
        "package",
        "target",
        "build",
        "dist",
        "out",
        "gen",
        "generated",
    ];
    for word in base {
        stop.insert(word.to_string());
    }
    let extensions = [
        "rs", "js", "ts", "tsx", "jsx", "py", "go", "java", "kt", "kts", "rb", "php", "c", "cc",
        "cpp", "h", "hpp", "cs", "swift", "m", "mm", "scala", "sql", "toml", "yaml", "yml", "json",
        "md", "markdown", "txt", "lock", "cfg", "ini", "env", "nix", "zig", "dart",
    ];
    for ext in extensions {
        stop.insert(ext.to_string());
    }
    for root in &export.module_roots {
        stop.insert(root.to_lowercase());
    }
    stop
}

#[cfg(test)]
mod unit_tests {
    use super::*;
    use tokmd_types::{ChildIncludeMode, ExportData, FileKind, FileRow};

    #[test]
    fn topic_clouds_are_deterministic() {
        let rows = vec![
            FileRow {
                path: "crates/auth/src/login.rs".to_string(),
                module: "crates/auth".to_string(),
                lang: "Rust".to_string(),
                kind: FileKind::Parent,
                code: 10,
                comments: 0,
                blanks: 0,
                lines: 10,
                bytes: 100,
                tokens: 50,
            },
            FileRow {
                path: "crates/auth/src/token.rs".to_string(),
                module: "crates/auth".to_string(),
                lang: "Rust".to_string(),
                kind: FileKind::Parent,
                code: 10,
                comments: 0,
                blanks: 0,
                lines: 10,
                bytes: 100,
                tokens: 50,
            },
            FileRow {
                path: "crates/payments/src/stripe_api.rs".to_string(),
                module: "crates/payments".to_string(),
                lang: "Rust".to_string(),
                kind: FileKind::Parent,
                code: 10,
                comments: 0,
                blanks: 0,
                lines: 10,
                bytes: 100,
                tokens: 50,
            },
            FileRow {
                path: "crates/payments/src/refund.rs".to_string(),
                module: "crates/payments".to_string(),
                lang: "Rust".to_string(),
                kind: FileKind::Parent,
                code: 10,
                comments: 0,
                blanks: 0,
                lines: 10,
                bytes: 100,
                tokens: 50,
            },
        ];
        let export = ExportData {
            rows,
            module_roots: vec!["crates".to_string()],
            module_depth: 2,
            children: ChildIncludeMode::Separate,
        };

        let topics = build_topic_clouds(&export);
        let auth = topics.per_module.get("crates/auth").unwrap();
        let payments = topics.per_module.get("crates/payments").unwrap();

        assert!(auth.iter().any(|t| t.term == "login"));
        assert!(auth.iter().any(|t| t.term == "token"));
        assert!(payments.iter().any(|t| t.term == "stripe"));
        assert!(payments.iter().any(|t| t.term == "refund"));
    }
}

#[cfg(test)]
mod tests;