Skip to main content

tokmd_analysis_topics/
lib.rs

//! # tokmd-analysis-topics
//!
//! Topic-cloud extraction microcrate for analysis receipts.
4
5use std::collections::{BTreeMap, BTreeSet};
6
7use tokmd_analysis_types::{TopicClouds, TopicTerm};
8use tokmd_types::{ExportData, FileKind, FileRow};
9
/// Maximum number of ranked terms kept in each topic list.
const TOP_K: usize = 8;
11
12/// Build lightweight topic clouds from file path tokens.
13pub fn build_topic_clouds(export: &ExportData) -> TopicClouds {
14    let parents: Vec<&FileRow> = export
15        .rows
16        .iter()
17        .filter(|r| r.kind == FileKind::Parent)
18        .collect();
19
20    let stopwords = build_stopwords(export);
21    let mut terms_by_module: BTreeMap<String, BTreeMap<String, u32>> = BTreeMap::new();
22    let mut df_map: BTreeMap<String, u32> = BTreeMap::new();
23
24    for row in parents {
25        let terms = tokenize_path(&row.path, &stopwords);
26        if terms.is_empty() {
27            continue;
28        }
29        let weight = weight_for_row(row);
30        let module_terms = terms_by_module.entry(row.module.clone()).or_default();
31        let mut seen: BTreeSet<String> = BTreeSet::new();
32        for term in terms {
33            *module_terms.entry(term.clone()).or_insert(0) += weight;
34            seen.insert(term);
35        }
36        for term in seen {
37            *df_map.entry(term).or_insert(0) += 1;
38        }
39    }
40
41    let module_count = terms_by_module.len() as f64;
42    let mut per_module: BTreeMap<String, Vec<TopicTerm>> = BTreeMap::new();
43    let mut overall_tf: BTreeMap<String, u32> = BTreeMap::new();
44
45    for (module, tf_map) in &terms_by_module {
46        let mut rows: Vec<TopicTerm> = tf_map
47            .iter()
48            .map(|(term, tf)| {
49                let df = *df_map.get(term).unwrap_or(&0);
50                let score = score_term(*tf, df, module_count);
51                TopicTerm {
52                    term: term.clone(),
53                    score,
54                    tf: *tf,
55                    df,
56                }
57            })
58            .collect();
59        rows.sort_by(|a, b| {
60            b.score
61                .partial_cmp(&a.score)
62                .unwrap_or(std::cmp::Ordering::Equal)
63                .then_with(|| a.term.cmp(&b.term))
64        });
65        rows.truncate(TOP_K);
66        per_module.insert(module.clone(), rows);
67
68        for (term, tf) in tf_map {
69            *overall_tf.entry(term.clone()).or_insert(0) += *tf;
70        }
71    }
72
73    let mut overall: Vec<TopicTerm> = overall_tf
74        .iter()
75        .map(|(term, tf)| {
76            let df = *df_map.get(term).unwrap_or(&0);
77            let score = score_term(*tf, df, module_count);
78            TopicTerm {
79                term: term.clone(),
80                score,
81                tf: *tf,
82                df,
83            }
84        })
85        .collect();
86    overall.sort_by(|a, b| {
87        b.score
88            .partial_cmp(&a.score)
89            .unwrap_or(std::cmp::Ordering::Equal)
90            .then_with(|| a.term.cmp(&b.term))
91    });
92    overall.truncate(TOP_K);
93
94    TopicClouds {
95        per_module,
96        overall,
97    }
98}
99
/// TF-IDF style score: `tf * (ln((module_count + 1) / (df + 1)) + 1)`.
///
/// The inverse-document-frequency factor equals 1 when `df == module_count`,
/// and falls below 1 (eventually below 0) as `df` grows past `module_count`.
fn score_term(tf: u32, df: u32, module_count: f64) -> f64 {
    let ratio = (module_count + 1.0) / (f64::from(df) + 1.0);
    let idf = ratio.ln() + 1.0;
    f64::from(tf) * idf
}
106
107fn weight_for_row(row: &FileRow) -> u32 {
108    let weight = u32::try_from(row.tokens).unwrap_or(u32::MAX);
109    weight.max(1)
110}
111
/// Split `path` into lowercase terms, dropping any term found in `stopwords`.
///
/// Both `/` and `\` separate path components, and `_`, `-` and `.` separate
/// words inside a component. Empty pieces are discarded, and terms are returned
/// in the order they appear in the path (duplicates are kept).
fn tokenize_path(path: &str, stopwords: &BTreeSet<String>) -> Vec<String> {
    path.split(['/', '\\', '_', '-', '.'])
        .filter(|piece| !piece.is_empty())
        .map(str::to_lowercase)
        .filter(|term| !stopwords.contains(term))
        .collect()
}
128
129fn build_stopwords(export: &ExportData) -> BTreeSet<String> {
130    let mut stop = BTreeSet::new();
131    let base = [
132        "src",
133        "lib",
134        "mod",
135        "index",
136        "test",
137        "tests",
138        "impl",
139        "main",
140        "bin",
141        "pkg",
142        "package",
143        "target",
144        "build",
145        "dist",
146        "out",
147        "gen",
148        "generated",
149    ];
150    for word in base {
151        stop.insert(word.to_string());
152    }
153    let extensions = [
154        "rs", "js", "ts", "tsx", "jsx", "py", "go", "java", "kt", "kts", "rb", "php", "c", "cc",
155        "cpp", "h", "hpp", "cs", "swift", "m", "mm", "scala", "sql", "toml", "yaml", "yml", "json",
156        "md", "markdown", "txt", "lock", "cfg", "ini", "env", "nix", "zig", "dart",
157    ];
158    for ext in extensions {
159        stop.insert(ext.to_string());
160    }
161    for root in &export.module_roots {
162        stop.insert(root.to_lowercase());
163    }
164    stop
165}
166
#[cfg(test)]
mod tests {
    use super::*;
    use tokmd_types::{ChildIncludeMode, ExportData, FileKind, FileRow};

    /// Fixture: a parent Rust row at `path` in `module` with fixed size metrics.
    fn parent_row(path: &str, module: &str) -> FileRow {
        FileRow {
            path: path.to_string(),
            module: module.to_string(),
            lang: "Rust".to_string(),
            kind: FileKind::Parent,
            code: 10,
            comments: 0,
            blanks: 0,
            lines: 10,
            bytes: 100,
            tokens: 50,
        }
    }

    /// Two modules with two files each should surface their file-name terms.
    #[test]
    fn topic_clouds_are_deterministic() {
        let export = ExportData {
            rows: vec![
                parent_row("crates/auth/src/login.rs", "crates/auth"),
                parent_row("crates/auth/src/token.rs", "crates/auth"),
                parent_row("crates/payments/src/stripe_api.rs", "crates/payments"),
                parent_row("crates/payments/src/refund.rs", "crates/payments"),
            ],
            module_roots: vec!["crates".to_string()],
            module_depth: 2,
            children: ChildIncludeMode::Separate,
        };

        let topics = build_topic_clouds(&export);
        let auth = topics.per_module.get("crates/auth").unwrap();
        let payments = topics.per_module.get("crates/payments").unwrap();

        let has_term = |cloud: &[TopicTerm], wanted: &str| cloud.iter().any(|t| t.term == wanted);

        assert!(has_term(auth, "login"));
        assert!(has_term(auth, "token"));
        assert!(has_term(payments, "stripe"));
        assert!(has_term(payments, "refund"));
    }
}