1use std::collections::{BTreeMap, BTreeSet};
6
7use tokmd_analysis_types::{TopicClouds, TopicTerm};
8use tokmd_types::{ExportData, FileKind, FileRow};
9
/// Maximum number of terms kept in each ranked list (every per-module list
/// and the overall list are truncated to this length after sorting).
const TOP_K: usize = 8;
11
12pub fn build_topic_clouds(export: &ExportData) -> TopicClouds {
14 let parents: Vec<&FileRow> = export
15 .rows
16 .iter()
17 .filter(|r| r.kind == FileKind::Parent)
18 .collect();
19
20 let stopwords = build_stopwords(export);
21 let mut terms_by_module: BTreeMap<String, BTreeMap<String, u32>> = BTreeMap::new();
22 let mut df_map: BTreeMap<String, u32> = BTreeMap::new();
23
24 for row in parents {
25 let terms = tokenize_path(&row.path, &stopwords);
26 if terms.is_empty() {
27 continue;
28 }
29 let weight = weight_for_row(row);
30 let module_terms = terms_by_module.entry(row.module.clone()).or_default();
31 let mut seen: BTreeSet<String> = BTreeSet::new();
32 for term in terms {
33 *module_terms.entry(term.clone()).or_insert(0) += weight;
34 seen.insert(term);
35 }
36 for term in seen {
37 *df_map.entry(term).or_insert(0) += 1;
38 }
39 }
40
41 let module_count = terms_by_module.len() as f64;
42 let mut per_module: BTreeMap<String, Vec<TopicTerm>> = BTreeMap::new();
43 let mut overall_tf: BTreeMap<String, u32> = BTreeMap::new();
44
45 for (module, tf_map) in &terms_by_module {
46 let mut rows: Vec<TopicTerm> = tf_map
47 .iter()
48 .map(|(term, tf)| {
49 let df = *df_map.get(term).unwrap_or(&0);
50 let score = score_term(*tf, df, module_count);
51 TopicTerm {
52 term: term.clone(),
53 score,
54 tf: *tf,
55 df,
56 }
57 })
58 .collect();
59 rows.sort_by(|a, b| {
60 b.score
61 .partial_cmp(&a.score)
62 .unwrap_or(std::cmp::Ordering::Equal)
63 .then_with(|| a.term.cmp(&b.term))
64 });
65 rows.truncate(TOP_K);
66 per_module.insert(module.clone(), rows);
67
68 for (term, tf) in tf_map {
69 *overall_tf.entry(term.clone()).or_insert(0) += *tf;
70 }
71 }
72
73 let mut overall: Vec<TopicTerm> = overall_tf
74 .iter()
75 .map(|(term, tf)| {
76 let df = *df_map.get(term).unwrap_or(&0);
77 let score = score_term(*tf, df, module_count);
78 TopicTerm {
79 term: term.clone(),
80 score,
81 tf: *tf,
82 df,
83 }
84 })
85 .collect();
86 overall.sort_by(|a, b| {
87 b.score
88 .partial_cmp(&a.score)
89 .unwrap_or(std::cmp::Ordering::Equal)
90 .then_with(|| a.term.cmp(&b.term))
91 });
92 overall.truncate(TOP_K);
93
94 TopicClouds {
95 per_module,
96 overall,
97 }
98}
99
/// Returns the smoothed TF-IDF score for a term.
///
/// The score is `tf * (ln((module_count + 1) / (df + 1)) + 1)`, where `tf` is
/// the accumulated term weight, `df` the term's document frequency and
/// `module_count` the corpus size used for the inverse-frequency factor.
fn score_term(tf: u32, df: u32, module_count: f64) -> f64 {
    let ratio = (module_count + 1.0) / (f64::from(df) + 1.0);
    let inverse_frequency = ratio.ln() + 1.0;
    f64::from(tf) * inverse_frequency
}
106
107fn weight_for_row(row: &FileRow) -> u32 {
108 let weight = u32::try_from(row.tokens).unwrap_or(u32::MAX);
109 weight.max(1)
110}
111
/// Breaks a file path into lowercase terms, dropping stopwords.
///
/// The path is split on `/` and `\`, and each segment is split again on `_`,
/// `-` and `.`. Empty pieces are discarded, the rest are lowercased, and any
/// term found in `stopwords` is removed. Terms are returned in the order they
/// occur in the path; duplicates are kept.
fn tokenize_path(path: &str, stopwords: &BTreeSet<String>) -> Vec<String> {
    path.split(['/', '\\'])
        .flat_map(|segment| segment.split(['_', '-', '.']))
        .filter(|piece| !piece.is_empty())
        .map(str::to_lowercase)
        .filter(|term| !stopwords.contains(term))
        .collect()
}
128
129fn build_stopwords(export: &ExportData) -> BTreeSet<String> {
130 let mut stop = BTreeSet::new();
131 let base = [
132 "src",
133 "lib",
134 "mod",
135 "index",
136 "test",
137 "tests",
138 "impl",
139 "main",
140 "bin",
141 "pkg",
142 "package",
143 "target",
144 "build",
145 "dist",
146 "out",
147 "gen",
148 "generated",
149 ];
150 for word in base {
151 stop.insert(word.to_string());
152 }
153 let extensions = [
154 "rs", "js", "ts", "tsx", "jsx", "py", "go", "java", "kt", "kts", "rb", "php", "c", "cc",
155 "cpp", "h", "hpp", "cs", "swift", "m", "mm", "scala", "sql", "toml", "yaml", "yml", "json",
156 "md", "markdown", "txt", "lock", "cfg", "ini", "env", "nix", "zig", "dart",
157 ];
158 for ext in extensions {
159 stop.insert(ext.to_string());
160 }
161 for root in &export.module_roots {
162 stop.insert(root.to_lowercase());
163 }
164 stop
165}
166
#[cfg(test)]
mod tests {
    use super::*;
    use tokmd_types::{ChildIncludeMode, ExportData, FileKind, FileRow};

    /// Creates a Rust parent row for `path` in `module` with fixed size metrics.
    fn parent_row(path: &str, module: &str) -> FileRow {
        FileRow {
            path: path.to_string(),
            module: module.to_string(),
            lang: "Rust".to_string(),
            kind: FileKind::Parent,
            code: 10,
            comments: 0,
            blanks: 0,
            lines: 10,
            bytes: 100,
            tokens: 50,
        }
    }

    #[test]
    fn topic_clouds_are_deterministic() {
        let export = ExportData {
            rows: vec![
                parent_row("crates/auth/src/login.rs", "crates/auth"),
                parent_row("crates/auth/src/token.rs", "crates/auth"),
                parent_row("crates/payments/src/stripe_api.rs", "crates/payments"),
                parent_row("crates/payments/src/refund.rs", "crates/payments"),
            ],
            module_roots: vec!["crates".to_string()],
            module_depth: 2,
            children: ChildIncludeMode::Separate,
        };

        let topics = build_topic_clouds(&export);

        // Each module's cloud must surface the distinctive file-name terms.
        let expectations = [
            ("crates/auth", "login"),
            ("crates/auth", "token"),
            ("crates/payments", "stripe"),
            ("crates/payments", "refund"),
        ];
        for (module, expected) in expectations {
            let cloud = topics.per_module.get(module).unwrap();
            assert!(cloud.iter().any(|t| t.term == expected));
        }
    }
}