1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::HalsteadMetrics;
6use tokmd_types::{ExportData, FileKind, FileRow};
7
8use tokmd_analysis_util::{AnalysisLimits, normalize_path};
9
10const DEFAULT_MAX_FILE_BYTES: u64 = 128 * 1024;
11
12pub fn is_halstead_lang(lang: &str) -> bool {
14 matches!(
15 lang.to_lowercase().as_str(),
16 "rust"
17 | "javascript"
18 | "typescript"
19 | "python"
20 | "go"
21 | "c"
22 | "c++"
23 | "java"
24 | "c#"
25 | "php"
26 | "ruby"
27 )
28}
29
30pub fn operators_for_lang(lang: &str) -> &'static [&'static str] {
32 match lang.to_lowercase().as_str() {
33 "rust" => &[
34 "fn", "let", "mut", "if", "else", "match", "while", "for", "loop", "return", "break",
35 "continue", "struct", "enum", "impl", "trait", "pub", "use", "mod", "async", "await",
36 "move", "ref", "self", "Self", "where", "type", "const", "static", "unsafe", "extern",
37 "crate", "super", "as", "in", "->", "=>", "::", "=", "==", "!=", "<", ">", "<=", ">=",
38 "+", "-", "*", "/", "%", "&", "|", "^", "!", "&&", "||", "<<", ">>", "+=", "-=", "*=",
39 "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "..", "..=", "?",
40 ],
41 "javascript" | "typescript" => &[
42 "function",
43 "var",
44 "let",
45 "const",
46 "if",
47 "else",
48 "switch",
49 "case",
50 "while",
51 "for",
52 "do",
53 "return",
54 "break",
55 "continue",
56 "class",
57 "new",
58 "this",
59 "super",
60 "import",
61 "export",
62 "default",
63 "try",
64 "catch",
65 "finally",
66 "throw",
67 "async",
68 "await",
69 "yield",
70 "typeof",
71 "instanceof",
72 "in",
73 "of",
74 "delete",
75 "void",
76 "=>",
77 "=",
78 "==",
79 "===",
80 "!=",
81 "!==",
82 "<",
83 ">",
84 "<=",
85 ">=",
86 "+",
87 "-",
88 "*",
89 "/",
90 "%",
91 "&",
92 "|",
93 "^",
94 "!",
95 "&&",
96 "||",
97 "~",
98 "<<",
99 ">>",
100 ">>>",
101 "+=",
102 "-=",
103 "*=",
104 "/=",
105 "%=",
106 "&=",
107 "|=",
108 "^=",
109 "<<=",
110 ">>=",
111 ">>>=",
112 "**",
113 "**=",
114 "?.",
115 "??",
116 "??=",
117 "++",
118 "--",
119 "...",
120 ],
121 "python" => &[
122 "def", "class", "if", "elif", "else", "while", "for", "return", "break", "continue",
123 "import", "from", "as", "try", "except", "finally", "raise", "with", "yield", "lambda",
124 "pass", "del", "global", "nonlocal", "assert", "async", "await", "and", "or", "not",
125 "is", "in", "=", "==", "!=", "<", ">", "<=", ">=", "+", "-", "*", "/", "//", "%", "**",
126 "&", "|", "^", "~", "<<", ">>", "+=", "-=", "*=", "/=", "//=", "%=", "**=", "&=", "|=",
127 "^=", "<<=", ">>=", ":=",
128 ],
129 "go" => &[
130 "func",
131 "var",
132 "const",
133 "type",
134 "struct",
135 "interface",
136 "if",
137 "else",
138 "switch",
139 "case",
140 "for",
141 "range",
142 "return",
143 "break",
144 "continue",
145 "goto",
146 "defer",
147 "go",
148 "select",
149 "chan",
150 "map",
151 "package",
152 "import",
153 "fallthrough",
154 ":=",
155 "=",
156 "==",
157 "!=",
158 "<",
159 ">",
160 "<=",
161 ">=",
162 "+",
163 "-",
164 "*",
165 "/",
166 "%",
167 "&",
168 "|",
169 "^",
170 "!",
171 "&&",
172 "||",
173 "<<",
174 ">>",
175 "&^",
176 "+=",
177 "-=",
178 "*=",
179 "/=",
180 "%=",
181 "&=",
182 "|=",
183 "^=",
184 "<<=",
185 ">>=",
186 "&^=",
187 "++",
188 "--",
189 "<-",
190 ],
191 "c" | "c++" | "java" | "c#" | "php" => &[
192 "if",
193 "else",
194 "switch",
195 "case",
196 "while",
197 "for",
198 "do",
199 "return",
200 "break",
201 "continue",
202 "class",
203 "struct",
204 "new",
205 "delete",
206 "try",
207 "catch",
208 "throw",
209 "finally",
210 "void",
211 "static",
212 "const",
213 "public",
214 "private",
215 "protected",
216 "virtual",
217 "override",
218 "abstract",
219 "interface",
220 "extends",
221 "implements",
222 "import",
223 "include",
224 "using",
225 "namespace",
226 "typedef",
227 "template",
228 "=",
229 "==",
230 "!=",
231 "<",
232 ">",
233 "<=",
234 ">=",
235 "+",
236 "-",
237 "*",
238 "/",
239 "%",
240 "&",
241 "|",
242 "^",
243 "!",
244 "&&",
245 "||",
246 "~",
247 "<<",
248 ">>",
249 "+=",
250 "-=",
251 "*=",
252 "/=",
253 "%=",
254 "&=",
255 "|=",
256 "^=",
257 "<<=",
258 ">>=",
259 "++",
260 "--",
261 "->",
262 "::",
263 "sizeof",
264 "typeof",
265 ],
266 "ruby" => &[
267 "def",
268 "class",
269 "module",
270 "if",
271 "elsif",
272 "else",
273 "unless",
274 "while",
275 "until",
276 "for",
277 "do",
278 "return",
279 "break",
280 "next",
281 "begin",
282 "rescue",
283 "ensure",
284 "raise",
285 "yield",
286 "require",
287 "include",
288 "extend",
289 "attr_reader",
290 "attr_writer",
291 "attr_accessor",
292 "self",
293 "super",
294 "end",
295 "and",
296 "or",
297 "not",
298 "=",
299 "==",
300 "!=",
301 "<",
302 ">",
303 "<=",
304 ">=",
305 "+",
306 "-",
307 "*",
308 "/",
309 "%",
310 "&",
311 "|",
312 "^",
313 "!",
314 "&&",
315 "||",
316 "~",
317 "<<",
318 ">>",
319 "+=",
320 "-=",
321 "*=",
322 "/=",
323 "%=",
324 "&=",
325 "|=",
326 "^=",
327 "<<=",
328 ">>=",
329 "**",
330 "**=",
331 "<=>",
332 "=~",
333 "!~",
334 "..",
335 "...",
336 ],
337 _ => &[],
338 }
339}
340
341pub struct FileTokenCounts {
343 pub operators: BTreeMap<String, usize>,
344 pub operands: BTreeSet<String>,
345 pub total_operators: usize,
346 pub total_operands: usize,
347}
348
349pub fn tokenize_for_halstead(text: &str, lang: &str) -> FileTokenCounts {
351 let ops = operators_for_lang(lang);
352 let op_set: BTreeSet<&str> = ops.iter().copied().collect();
353
354 let mut operators: BTreeMap<String, usize> = BTreeMap::new();
355 let mut operands: BTreeSet<String> = BTreeSet::new();
356 let mut total_operators = 0usize;
357 let mut total_operands = 0usize;
358
359 for line in text.lines() {
360 let trimmed = line.trim();
361 if trimmed.is_empty()
363 || trimmed.starts_with("//")
364 || trimmed.starts_with('#')
365 || trimmed.starts_with("/*")
366 || trimmed.starts_with('*')
367 {
368 continue;
369 }
370
371 let mut chars = trimmed.chars().peekable();
373 while let Some(&ch) = chars.peek() {
374 if ch.is_whitespace() {
375 chars.next();
376 continue;
377 }
378
379 if ch.is_ascii_punctuation() && ch != '_' && ch != '"' && ch != '\'' {
381 let remaining: String = chars.clone().take(4).collect();
382 let mut matched = false;
383 for len in (1..=remaining.len().min(4)).rev() {
384 let candidate = &remaining[..len];
385 if op_set.contains(candidate) {
386 *operators.entry(candidate.to_string()).or_insert(0) += 1;
387 total_operators += 1;
388 for _ in 0..len {
389 chars.next();
390 }
391 matched = true;
392 break;
393 }
394 }
395 if !matched {
396 chars.next();
397 }
398 continue;
399 }
400
401 if ch.is_alphanumeric() || ch == '_' {
403 let mut word = String::new();
404 while let Some(&c) = chars.peek() {
405 if c.is_alphanumeric() || c == '_' {
406 word.push(c);
407 chars.next();
408 } else {
409 break;
410 }
411 }
412 if op_set.contains(word.as_str()) {
413 *operators.entry(word).or_insert(0) += 1;
414 total_operators += 1;
415 } else {
416 operands.insert(word);
417 total_operands += 1;
418 }
419 continue;
420 }
421
422 if ch == '"' || ch == '\'' {
424 chars.next();
425 let quote = ch;
426 while let Some(&c) = chars.peek() {
427 chars.next();
428 if c == '\\' {
429 chars.next(); } else if c == quote {
431 break;
432 }
433 }
434 total_operands += 1; operands.insert("<string>".to_string());
436 continue;
437 }
438
439 chars.next();
440 }
441 }
442
443 FileTokenCounts {
444 operators,
445 operands,
446 total_operators,
447 total_operands,
448 }
449}
450
451pub fn build_halstead_report(
453 root: &Path,
454 files: &[PathBuf],
455 export: &ExportData,
456 limits: &AnalysisLimits,
457) -> Result<HalsteadMetrics> {
458 let mut row_map: BTreeMap<String, &FileRow> = BTreeMap::new();
459 for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
460 row_map.insert(normalize_path(&row.path, root), row);
461 }
462
463 let mut all_operators: BTreeMap<String, usize> = BTreeMap::new();
464 let mut all_operands: BTreeSet<String> = BTreeSet::new();
465 let mut total_ops = 0usize;
466 let mut total_opds = 0usize;
467 let mut total_bytes = 0u64;
468 let max_total = limits.max_bytes;
469 let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;
470
471 for rel in files {
472 if max_total.is_some_and(|limit| total_bytes >= limit) {
473 break;
474 }
475 let rel_str = rel.to_string_lossy().replace('\\', "/");
476 let row = match row_map.get(&rel_str) {
477 Some(r) => *r,
478 None => continue,
479 };
480 if !is_halstead_lang(&row.lang) {
481 continue;
482 }
483
484 let path = root.join(rel);
485 let bytes = match tokmd_content::read_head(&path, per_file_limit) {
486 Ok(b) => b,
487 Err(_) => continue,
488 };
489 total_bytes += bytes.len() as u64;
490
491 if !tokmd_content::is_text_like(&bytes) {
492 continue;
493 }
494
495 let text = String::from_utf8_lossy(&bytes);
496 let lang_lower = row.lang.to_lowercase();
497 let counts = tokenize_for_halstead(&text, &lang_lower);
498
499 for (op, count) in counts.operators {
500 *all_operators.entry(op).or_insert(0) += count;
501 }
502 all_operands.extend(counts.operands);
503 total_ops += counts.total_operators;
504 total_opds += counts.total_operands;
505 }
506
507 let n1 = all_operators.len(); let n2 = all_operands.len(); let vocabulary = n1 + n2;
510 let length = total_ops + total_opds;
511
512 let volume = if vocabulary > 0 {
513 length as f64 * (vocabulary as f64).log2()
514 } else {
515 0.0
516 };
517
518 let difficulty = if n2 > 0 {
519 (n1 as f64 / 2.0) * (total_opds as f64 / n2 as f64)
520 } else {
521 0.0
522 };
523
524 let effort = difficulty * volume;
525 let time_seconds = effort / 18.0;
526 let estimated_bugs = volume / 3000.0;
527
528 Ok(HalsteadMetrics {
529 distinct_operators: n1,
530 distinct_operands: n2,
531 total_operators: total_ops,
532 total_operands: total_opds,
533 vocabulary,
534 length,
535 volume: round_f64(volume, 2),
536 difficulty: round_f64(difficulty, 2),
537 effort: round_f64(effort, 2),
538 time_seconds: round_f64(time_seconds, 2),
539 estimated_bugs: round_f64(estimated_bugs, 4),
540 })
541}
542
543pub fn round_f64(val: f64, decimals: u32) -> f64 {
545 let factor = 10f64.powi(decimals as i32);
546 (val * factor).round() / factor
547}
548
549#[cfg(test)]
550mod tests {
551 use super::*;
552
553 #[test]
554 fn test_tokenize_rust() {
555 let code = r#"
556fn add(a: i32, b: i32) -> i32 {
557 if a > b {
558 a + b
559 } else {
560 a - b
561 }
562}
563"#;
564 let counts = tokenize_for_halstead(code, "rust");
565 assert!(counts.total_operators > 0);
566 assert!(counts.total_operands > 0);
567 assert!(!counts.operators.is_empty());
568 assert!(!counts.operands.is_empty());
569 }
570
571 #[test]
572 fn test_tokenize_python() {
573 let code = r#"
574def add(a, b):
575 if a > b:
576 return a + b
577 else:
578 return a - b
579"#;
580 let counts = tokenize_for_halstead(code, "python");
581 assert!(counts.total_operators > 0);
582 assert!(counts.total_operands > 0);
583 }
584
585 #[test]
586 fn test_halstead_computation() {
587 let n1 = 2;
594 let n2 = 3;
595 let total_ops = 4;
596 let total_opds = 6;
597 let vocabulary = n1 + n2;
598 let length = total_ops + total_opds;
599 let volume = length as f64 * (vocabulary as f64).log2();
600 let difficulty = (n1 as f64 / 2.0) * (total_opds as f64 / n2 as f64);
601 let effort = difficulty * volume;
602
603 assert!((volume - 23.22).abs() < 0.1);
604 assert!((difficulty - 2.0).abs() < 0.01);
605 assert!((effort - 46.44).abs() < 0.2);
606 }
607
608 #[test]
609 fn test_empty_input() {
610 let counts = tokenize_for_halstead("", "rust");
611 assert_eq!(counts.total_operators, 0);
612 assert_eq!(counts.total_operands, 0);
613 }
614}