Skip to main content

tokmd_analysis_halstead/
lib.rs

1use std::collections::{BTreeMap, BTreeSet};
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::HalsteadMetrics;
6use tokmd_types::{ExportData, FileKind, FileRow};
7
8use tokmd_analysis_util::{AnalysisLimits, normalize_path};
9
10const DEFAULT_MAX_FILE_BYTES: u64 = 128 * 1024;
11
12/// Languages that support Halstead analysis.
13pub fn is_halstead_lang(lang: &str) -> bool {
14    matches!(
15        lang.to_lowercase().as_str(),
16        "rust"
17            | "javascript"
18            | "typescript"
19            | "python"
20            | "go"
21            | "c"
22            | "c++"
23            | "java"
24            | "c#"
25            | "php"
26            | "ruby"
27    )
28}
29
30/// Operators for a given language.
31pub fn operators_for_lang(lang: &str) -> &'static [&'static str] {
32    match lang.to_lowercase().as_str() {
33        "rust" => &[
34            "fn", "let", "mut", "if", "else", "match", "while", "for", "loop", "return", "break",
35            "continue", "struct", "enum", "impl", "trait", "pub", "use", "mod", "async", "await",
36            "move", "ref", "self", "Self", "where", "type", "const", "static", "unsafe", "extern",
37            "crate", "super", "as", "in", "->", "=>", "::", "=", "==", "!=", "<", ">", "<=", ">=",
38            "+", "-", "*", "/", "%", "&", "|", "^", "!", "&&", "||", "<<", ">>", "+=", "-=", "*=",
39            "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "..", "..=", "?",
40        ],
41        "javascript" | "typescript" => &[
42            "function",
43            "var",
44            "let",
45            "const",
46            "if",
47            "else",
48            "switch",
49            "case",
50            "while",
51            "for",
52            "do",
53            "return",
54            "break",
55            "continue",
56            "class",
57            "new",
58            "this",
59            "super",
60            "import",
61            "export",
62            "default",
63            "try",
64            "catch",
65            "finally",
66            "throw",
67            "async",
68            "await",
69            "yield",
70            "typeof",
71            "instanceof",
72            "in",
73            "of",
74            "delete",
75            "void",
76            "=>",
77            "=",
78            "==",
79            "===",
80            "!=",
81            "!==",
82            "<",
83            ">",
84            "<=",
85            ">=",
86            "+",
87            "-",
88            "*",
89            "/",
90            "%",
91            "&",
92            "|",
93            "^",
94            "!",
95            "&&",
96            "||",
97            "~",
98            "<<",
99            ">>",
100            ">>>",
101            "+=",
102            "-=",
103            "*=",
104            "/=",
105            "%=",
106            "&=",
107            "|=",
108            "^=",
109            "<<=",
110            ">>=",
111            ">>>=",
112            "**",
113            "**=",
114            "?.",
115            "??",
116            "??=",
117            "++",
118            "--",
119            "...",
120        ],
121        "python" => &[
122            "def", "class", "if", "elif", "else", "while", "for", "return", "break", "continue",
123            "import", "from", "as", "try", "except", "finally", "raise", "with", "yield", "lambda",
124            "pass", "del", "global", "nonlocal", "assert", "async", "await", "and", "or", "not",
125            "is", "in", "=", "==", "!=", "<", ">", "<=", ">=", "+", "-", "*", "/", "//", "%", "**",
126            "&", "|", "^", "~", "<<", ">>", "+=", "-=", "*=", "/=", "//=", "%=", "**=", "&=", "|=",
127            "^=", "<<=", ">>=", ":=",
128        ],
129        "go" => &[
130            "func",
131            "var",
132            "const",
133            "type",
134            "struct",
135            "interface",
136            "if",
137            "else",
138            "switch",
139            "case",
140            "for",
141            "range",
142            "return",
143            "break",
144            "continue",
145            "goto",
146            "defer",
147            "go",
148            "select",
149            "chan",
150            "map",
151            "package",
152            "import",
153            "fallthrough",
154            ":=",
155            "=",
156            "==",
157            "!=",
158            "<",
159            ">",
160            "<=",
161            ">=",
162            "+",
163            "-",
164            "*",
165            "/",
166            "%",
167            "&",
168            "|",
169            "^",
170            "!",
171            "&&",
172            "||",
173            "<<",
174            ">>",
175            "&^",
176            "+=",
177            "-=",
178            "*=",
179            "/=",
180            "%=",
181            "&=",
182            "|=",
183            "^=",
184            "<<=",
185            ">>=",
186            "&^=",
187            "++",
188            "--",
189            "<-",
190        ],
191        "c" | "c++" | "java" | "c#" | "php" => &[
192            "if",
193            "else",
194            "switch",
195            "case",
196            "while",
197            "for",
198            "do",
199            "return",
200            "break",
201            "continue",
202            "class",
203            "struct",
204            "new",
205            "delete",
206            "try",
207            "catch",
208            "throw",
209            "finally",
210            "void",
211            "static",
212            "const",
213            "public",
214            "private",
215            "protected",
216            "virtual",
217            "override",
218            "abstract",
219            "interface",
220            "extends",
221            "implements",
222            "import",
223            "include",
224            "using",
225            "namespace",
226            "typedef",
227            "template",
228            "=",
229            "==",
230            "!=",
231            "<",
232            ">",
233            "<=",
234            ">=",
235            "+",
236            "-",
237            "*",
238            "/",
239            "%",
240            "&",
241            "|",
242            "^",
243            "!",
244            "&&",
245            "||",
246            "~",
247            "<<",
248            ">>",
249            "+=",
250            "-=",
251            "*=",
252            "/=",
253            "%=",
254            "&=",
255            "|=",
256            "^=",
257            "<<=",
258            ">>=",
259            "++",
260            "--",
261            "->",
262            "::",
263            "sizeof",
264            "typeof",
265        ],
266        "ruby" => &[
267            "def",
268            "class",
269            "module",
270            "if",
271            "elsif",
272            "else",
273            "unless",
274            "while",
275            "until",
276            "for",
277            "do",
278            "return",
279            "break",
280            "next",
281            "begin",
282            "rescue",
283            "ensure",
284            "raise",
285            "yield",
286            "require",
287            "include",
288            "extend",
289            "attr_reader",
290            "attr_writer",
291            "attr_accessor",
292            "self",
293            "super",
294            "end",
295            "and",
296            "or",
297            "not",
298            "=",
299            "==",
300            "!=",
301            "<",
302            ">",
303            "<=",
304            ">=",
305            "+",
306            "-",
307            "*",
308            "/",
309            "%",
310            "&",
311            "|",
312            "^",
313            "!",
314            "&&",
315            "||",
316            "~",
317            "<<",
318            ">>",
319            "+=",
320            "-=",
321            "*=",
322            "/=",
323            "%=",
324            "&=",
325            "|=",
326            "^=",
327            "<<=",
328            ">>=",
329            "**",
330            "**=",
331            "<=>",
332            "=~",
333            "!~",
334            "..",
335            "...",
336        ],
337        _ => &[],
338    }
339}
340
341/// Per-file Halstead token counts.
342pub struct FileTokenCounts {
343    pub operators: BTreeMap<String, usize>,
344    pub operands: BTreeSet<String>,
345    pub total_operators: usize,
346    pub total_operands: usize,
347}
348
349/// Tokenize source code into operators and operands for Halstead analysis.
350pub fn tokenize_for_halstead(text: &str, lang: &str) -> FileTokenCounts {
351    let ops = operators_for_lang(lang);
352    let op_set: BTreeSet<&str> = ops.iter().copied().collect();
353
354    let mut operators: BTreeMap<String, usize> = BTreeMap::new();
355    let mut operands: BTreeSet<String> = BTreeSet::new();
356    let mut total_operators = 0usize;
357    let mut total_operands = 0usize;
358
359    for line in text.lines() {
360        let trimmed = line.trim();
361        // Skip comments and empty lines
362        if trimmed.is_empty()
363            || trimmed.starts_with("//")
364            || trimmed.starts_with('#')
365            || trimmed.starts_with("/*")
366            || trimmed.starts_with('*')
367        {
368            continue;
369        }
370
371        // Extract words and symbols
372        let mut chars = trimmed.chars().peekable();
373        while let Some(&ch) = chars.peek() {
374            if ch.is_whitespace() {
375                chars.next();
376                continue;
377            }
378
379            // Try to match multi-char operators (longest first)
380            if ch.is_ascii_punctuation() && ch != '_' && ch != '"' && ch != '\'' {
381                let remaining: String = chars.clone().take(4).collect();
382                let mut matched = false;
383                for len in (1..=remaining.len().min(4)).rev() {
384                    let candidate = &remaining[..len];
385                    if op_set.contains(candidate) {
386                        *operators.entry(candidate.to_string()).or_insert(0) += 1;
387                        total_operators += 1;
388                        for _ in 0..len {
389                            chars.next();
390                        }
391                        matched = true;
392                        break;
393                    }
394                }
395                if !matched {
396                    chars.next();
397                }
398                continue;
399            }
400
401            // Identifiers/keywords
402            if ch.is_alphanumeric() || ch == '_' {
403                let mut word = String::new();
404                while let Some(&c) = chars.peek() {
405                    if c.is_alphanumeric() || c == '_' {
406                        word.push(c);
407                        chars.next();
408                    } else {
409                        break;
410                    }
411                }
412                if op_set.contains(word.as_str()) {
413                    *operators.entry(word).or_insert(0) += 1;
414                    total_operators += 1;
415                } else {
416                    operands.insert(word);
417                    total_operands += 1;
418                }
419                continue;
420            }
421
422            // String literals - skip
423            if ch == '"' || ch == '\'' {
424                chars.next();
425                let quote = ch;
426                while let Some(&c) = chars.peek() {
427                    chars.next();
428                    if c == '\\' {
429                        chars.next(); // skip escaped char
430                    } else if c == quote {
431                        break;
432                    }
433                }
434                total_operands += 1; // count the string literal as one operand
435                operands.insert("<string>".to_string());
436                continue;
437            }
438
439            chars.next();
440        }
441    }
442
443    FileTokenCounts {
444        operators,
445        operands,
446        total_operators,
447        total_operands,
448    }
449}
450
451/// Build aggregated Halstead metrics from source files.
452pub fn build_halstead_report(
453    root: &Path,
454    files: &[PathBuf],
455    export: &ExportData,
456    limits: &AnalysisLimits,
457) -> Result<HalsteadMetrics> {
458    let mut row_map: BTreeMap<String, &FileRow> = BTreeMap::new();
459    for row in export.rows.iter().filter(|r| r.kind == FileKind::Parent) {
460        row_map.insert(normalize_path(&row.path, root), row);
461    }
462
463    let mut all_operators: BTreeMap<String, usize> = BTreeMap::new();
464    let mut all_operands: BTreeSet<String> = BTreeSet::new();
465    let mut total_ops = 0usize;
466    let mut total_opds = 0usize;
467    let mut total_bytes = 0u64;
468    let max_total = limits.max_bytes;
469    let per_file_limit = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_FILE_BYTES) as usize;
470
471    for rel in files {
472        if max_total.is_some_and(|limit| total_bytes >= limit) {
473            break;
474        }
475        let rel_str = rel.to_string_lossy().replace('\\', "/");
476        let row = match row_map.get(&rel_str) {
477            Some(r) => *r,
478            None => continue,
479        };
480        if !is_halstead_lang(&row.lang) {
481            continue;
482        }
483
484        let path = root.join(rel);
485        let bytes = match tokmd_content::read_head(&path, per_file_limit) {
486            Ok(b) => b,
487            Err(_) => continue,
488        };
489        total_bytes += bytes.len() as u64;
490
491        if !tokmd_content::is_text_like(&bytes) {
492            continue;
493        }
494
495        let text = String::from_utf8_lossy(&bytes);
496        let lang_lower = row.lang.to_lowercase();
497        let counts = tokenize_for_halstead(&text, &lang_lower);
498
499        for (op, count) in counts.operators {
500            *all_operators.entry(op).or_insert(0) += count;
501        }
502        all_operands.extend(counts.operands);
503        total_ops += counts.total_operators;
504        total_opds += counts.total_operands;
505    }
506
507    let n1 = all_operators.len(); // distinct operators
508    let n2 = all_operands.len(); // distinct operands
509    let vocabulary = n1 + n2;
510    let length = total_ops + total_opds;
511
512    let volume = if vocabulary > 0 {
513        length as f64 * (vocabulary as f64).log2()
514    } else {
515        0.0
516    };
517
518    let difficulty = if n2 > 0 {
519        (n1 as f64 / 2.0) * (total_opds as f64 / n2 as f64)
520    } else {
521        0.0
522    };
523
524    let effort = difficulty * volume;
525    let time_seconds = effort / 18.0;
526    let estimated_bugs = volume / 3000.0;
527
528    Ok(HalsteadMetrics {
529        distinct_operators: n1,
530        distinct_operands: n2,
531        total_operators: total_ops,
532        total_operands: total_opds,
533        vocabulary,
534        length,
535        volume: round_f64(volume, 2),
536        difficulty: round_f64(difficulty, 2),
537        effort: round_f64(effort, 2),
538        time_seconds: round_f64(time_seconds, 2),
539        estimated_bugs: round_f64(estimated_bugs, 4),
540    })
541}
542
543/// Round an f64 to a given number of decimal places.
544pub fn round_f64(val: f64, decimals: u32) -> f64 {
545    let factor = 10f64.powi(decimals as i32);
546    (val * factor).round() / factor
547}
548
549#[cfg(test)]
550mod tests {
551    use super::*;
552
553    #[test]
554    fn test_tokenize_rust() {
555        let code = r#"
556fn add(a: i32, b: i32) -> i32 {
557    if a > b {
558        a + b
559    } else {
560        a - b
561    }
562}
563"#;
564        let counts = tokenize_for_halstead(code, "rust");
565        assert!(counts.total_operators > 0);
566        assert!(counts.total_operands > 0);
567        assert!(!counts.operators.is_empty());
568        assert!(!counts.operands.is_empty());
569    }
570
571    #[test]
572    fn test_tokenize_python() {
573        let code = r#"
574def add(a, b):
575    if a > b:
576        return a + b
577    else:
578        return a - b
579"#;
580        let counts = tokenize_for_halstead(code, "python");
581        assert!(counts.total_operators > 0);
582        assert!(counts.total_operands > 0);
583    }
584
585    #[test]
586    fn test_halstead_computation() {
587        // Known values: 2 distinct operators, 3 distinct operands
588        // n1=2, n2=3, N1=4, N2=6
589        // vocabulary = 5, length = 10
590        // volume = 10 * log2(5) ≈ 23.22
591        // difficulty = (2/2) * (6/3) = 2.0
592        // effort = 2.0 * 23.22 ≈ 46.44
593        let n1 = 2;
594        let n2 = 3;
595        let total_ops = 4;
596        let total_opds = 6;
597        let vocabulary = n1 + n2;
598        let length = total_ops + total_opds;
599        let volume = length as f64 * (vocabulary as f64).log2();
600        let difficulty = (n1 as f64 / 2.0) * (total_opds as f64 / n2 as f64);
601        let effort = difficulty * volume;
602
603        assert!((volume - 23.22).abs() < 0.1);
604        assert!((difficulty - 2.0).abs() < 0.01);
605        assert!((effort - 46.44).abs() < 0.2);
606    }
607
608    #[test]
609    fn test_empty_input() {
610        let counts = tokenize_for_halstead("", "rust");
611        assert_eq!(counts.total_operators, 0);
612        assert_eq!(counts.total_operands, 0);
613    }
614}