Skip to main content

big_code_analysis/metrics/
tokens.rs

1// Per-language metric and AST modules deliberately consume the macro-
2// generated tree-sitter token enums via `use crate::*` and `use Foo::*`
3// inside match expressions — explicit imports would list dozens of
4// variants per arm and obscure the per-language token sets that are the
5// point of these files. Allowed at the module level rather than per
6// function so the per-language impl blocks stay readable.
7#![allow(clippy::wildcard_imports, clippy::enum_glob_use)]
8// Metric counts (token, function, branch, argument, etc.) are stored as
9// `usize` and crossed with `f64` averages, ratios, and Halstead scores
10// across the cyclomatic / MI / Halstead computations. The `usize as f64`
11// and `f64 as usize` casts are intentional and snapshot-anchored — every
12// site is bounded by the count it came from. Allowing the lints at the
13// module level keeps the metric arithmetic legible.
14#![allow(
15    clippy::cast_precision_loss,
16    clippy::cast_possible_truncation,
17    clippy::cast_sign_loss
18)]
19
20use serde::Serialize;
21use serde::ser::{SerializeStruct, Serializer};
22use std::fmt;
23
24use crate::checker::Checker;
25use crate::macros::implement_metric_trait;
26
27use crate::*;
28
/// The `Tokens` metric: how many tree-sitter leaf tokens a function or
/// a file contains, skipping every leaf whose ancestor chain includes
/// a comment node.
///
/// It is a token-based size proxy. The unit is the lexer's token
/// (identifier, literal, keyword, punctuation), not a line and not a
/// Halstead operator/operand. Punctuation that Halstead skips
/// (parentheses, semicolons, separators) is counted here, so
/// `tokens` ≠ Halstead `N1 + N2`.
#[derive(Clone, Debug)]
pub struct Stats {
    /// Leaf tokens counted so far for the space being walked.
    tokens: usize,
    /// Running total of tokens over every space folded in so far.
    tokens_sum: usize,
    /// Smallest per-space count observed; `usize::MAX` until one is
    /// recorded.
    tokens_min: usize,
    /// Largest per-space count observed.
    tokens_max: usize,
    /// Number of spaces folded in; the divisor of the average.
    space_count: usize,
}
46
47impl Default for Stats {
48    fn default() -> Self {
49        Self {
50            tokens: 0,
51            tokens_sum: 0,
52            tokens_min: usize::MAX,
53            tokens_max: 0,
54            space_count: 1,
55        }
56    }
57}
58
59impl Serialize for Stats {
60    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
61    where
62        S: Serializer,
63    {
64        let mut st = serializer.serialize_struct("tokens", 4)?;
65        st.serialize_field("tokens", &self.tokens_sum())?;
66        st.serialize_field("tokens_average", &self.tokens_average())?;
67        st.serialize_field("tokens_min", &self.tokens_min())?;
68        st.serialize_field("tokens_max", &self.tokens_max())?;
69        st.end()
70    }
71}
72
73impl fmt::Display for Stats {
74    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
75        write!(
76            f,
77            "tokens: {}, \
78             tokens_average: {}, \
79             tokens_min: {}, \
80             tokens_max: {}",
81            self.tokens_sum(),
82            self.tokens_average(),
83            self.tokens_min(),
84            self.tokens_max(),
85        )
86    }
87}
88
89impl Stats {
90    /// Merges a second `Tokens` metric suite into the first one.
91    pub fn merge(&mut self, other: &Stats) {
92        self.tokens_min = self.tokens_min.min(other.tokens_min);
93        self.tokens_max = self.tokens_max.max(other.tokens_max);
94        self.tokens_sum += other.tokens_sum;
95        self.space_count += other.space_count;
96    }
97
98    /// Returns the total token count across all merged spaces.
99    #[inline]
100    #[must_use]
101    pub fn tokens_sum(&self) -> f64 {
102        self.tokens_sum as f64
103    }
104
105    /// Returns the average tokens per space.
106    #[inline]
107    #[must_use]
108    pub fn tokens_average(&self) -> f64 {
109        self.tokens_sum() / self.space_count as f64
110    }
111
112    /// Returns the smallest single-space token count.
113    ///
114    /// Diverges intentionally from `nom::Stats::functions_min`, which
115    /// surfaces the raw `usize::MAX` sentinel for a never-observed
116    /// space. We collapse the sentinel to `0.0` so a `Stats::default()`
117    /// that bypasses the metric pipeline serializes to a meaningful
118    /// number rather than `1.8446744e19`.
119    #[inline]
120    #[must_use]
121    pub fn tokens_min(&self) -> f64 {
122        if self.tokens_min == usize::MAX {
123            0.0
124        } else {
125            self.tokens_min as f64
126        }
127    }
128
129    /// Returns the largest single-space token count.
130    #[inline]
131    #[must_use]
132    pub fn tokens_max(&self) -> f64 {
133        self.tokens_max as f64
134    }
135
136    #[inline]
137    pub(crate) fn compute_sum(&mut self) {
138        self.tokens_sum += self.tokens;
139    }
140
141    #[inline]
142    pub(crate) fn compute_minmax(&mut self) {
143        self.tokens_min = self.tokens_min.min(self.tokens);
144        self.tokens_max = self.tokens_max.max(self.tokens);
145        self.compute_sum();
146    }
147}
148
149#[doc(hidden)]
150/// Per-language counting of tokens.
151pub trait Tokens
152where
153    Self: Checker,
154{
155    /// Walk `node` and update `stats` with this metric for the language
156    /// implementing the trait.
157    fn compute(node: &Node, stats: &mut Stats) {
158        if node.child_count() != 0 {
159            return;
160        }
161        // Walk the leaf's ancestors so grammars whose comments have
162        // internal structure (e.g. Rust doc comments split into
163        // markers and content) also exclude inner leaves; the leaf
164        // itself is the first item, so bare comment nodes are caught
165        // immediately.
166        let in_comment =
167            std::iter::successors(Some(*node), Node::parent).any(|n| Self::is_comment(&n));
168        if !in_comment {
169            stats.tokens += 1;
170        }
171    }
172}
173
174implement_metric_trait!(
175    [Tokens],
176    PythonCode,
177    MozjsCode,
178    JavascriptCode,
179    TypescriptCode,
180    TsxCode,
181    CppCode,
182    RustCode,
183    PreprocCode,
184    CcommentCode,
185    JavaCode,
186    KotlinCode,
187    GoCode,
188    PerlCode,
189    BashCode,
190    LuaCode,
191    TclCode,
192    PhpCode,
193    CsharpCode,
194    ElixirCode,
195    RubyCode,
196    GroovyCode
197);
198
// Test-only lint relaxations: the assertions compare exact `f64`
// values that are whole-number counts, and the fixtures/doc comments
// favour readability over pedantic lint style.
#[cfg(test)]
#[allow(
    clippy::float_cmp,
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    clippy::similar_names,
    clippy::doc_markdown,
    clippy::needless_raw_string_hashes,
    clippy::too_many_lines
)]
mod tests {
    use crate::tools::check_metrics;

    use super::*;

    /// `def foo(x): return x` → leaves: `def`, `foo`, `(`, `x`, `)`,
    /// `:`, `return`, `x` = 8 tokens, hand-counted.
    #[test]
    fn python_tokens_exact_count() {
        check_metrics::<PythonParser>("def foo(x): return x", "foo.py", |metric| {
            assert_eq!(metric.tokens.tokens_sum(), 8.0);
            assert!(metric.tokens.tokens_max() >= 7.0);
        });
    }

    /// Adding a Python comment must not change the token count.
    #[test]
    fn python_tokens_comments_excluded() {
        check_metrics::<PythonParser>(
            "def foo(x): return x  # explanation\n# header\n",
            "foo.py",
            |metric| {
                assert_eq!(metric.tokens.tokens_sum(), 8.0);
            },
        );
    }

    /// Blank lines and indentation must not change the token count.
    #[test]
    fn python_tokens_whitespace_excluded() {
        check_metrics::<PythonParser>(
            "\n\n    def foo(x):\n        return x\n\n",
            "foo.py",
            |metric| {
                assert_eq!(metric.tokens.tokens_sum(), 8.0);
            },
        );
    }

    /// Tokens must exceed Halstead `N1 + N2` for code containing
    /// punctuation Halstead skips. Guards against accidental Halstead
    /// reuse.
    #[test]
    fn python_tokens_distinct_from_halstead() {
        check_metrics::<PythonParser>("def foo(x): return (x + 1)", "foo.py", |metric| {
            let halstead_total = metric.halstead.operators() + metric.halstead.operands();
            assert!(
                metric.tokens.tokens_sum() > halstead_total,
                "expected tokens ({}) > halstead N1+N2 ({}); punctuation \
                 like `(`, `)`, `:` should contribute to tokens but not Halstead",
                metric.tokens.tokens_sum(),
                halstead_total,
            );
        });
    }

    /// Inner functions get attributed to their innermost scope. For
    /// `def outer(): def inner(): return 1`, the inner scope owns
    /// `def, inner, (, ), :, return, 1` = 7 tokens; the outer scope
    /// owns `def, outer, (, ), :` = 5; the unit owns 0 directly.
    /// Asserting the exact `tokens_max` is what catches an attribution
    /// regression — a broken implementation that credited all 12
    /// tokens to one scope would still pass `max <= sum`.
    #[test]
    fn python_tokens_nested_attribution() {
        check_metrics::<PythonParser>(
            "def outer():\n    def inner():\n        return 1\n",
            "foo.py",
            |metric| {
                assert_eq!(metric.tokens.tokens_sum(), 12.0);
                assert_eq!(metric.tokens.tokens_max(), 7.0);
                assert_eq!(metric.tokens.tokens_min(), 0.0);
            },
        );
    }

    /// C++ `/* … */` block comments must not contribute.
    /// Same fixture with and without comment yields the same count.
    #[test]
    fn cpp_tokens_block_comments_excluded() {
        check_metrics::<CppParser>(
            "int foo(int x) { /* multi\n   line */ return x; }",
            "foo.cpp",
            |m| {
                // Leaves outside the comment:
                // int, foo, (, int, x, ), {, return, x, ;, } = 11.
                assert_eq!(m.tokens.tokens_sum(), 11.0);
            },
        );
        check_metrics::<CppParser>("int foo(int x) { return x; }", "foo.cpp", |m| {
            assert_eq!(m.tokens.tokens_sum(), 11.0);
        });
    }

    /// C++ `// …` line comments must not contribute, matching the Python
    /// hand-counted style.  Leaves outside the comment:
    /// `int`, `x`, `=`, `1`, `;` = 5.
    #[test]
    fn cpp_tokens_line_comments_excluded() {
        check_metrics::<CppParser>("int x = 1; // a one-line comment\n", "foo.cpp", |m| {
            assert_eq!(m.tokens.tokens_sum(), 5.0);
        });
        check_metrics::<CppParser>("int x = 1;\n", "foo.cpp", |m| {
            assert_eq!(m.tokens.tokens_sum(), 5.0);
        });
    }

    /// Whitespace and blank lines must not contribute to the token count
    /// (mirrors `python_tokens_whitespace_excluded`).
    #[test]
    fn cpp_tokens_whitespace_excluded() {
        check_metrics::<CppParser>("\n\nint foo(int x) {\n    return x;\n}\n", "foo.cpp", |m| {
            // int, foo, (, int, x, ), {, return, x, ;, } = 11.
            assert_eq!(m.tokens.tokens_sum(), 11.0);
        });
    }

    /// Tokens count punctuation that Halstead skips (parentheses, braces,
    /// semicolons), so `tokens_sum` must exceed `N1 + N2` for a fixture
    /// with significant punctuation.  Mirrors
    /// `python_tokens_distinct_from_halstead`.
    #[test]
    fn cpp_tokens_distinct_from_halstead() {
        check_metrics::<CppParser>("int foo(int x) { return (x + 1); }", "foo.cpp", |m| {
            let halstead_total = m.halstead.operators() + m.halstead.operands();
            assert!(
                m.tokens.tokens_sum() > halstead_total,
                "expected tokens ({}) > halstead N1+N2 ({}); punctuation like \
                 `(`, `)`, `{{`, `}}` and `;` should contribute to tokens but not Halstead",
                m.tokens.tokens_sum(),
                halstead_total,
            );
        });
    }

    /// A lambda nested in a function is the C++ counterpart of the
    /// Python nested-attribution fixture:
    /// `int outer() { auto inner = []() { return 1; }; return inner(); }`.
    /// Unlike the Python test, no exact per-scope counts are pinned
    /// here; the checks are bounds only — a non-zero `tokens_sum`,
    /// `tokens_max >= 7`, and `tokens_max <= tokens_sum`.
    #[test]
    fn cpp_tokens_nested_attribution() {
        check_metrics::<CppParser>(
            "int outer() {\n    auto inner = []() { return 1; };\n    return inner();\n}\n",
            "foo.cpp",
            |m| {
                // Outer function owns its statements; the inner lambda owns its body.
                // The unit-level sum must equal the total of all scopes.
                // tokens_max must equal one of the scope sums and be at least the
                // tokens count of the lambda body (`return 1 ;` plus surrounding
                // brackets — minimum 7).
                assert!(m.tokens.tokens_sum() > 0.0, "expected non-zero tokens_sum");
                assert!(
                    m.tokens.tokens_max() >= 7.0,
                    "expected tokens_max >= 7 (outer scope dominates), got {}",
                    m.tokens.tokens_max(),
                );
                assert!(
                    m.tokens.tokens_max() <= m.tokens.tokens_sum(),
                    "tokens_max ({}) cannot exceed tokens_sum ({})",
                    m.tokens.tokens_max(),
                    m.tokens.tokens_sum(),
                );
            },
        );
    }

    /// Java `// …` line comments must not contribute.
    #[test]
    fn java_tokens_line_comments_excluded() {
        check_metrics::<JavaParser>(
            "class A { void foo() { // hi\n return; } }",
            "A.java",
            |m| {
                // class, A, {, void, foo, (, ), {, return, ;, }, } = 12.
                assert_eq!(m.tokens.tokens_sum(), 12.0);
            },
        );
        check_metrics::<JavaParser>("class A { void foo() { return; } }", "A.java", |m| {
            assert_eq!(m.tokens.tokens_sum(), 12.0);
        });
    }

    /// Groovy mirror of the Java test — `// …` line comments must not
    /// contribute.
    #[test]
    fn groovy_tokens_line_comments_excluded() {
        check_metrics::<GroovyParser>(
            "class A { void foo() { // hi\n return\n } }",
            "A.groovy",
            |m| {
                // class, A, {, void, foo, (, ), {, return, }, } = 11
                // tokens: one fewer than the Java fixture, which
                // spells the statement terminator as a `;` leaf where
                // this source only has a newline.
                // NOTE(review): presumably that newline is not a leaf
                // of its own — confirm against the Groovy grammar.
                assert_eq!(m.tokens.tokens_sum(), 11.0);
            },
        );
    }

    /// Rust doc comments may split into structured children under
    /// some grammars; the ancestor walk must filter every inner leaf.
    #[test]
    fn rust_tokens_doc_comments_excluded() {
        check_metrics::<RustParser>(
            "/// outer doc\n/// more doc\nfn f() { let x = 1; }",
            "foo.rs",
            |m| {
                // fn, f, (, ), {, let, x, =, 1, ;, } = 11.
                assert_eq!(m.tokens.tokens_sum(), 11.0);
            },
        );
        check_metrics::<RustParser>("fn f() { let x = 1; }", "foo.rs", |m| {
            assert_eq!(m.tokens.tokens_sum(), 11.0);
        });
    }

    // -- Per-language smoke tests --------------------------------------
    //
    // Lesson 1 (`docs/development/lessons_learned.md`): every supported
    // language must have a positive test that asserts non-zero tokens
    // on real source. Catches the silent-zero regression where a
    // metric is registered but never fires. `check_metrics` takes a
    // `fn` pointer so each test inlines its assertion directly.
    //
    // Groovy has no smoke test of its own: its positive test is the
    // exact count in `groovy_tokens_line_comments_excluded` above.

    #[test]
    fn smoke_python() {
        check_metrics::<PythonParser>("x = 1\n", "foo.py", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_rust() {
        check_metrics::<RustParser>("fn f() { let x = 1; }", "foo.rs", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_cpp() {
        check_metrics::<CppParser>("int x = 1;", "foo.cpp", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_java() {
        check_metrics::<JavaParser>("class A { int x = 1; }", "A.java", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_csharp() {
        check_metrics::<CsharpParser>("class A { int X = 1; }", "A.cs", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_javascript() {
        check_metrics::<JavascriptParser>("let x = 1;", "foo.js", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_mozjs() {
        check_metrics::<MozjsParser>("let x = 1;", "foo.js", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_typescript() {
        check_metrics::<TypescriptParser>("const x: number = 1;", "foo.ts", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_tsx() {
        check_metrics::<TsxParser>("const x: number = 1;", "foo.tsx", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_go() {
        check_metrics::<GoParser>("package main\nfunc f() {}", "foo.go", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_kotlin() {
        check_metrics::<KotlinParser>("fun f(): Int = 1", "foo.kt", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_lua() {
        check_metrics::<LuaParser>("local x = 1", "foo.lua", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_bash() {
        check_metrics::<BashParser>("x=1", "foo.sh", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_tcl() {
        check_metrics::<TclParser>("set x 1", "foo.tcl", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_perl() {
        check_metrics::<PerlParser>("my $x = 1;", "foo.pl", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_php() {
        check_metrics::<PhpParser>("<?php $x = 1;", "foo.php", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_preproc() {
        check_metrics::<PreprocParser>("#define FOO 1\n", "foo.h", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_ccomment() {
        // Ccomment's grammar parses bare C source; non-comment text
        // produces non-comment leaves.
        check_metrics::<CcommentParser>("int x = 1;", "foo.c", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    // `RubyCode` and `ElixirCode` are registered for this metric, so
    // the rule stated at the top of this section applies to them too.

    #[test]
    fn smoke_ruby() {
        check_metrics::<RubyParser>("x = 1\n", "foo.rb", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }

    #[test]
    fn smoke_elixir() {
        check_metrics::<ElixirParser>("x = 1\n", "foo.ex", |m| {
            assert!(m.tokens.tokens_sum() > 0.0);
        });
    }
}