Skip to main content

alint_rules/
commented_out_code.rs

1//! `commented_out_code` — heuristic detector for blocks of
2//! commented-out source code (as opposed to prose comments,
3//! license headers, doc comments, or ASCII banners).
4//!
5//! Targets the "agent left dead code behind" pattern: agents
6//! tend to comment-rather-than-delete during iteration, and
7//! the leftovers accumulate. Existing primitives can ban
8//! specific phrasings but can't catch the generic
9//! "block-of-code-shaped-comments" pattern.
10//!
11//! Design doc: `docs/design/v0.7/commented_out_code.md`.
12//!
13//! ## Heuristic
14//!
15//! For each consecutive run of comment lines (≥ `min_lines`),
16//! count the fraction of non-whitespace characters that are
17//! **structural punctuation strongly biased toward code**:
18//!
19//! ```text
20//!   strong_chars = ( ) { } [ ] ; = < > & | ^
21//!   raw_density  = count(strong_chars) / non-whitespace-char-count
22//! ```
23//!
24//! Backticks and quotes are deliberately excluded — backticks
25//! show up constantly in rustdoc / `TSDoc` prose to delimit code
26//! references (`` `foo` matches `bar` ``), and double quotes
27//! appear in normal English. Including either inflates the
28//! score on legitimate prose comments.
29//!
30//! Then normalise so the user-facing `threshold` field has a
31//! useful midpoint at `0.5`:
32//!
33//! ```text
34//!   density = min(raw_density / 0.20, 1.0)
35//! ```
36//!
37//! At `raw_density = 0.20` (i.e. one-fifth of non-whitespace
38//! chars are strong-code chars), the normalised density is
39//! `1.0`. Real code blocks comfortably exceed this; English
40//! prose is well below it because everyday writing rarely
41//! uses brackets, semicolons, or assignment operators.
42//!
43//! Density ≥ `threshold` (default 0.5) marks the block as
44//! code-shaped. Doc-comment markers (`///`, `/** */`) and
45//! the file's first `skip_leading_lines` lines (license
46//! headers) are excluded by construction.
47//!
48//! The score deliberately does NOT use identifier-token
49//! density: English prose is dominated by 3+-letter words
50//! that look identifier-shaped, so identifier counts can't
51//! discriminate code from explanation. Punctuation can.
52
53use std::path::Path;
54
55use alint_core::{
56    Context, Error, Level, PerFileRule, Result, Rule, RuleSpec, Scope, ScopeFilter, Violation,
57};
58use serde::Deserialize;
59
/// User-facing options for the `commented_out_code` rule, deserialized
/// from the rule spec by `build`. Every field carries a serde default, so
/// an empty options table is valid.
60#[derive(Debug, Deserialize)]
61struct Options {
62    /// `auto` (default) infers the comment-marker set from
63    /// each file's extension. Explicit override useful for
64    /// embedded DSLs or cases where the extension lies.
65    #[serde(default)]
66    language: Language,
67    /// Minimum consecutive comment-line count for a block to
68    /// be considered. 1-2 line comments are almost always
69    /// prose; 3+ starts looking like dead code. Default 3.
    /// `build` rejects values below 2.
70    #[serde(default = "default_min_lines")]
71    min_lines: usize,
72    /// Floor for the normalised punctuation density returned by
73    /// `score_density` (0.0-1.0). Higher = stricter (only the most
    /// code-shaped blocks fire). Default 0.5. `build` rejects values
    /// outside the 0.0-1.0 range.
74    #[serde(default = "default_threshold")]
75    threshold: f64,
76    /// Ignore comment blocks that START within the first N lines of
77    /// a file. Defaults to 30 to pass over license headers without
78    /// false-positive flagging them as commented-out code.
79    #[serde(default = "default_skip_leading_lines")]
80    skip_leading_lines: usize,
81}
82
/// Serde default for `Options::min_lines`: blocks shorter than three
/// comment lines are not considered.
fn default_min_lines() -> usize {
    const MIN_LINES: usize = 3;
    MIN_LINES
}

/// Serde default for `Options::threshold`: the midpoint of the normalised
/// density range.
fn default_threshold() -> f64 {
    const THRESHOLD: f64 = 0.5;
    THRESHOLD
}

/// Serde default for `Options::skip_leading_lines`: leaves room for a
/// license header at the top of the file.
fn default_skip_leading_lines() -> usize {
    const SKIP_LEADING_LINES: usize = 30;
    SKIP_LEADING_LINES
}
92
/// Comment-syntax family used to choose line and block comment markers.
/// Deserialized from snake_case strings (`auto`, `rust`, `typescript`, …).
93#[derive(Debug, Deserialize, Default, Clone, Copy, PartialEq, Eq)]
94#[serde(rename_all = "snake_case")]
95enum Language {
    /// Infer the language from the file extension (see `Language::resolve`).
    /// Also the value `resolve` returns for unknown extensions.
96    #[default]
97    Auto,
98    Rust,
99    Typescript,
100    Javascript,
101    Python,
102    Go,
    /// `resolve` also maps `.kt`, `.kts` and `.scala` files here.
103    Java,
104    C,
105    Cpp,
106    Ruby,
    /// Covers `.sh`, `.bash`, `.zsh` and `.fish` in `resolve`.
107    Shell,
108}
109
110impl Language {
111    /// Resolve a language to its concrete value (never `Auto`)
112    /// based on a file extension.
113    fn resolve(self, path: &Path) -> Self {
114        if self != Self::Auto {
115            return self;
116        }
117        let ext = path
118            .extension()
119            .and_then(|s| s.to_str())
120            .unwrap_or("")
121            .to_ascii_lowercase();
122        match ext.as_str() {
123            "rs" => Self::Rust,
124            "ts" | "tsx" => Self::Typescript,
125            "js" | "jsx" | "mjs" | "cjs" => Self::Javascript,
126            "py" => Self::Python,
127            "go" => Self::Go,
128            "java" | "kt" | "kts" | "scala" => Self::Java,
129            "c" | "h" => Self::C,
130            "cc" | "cpp" | "cxx" | "hpp" | "hh" => Self::Cpp,
131            "rb" => Self::Ruby,
132            "sh" | "bash" | "zsh" | "fish" => Self::Shell,
133            _ => Self::Auto, // unknown — caller skips
134        }
135    }
136
137    /// The set of line-comment markers for this language.
138    /// Returned in priority order; the longest-match wins.
139    fn line_markers(self) -> &'static [&'static str] {
140        match self {
141            // Doc-comment markers (`///`, `//!`) are ALSO line comments — we
142            // identify them separately below to skip rather than score.
143            Self::Rust
144            | Self::Typescript
145            | Self::Javascript
146            | Self::Go
147            | Self::Java
148            | Self::C
149            | Self::Cpp => &["//"],
150            Self::Python | Self::Shell | Self::Ruby => &["#"],
151            Self::Auto => &[],
152        }
153    }
154
155    /// Inner-line markers that indicate a DOC comment, not a
156    /// regular line comment. Blocks made entirely of these
157    /// are excluded from density scoring.
158    fn doc_line_markers(self) -> &'static [&'static str] {
159        // `TSDoc` / JSDoc / Javadoc live in `/** */` block comments,
160        // not line comments — they fall through to the empty default.
161        match self {
162            Self::Rust => &["///", "//!"],
163            _ => &[],
164        }
165    }
166
167    /// Block-comment delimiters: (open, close).
168    fn block_delim(self) -> Option<(&'static str, &'static str)> {
169        match self {
170            Self::Rust
171            | Self::Typescript
172            | Self::Javascript
173            | Self::Go
174            | Self::Java
175            | Self::C
176            | Self::Cpp => Some(("/*", "*/")),
177            _ => None,
178        }
179    }
180
181    /// Block-comment delimiters that mark a DOC block (Javadoc
182    /// / `TSDoc` / rustdoc inner block). Skipped, not scored.
183    fn doc_block_delim(self) -> Option<(&'static str, &'static str)> {
184        match self {
185            // /** … */ is Javadoc / `TSDoc` / rustdoc-inner.
186            Self::Rust | Self::Typescript | Self::Javascript | Self::Java | Self::Cpp => {
187                Some(("/**", "*/"))
188            }
189            _ => None,
190        }
191    }
192}
193
/// A configured instance of the `commented_out_code` rule, as produced by
/// `build` from a rule spec plus its deserialized `Options`.
194#[derive(Debug)]
195pub struct CommentedOutCodeRule {
    /// Rule identifier, copied from the spec.
196    id: String,
    /// Severity level, copied from the spec.
197    level: Level,
    /// Optional policy link, copied from the spec.
198    policy_url: Option<String>,
    /// Optional custom violation text; when absent a message with the block
    /// length and density is generated.
199    message: Option<String>,
    /// Path scope built from the spec's `paths` field.
200    scope: Scope,
    /// Optional extra filter parsed from the spec.
201    scope_filter: Option<ScopeFilter>,
    /// Configured language; `Auto` is resolved per file from its extension.
202    language: Language,
    /// Minimum number of lines a comment block needs to be scored.
203    min_lines: usize,
    /// Normalised density at or above which a block is reported.
204    threshold: f64,
    /// Blocks starting on or before this (1-based) line are ignored.
205    skip_leading_lines: usize,
206}
207
208impl Rule for CommentedOutCodeRule {
209    fn id(&self) -> &str {
210        &self.id
211    }
212    fn level(&self) -> Level {
213        self.level
214    }
215    fn policy_url(&self) -> Option<&str> {
216        self.policy_url.as_deref()
217    }
218    fn path_scope(&self) -> Option<&Scope> {
219        Some(&self.scope)
220    }
221
222    fn evaluate(&self, ctx: &Context<'_>) -> Result<Vec<Violation>> {
223        let mut violations = Vec::new();
224        for entry in ctx.index.files() {
225            if !self.scope.matches(&entry.path) {
226                continue;
227            }
228            if let Some(filter) = &self.scope_filter
229                && !filter.matches(&entry.path, ctx.index)
230            {
231                continue;
232            }
233            let full = ctx.root.join(&entry.path);
234            let Ok(bytes) = std::fs::read(&full) else {
235                continue;
236            };
237            violations.extend(self.evaluate_file(ctx, &entry.path, &bytes)?);
238        }
239        Ok(violations)
240    }
241
242    fn as_per_file(&self) -> Option<&dyn PerFileRule> {
243        Some(self)
244    }
245
246    fn scope_filter(&self) -> Option<&ScopeFilter> {
247        self.scope_filter.as_ref()
248    }
249}
250
251impl PerFileRule for CommentedOutCodeRule {
252    fn path_scope(&self) -> &Scope {
253        &self.scope
254    }
255
256    fn evaluate_file(
257        &self,
258        _ctx: &Context<'_>,
259        path: &Path,
260        bytes: &[u8],
261    ) -> Result<Vec<Violation>> {
262        let lang = self.language.resolve(path);
263        if lang == Language::Auto {
264            return Ok(Vec::new()); // unknown extension — skip silently
265        }
266        let Ok(text) = std::str::from_utf8(bytes) else {
267            return Ok(Vec::new());
268        };
269        let mut violations = Vec::new();
270        for block in find_comment_blocks(text, lang) {
271            if block.lines.len() < self.min_lines {
272                continue;
273            }
274            if block.start_line <= self.skip_leading_lines {
275                continue;
276            }
277            if block.is_doc_comment {
278                continue;
279            }
280            let density = score_density(&block.content);
281            if density >= self.threshold {
282                let msg = self.message.clone().unwrap_or_else(|| {
283                    format!(
284                        "block of {} commented-out lines (density {:.2}); remove or convert to runtime-checked branch",
285                        block.lines.len(),
286                        density,
287                    )
288                });
289                violations.push(
290                    Violation::new(msg)
291                        .with_path(std::sync::Arc::<Path>::from(path))
292                        .with_location(block.start_line, 1),
293                );
294            }
295        }
296        Ok(violations)
297    }
298}
299
300pub fn build(spec: &RuleSpec) -> Result<Box<dyn Rule>> {
301    let Some(paths) = &spec.paths else {
302        return Err(Error::rule_config(
303            &spec.id,
304            "commented_out_code requires a `paths` field",
305        ));
306    };
307    let opts: Options = spec
308        .deserialize_options()
309        .map_err(|e| Error::rule_config(&spec.id, format!("invalid options: {e}")))?;
310    if opts.min_lines < 2 {
311        return Err(Error::rule_config(
312            &spec.id,
313            "commented_out_code `min_lines` must be ≥ 2",
314        ));
315    }
316    if !(0.0..=1.0).contains(&opts.threshold) {
317        return Err(Error::rule_config(
318            &spec.id,
319            "commented_out_code `threshold` must be between 0.0 and 1.0",
320        ));
321    }
322    Ok(Box::new(CommentedOutCodeRule {
323        id: spec.id.clone(),
324        level: spec.level,
325        policy_url: spec.policy_url.clone(),
326        message: spec.message.clone(),
327        scope: Scope::from_paths_spec(paths)?,
328        scope_filter: spec.parse_scope_filter()?,
329        language: opts.language,
330        min_lines: opts.min_lines,
331        threshold: opts.threshold,
332        skip_leading_lines: opts.skip_leading_lines,
333    }))
334}
335
336// ─── block detection ───────────────────────────────────────────
337
/// One comment block found by `find_comment_blocks`: either a `/* … */`
/// block comment or a run of consecutive line-comment lines.
338#[derive(Debug)]
339struct CommentBlock {
    /// 1-based line number of the block's first line.
340    start_line: usize,
    /// The raw source lines of the block, comment markers included.
341    lines: Vec<String>,
342    /// Concatenated comment content with markers stripped.
343    /// This is what the density scorer sees.
344    content: String,
345    /// For a line-comment run: true if every line uses a doc-comment
346    /// marker (e.g. `///`). For a block comment: true if it opens with
    /// the doc opener (e.g. `/**`).
347    is_doc_comment: bool,
348}
349
350fn find_comment_blocks(text: &str, lang: Language) -> Vec<CommentBlock> {
351    let mut blocks = Vec::new();
352    let line_markers = lang.line_markers();
353    let doc_line_markers = lang.doc_line_markers();
354    let block_delim = lang.block_delim();
355    let doc_block_delim = lang.doc_block_delim();
356
357    let lines: Vec<&str> = text.lines().collect();
358    let mut i = 0;
359    while i < lines.len() {
360        let line = lines[i];
361        let trimmed = line.trim_start();
362
363        // Block-comment open (`/* … */`) — consume until close.
364        if let Some((open, close)) = block_delim {
365            if trimmed.starts_with(open) {
366                let is_doc = doc_block_delim.is_some_and(|(d_open, _)| trimmed.starts_with(d_open));
367                let start_line = i + 1;
368                let mut block_lines = Vec::new();
369                let mut block_content = String::new();
370                let mut closed = false;
371                let mut j = i;
372                while j < lines.len() {
373                    let l = lines[j];
374                    block_lines.push(l.to_string());
375                    let stripped = strip_block_comment_markers(l, open, close);
376                    block_content.push_str(&stripped);
377                    block_content.push('\n');
378                    if l.contains(close) && (j > i || trimmed.matches(close).count() > 0) {
379                        closed = true;
380                        j += 1;
381                        break;
382                    }
383                    j += 1;
384                }
385                if closed {
386                    blocks.push(CommentBlock {
387                        start_line,
388                        lines: block_lines,
389                        content: block_content,
390                        is_doc_comment: is_doc,
391                    });
392                }
393                i = j;
394                continue;
395            }
396        }
397
398        // Line-comment run (consecutive `//` / `#` lines).
399        if line_markers.iter().any(|m| trimmed.starts_with(*m)) {
400            let start_line = i + 1;
401            let mut block_lines = Vec::new();
402            let mut block_content = String::new();
403            let mut all_doc = !doc_line_markers.is_empty();
404            let mut j = i;
405            while j < lines.len() {
406                let l = lines[j];
407                let lt = l.trim_start();
408                let Some(m) = line_markers.iter().find(|mk| lt.starts_with(*mk)).copied() else {
409                    break;
410                };
411                let is_doc_line = doc_line_markers.iter().any(|d| {
412                    lt.starts_with(d)
413                        && (lt.len() == d.len()
414                            || !lt[d.len()..].starts_with(m.chars().next().unwrap_or(' ')))
415                });
416                if !is_doc_line {
417                    all_doc = false;
418                }
419                block_lines.push(l.to_string());
420                block_content.push_str(strip_line_marker(lt, m));
421                block_content.push('\n');
422                j += 1;
423            }
424            blocks.push(CommentBlock {
425                start_line,
426                lines: block_lines,
427                content: block_content,
428                is_doc_comment: all_doc,
429            });
430            i = j;
431            continue;
432        }
433
434        i += 1;
435    }
436    blocks
437}
438
/// Remove a leading `marker` from `line`, then at most one space after it.
/// If `line` does not start with `marker`, only a single leading space
/// (if any) is removed.
fn strip_line_marker<'a>(line: &'a str, marker: &str) -> &'a str {
    let body = match line.strip_prefix(marker) {
        Some(rest) => rest,
        None => line,
    };
    match body.strip_prefix(' ') {
        Some(rest) => rest,
        None => body,
    }
}
443
/// Strip block-comment decoration from a single line.
///
/// The line is trimmed, then a leading `open` and a trailing `close` are
/// removed when present. If what remains starts (after leading
/// whitespace) with the continuation prefix `* `, the text after that
/// prefix is returned; a lone `*` yields an empty string. Otherwise the
/// remainder is returned with its leading whitespace intact.
fn strip_block_comment_markers(line: &str, open: &str, close: &str) -> String {
    let body = line.trim();
    let body = body.strip_prefix(open).unwrap_or(body);
    let body = body.strip_suffix(close).unwrap_or(body);

    // Javadoc / rustdoc continuation lines look like ` * text`.
    let inner = body.trim_start();
    match inner.strip_prefix("* ") {
        Some(rest) => rest.to_string(),
        None if inner == "*" => String::new(),
        None => body.to_string(),
    }
}
462
463// ─── density scoring ───────────────────────────────────────────
464
465/// Characters strongly biased toward code over English prose:
466/// brackets, the statement terminator `;`, and assignment /
467/// comparison / bitwise operator characters. `score_density` counts these.
468/// Backticks and quotes are NOT included — backticks delimit
469/// code references in rustdoc / `TSDoc` prose
470/// (`` `foo` matches `bar` ``), double quotes appear in normal
471/// English. Either would inflate the score on legitimate prose
472/// comments.
473const STRONG_CODE_CHARS: &[char] = &[
474    '(', ')', '{', '}', '[', ']', ';', '=', '<', '>', '&', '|', '^',
475];
476
477/// `raw_density / SATURATION_POINT` is clamped to 1.0, so this
478/// is the raw-density value that maps to a normalised density
479/// of 1.0. 0.20 was chosen empirically by sampling: typical
480/// Rust / TS / Python code blocks sit at 0.18-0.30; pure
481/// English prose sits below 0.05.
482const SATURATION_POINT: f64 = 0.20;
483
484/// Punctuation-density score in [0.0, 1.0]. See module-level
485/// rustdoc for the design rationale — the short version is
486/// "count brackets / semicolons / assignment operators, ignore
487/// identifier tokens (prose has identifier-shaped words too)."
488///
489/// Pre-pass: any run of 5+ identical characters gets dropped
490/// before scoring, so ASCII-art separators
491/// (`============================================`, `----`,
492/// `####`) don't inflate the structural-char count and
493/// flag a banner comment as "looks like code."
494fn score_density(content: &str) -> f64 {
495    let collapsed = drop_long_runs(content);
496    let nonws_count = collapsed.chars().filter(|c| !c.is_whitespace()).count();
497    if nonws_count == 0 {
498        return 0.0;
499    }
500    let strong_count = collapsed
501        .chars()
502        .filter(|c| STRONG_CODE_CHARS.contains(c))
503        .count();
504    #[allow(clippy::cast_precision_loss)]
505    let raw = strong_count as f64 / nonws_count as f64;
506    (raw / SATURATION_POINT).min(1.0)
507}
508
/// Return `s` with every run of 5 or more identical consecutive characters
/// removed entirely; shorter runs are copied through unchanged.
///
/// Used before density scoring to neutralise ASCII-art separators and
/// banners (`==========`), which are layout rather than code structure.
fn drop_long_runs(s: &str) -> String {
    /// Longest run that is still copied to the output.
    const MAX_KEPT_RUN: usize = 4;

    let mut kept = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(ch) = chars.next() {
        // Measure the run of `ch` starting here.
        let mut run_len = 1;
        while chars.next_if_eq(&ch).is_some() {
            run_len += 1;
        }
        if run_len <= MAX_KEPT_RUN {
            kept.extend(std::iter::repeat(ch).take(run_len));
        }
    }
    kept
}
534
// Unit tests for the pure helpers: density scoring, comment-block
// detection per language, banner handling, and extension resolution.
535#[cfg(test)]
536mod tests {
537    use super::*;
538
    // The scorer separates an obvious code snippet from an obvious prose sentence.
539    #[test]
540    fn density_high_for_code_low_for_prose() {
541        // Real code: high density.
542        let code = "let x = compute(y, z); if x > 0 { return x; }";
543        let d_code = score_density(code);
544        assert!(d_code > 0.5, "code density {d_code} should be > 0.5");
545
546        // Prose: low density.
547        let prose = "This module parses RFC 9535 JSONPath expressions and resolves them.";
548        let d_prose = score_density(prose);
549        assert!(d_prose < 0.5, "prose density {d_prose} should be < 0.5");
550    }
551
    // A `//` run between code lines becomes one block with 1-based start line.
552    #[test]
553    fn line_block_in_rust_detected_with_markers_stripped() {
554        let src = "fn main() {}\n// let x = compute(y);\n// if x > 0 { return x; }\n// log(\"unused\");\nfn other() {}";
555        let blocks = find_comment_blocks(src, Language::Rust);
556        assert_eq!(blocks.len(), 1);
557        let b = &blocks[0];
558        assert_eq!(b.lines.len(), 3);
559        assert_eq!(b.start_line, 2);
560        assert!(b.content.contains("let x = compute(y);"));
561        assert!(!b.is_doc_comment);
562    }
563
    // A run made only of `///` lines is flagged as a doc comment.
564    #[test]
565    fn rust_doc_line_comments_marked_as_doc() {
566        let src = "/// Documents the next item.\n/// More docs.\n/// Even more.\nfn foo() {}";
567        let blocks = find_comment_blocks(src, Language::Rust);
568        assert_eq!(blocks.len(), 1);
569        assert!(blocks[0].is_doc_comment, "/// block must be marked as doc");
570    }
571
    // A block comment opening with `/**` is flagged as a doc comment.
572    #[test]
573    fn block_comment_javadoc_marked_as_doc() {
574        let src = "/**\n * Documented.\n * @param x foo\n */\nfunction bar() {}";
575        let blocks = find_comment_blocks(src, Language::Typescript);
576        assert!(!blocks.is_empty());
577        assert!(blocks[0].is_doc_comment, "/** … */ must be marked as doc");
578    }
579
    // `#` runs are detected for Python.
580    #[test]
581    fn python_hash_block_detected() {
582        let src = "x = 1\n# old = compute(x)\n# if old > 0:\n#    print(old)\nprint(x)";
583        let blocks = find_comment_blocks(src, Language::Python);
584        assert_eq!(blocks.len(), 1);
585        assert!(blocks[0].content.contains("old = compute(x)"));
586    }
587
    // Detection plus scoring: prose stays under the default threshold, code reaches it.
588    #[test]
589    fn end_to_end_threshold_filters_prose() {
590        // A 3-line // block of prose: should NOT score above default.
591        let prose_src = "fn foo() {}\n// This is a normal explanatory comment\n// describing what foo does.\n// Multiple lines of prose.";
592        let blocks = find_comment_blocks(prose_src, Language::Rust);
593        assert_eq!(blocks.len(), 1);
594        let d = score_density(&blocks[0].content);
595        assert!(d < 0.5, "prose comment density {d} should be < 0.5");
596
597        // A 3-line // block of code: should score above default.
598        let code_src = "fn foo() {}\n// let x = compute(y);\n// if x > 0 { return x; }\n// log_metric(\"path-a\", x);";
599        let blocks = find_comment_blocks(code_src, Language::Rust);
600        assert_eq!(blocks.len(), 1);
601        let d = score_density(&blocks[0].content);
602        assert!(d >= 0.5, "code comment density {d} should be >= 0.5");
603    }
604
    // Long `=` rules are dropped before scoring, so a banner is not code-shaped.
605    #[test]
606    fn banner_separators_dont_score_as_code() {
607        // Common pattern: ASCII-art banner around a section title.
608        let banner = "// ============================================\n\
609                      // Section Title\n\
610                      // ============================================";
611        let blocks = find_comment_blocks(banner, Language::Rust);
612        assert_eq!(blocks.len(), 1);
613        let d = score_density(&blocks[0].content);
614        assert!(d < 0.5, "banner density {d} should be < 0.5");
615    }
616
    // Pins the run-length boundary: 4 identical characters stay, 5 go.
617    #[test]
618    fn drop_long_runs_strips_banners() {
619        assert_eq!(drop_long_runs("foo ============= bar"), "foo  bar");
620        assert_eq!(drop_long_runs("a==b"), "a==b"); // run of 2, kept
621        assert_eq!(drop_long_runs("a===b"), "a===b"); // run of 3, kept
622        assert_eq!(drop_long_runs("a====b"), "a====b"); // run of 4, kept
623        assert_eq!(drop_long_runs("a=====b"), "ab"); // run of 5, dropped
624    }
625
    // `Auto` resolves by extension; a path with no extension stays `Auto`.
626    #[test]
627    fn language_extension_resolution() {
628        let path = Path::new("foo.rs");
629        assert_eq!(Language::Auto.resolve(path), Language::Rust);
630        let path = Path::new("foo.py");
631        assert_eq!(Language::Auto.resolve(path), Language::Python);
632        let path = Path::new("foo.tsx");
633        assert_eq!(Language::Auto.resolve(path), Language::Typescript);
634        let path = Path::new("unknown");
635        assert_eq!(Language::Auto.resolve(path), Language::Auto);
636    }
637}