Skip to main content

alint_rules/
commented_out_code.rs

1//! `commented_out_code` — heuristic detector for blocks of
2//! commented-out source code (as opposed to prose comments,
3//! license headers, doc comments, or ASCII banners).
4//!
5//! Targets the "agent left dead code behind" pattern: agents
6//! tend to comment-rather-than-delete during iteration, and
7//! the leftovers accumulate. Existing primitives can ban
8//! specific phrasings but can't catch the generic
9//! "block-of-code-shaped-comments" pattern.
10//!
11//! Design doc: `docs/design/v0.7/commented_out_code.md`.
12//!
13//! ## Heuristic
14//!
15//! For each consecutive run of comment lines (≥ `min_lines`),
16//! count the fraction of non-whitespace characters that are
17//! **structural punctuation strongly biased toward code**:
18//!
19//! ```text
20//!   strong_chars = ( ) { } [ ] ; = < > & | ^
21//!   raw_density  = count(strong_chars) / non-whitespace-char-count
22//! ```
23//!
24//! Backticks and quotes are deliberately excluded — backticks
25//! show up constantly in rustdoc / `TSDoc` prose to delimit code
26//! references (`` `foo` matches `bar` ``), and double quotes
27//! appear in normal English. Including either inflates the
28//! score on legitimate prose comments.
29//!
30//! Then normalise so the user-facing `threshold` field has a
31//! useful midpoint at `0.5`:
32//!
33//! ```text
34//!   density = min(raw_density / 0.20, 1.0)
35//! ```
36//!
37//! At `raw_density = 0.20` (i.e. one-fifth of non-whitespace
38//! chars are strong-code chars), the normalised density is
39//! `1.0`. Real code blocks comfortably exceed this; English
40//! prose is well below it because everyday writing rarely
41//! uses brackets, semicolons, or assignment operators.
42//!
43//! Density ≥ `threshold` (default 0.5) marks the block as
44//! code-shaped. Doc-comment markers (`///`, `/** */`) and
45//! the file's first `skip_leading_lines` lines (license
46//! headers) are excluded by construction.
47//!
48//! The score deliberately does NOT use identifier-token
49//! density: English prose is dominated by 3+-letter words
50//! that look identifier-shaped, so identifier counts can't
51//! discriminate code from explanation. Punctuation can.
52
53use std::path::Path;
54
55use alint_core::{Context, Error, Level, Result, Rule, RuleSpec, Scope, Violation};
56use serde::Deserialize;
57
/// Options accepted by the `commented_out_code` rule, deserialized
/// from the rule spec in `build`. Every field carries a serde
/// default, so all of them may be omitted.
#[derive(Debug, Deserialize)]
struct Options {
    /// `auto` (default) infers the comment-marker set from
    /// each file's extension. Explicit override useful for
    /// embedded DSLs or cases where the extension lies.
    #[serde(default)]
    language: Language,
    /// Minimum consecutive comment-line count for a block to
    /// be considered. 1-2 line comments are almost always
    /// prose; 3+ starts looking like dead code. Default 3;
    /// `build` rejects values below 2.
    #[serde(default = "default_min_lines")]
    min_lines: usize,
    /// Normalised punctuation-density floor (0.0-1.0). A block is
    /// reported when its density is at or above this value, so
    /// higher = stricter (only the most code-shaped blocks fire).
    /// Default 0.5; `build` rejects values outside 0.0-1.0.
    #[serde(default = "default_threshold")]
    threshold: f64,
    /// Ignore comment blocks that start within the first N lines
    /// of a file. Defaults to 30 to pass over license headers
    /// without false-positive flagging them as commented-out code.
    #[serde(default = "default_skip_leading_lines")]
    skip_leading_lines: usize,
}
80
/// Serde default for `Options::min_lines`: blocks shorter than
/// three lines are not considered.
fn default_min_lines() -> usize {
    const MIN_LINES: usize = 3;
    MIN_LINES
}

/// Serde default for `Options::threshold`: the midpoint of the
/// normalised density scale.
fn default_threshold() -> f64 {
    const THRESHOLD: f64 = 0.5;
    THRESHOLD
}

/// Serde default for `Options::skip_leading_lines`: enough to step
/// over a typical license header.
fn default_skip_leading_lines() -> usize {
    const SKIP_LEADING_LINES: usize = 30;
    SKIP_LEADING_LINES
}
90
/// Comment-syntax family used when scanning a file. Deserialized
/// from the rule's `language` option using snake_case names
/// (`auto`, `rust`, `typescript`, …).
#[derive(Debug, Deserialize, Default, Clone, Copy, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
enum Language {
    /// Infer the language from each file's extension (the default).
    /// Also the value `resolve` returns for an unrecognised extension.
    #[default]
    Auto,
    /// `.rs` files.
    Rust,
    /// `.ts` / `.tsx` files.
    Typescript,
    /// `.js` / `.jsx` / `.mjs` / `.cjs` files.
    Javascript,
    /// `.py` files.
    Python,
    /// `.go` files.
    Go,
    /// `.java` files; `.kt`, `.kts` and `.scala` also resolve here.
    Java,
    /// `.c` / `.h` files.
    C,
    /// `.cc` / `.cpp` / `.cxx` / `.hpp` / `.hh` files.
    Cpp,
    /// `.rb` files.
    Ruby,
    /// `.sh` / `.bash` / `.zsh` / `.fish` files.
    Shell,
}
107
108impl Language {
109    /// Resolve a language to its concrete value (never `Auto`)
110    /// based on a file extension.
111    fn resolve(self, path: &Path) -> Self {
112        if self != Self::Auto {
113            return self;
114        }
115        let ext = path
116            .extension()
117            .and_then(|s| s.to_str())
118            .unwrap_or("")
119            .to_ascii_lowercase();
120        match ext.as_str() {
121            "rs" => Self::Rust,
122            "ts" | "tsx" => Self::Typescript,
123            "js" | "jsx" | "mjs" | "cjs" => Self::Javascript,
124            "py" => Self::Python,
125            "go" => Self::Go,
126            "java" | "kt" | "kts" | "scala" => Self::Java,
127            "c" | "h" => Self::C,
128            "cc" | "cpp" | "cxx" | "hpp" | "hh" => Self::Cpp,
129            "rb" => Self::Ruby,
130            "sh" | "bash" | "zsh" | "fish" => Self::Shell,
131            _ => Self::Auto, // unknown — caller skips
132        }
133    }
134
135    /// The set of line-comment markers for this language.
136    /// Returned in priority order; the longest-match wins.
137    fn line_markers(self) -> &'static [&'static str] {
138        match self {
139            // Doc-comment markers (`///`, `//!`) are ALSO line comments — we
140            // identify them separately below to skip rather than score.
141            Self::Rust
142            | Self::Typescript
143            | Self::Javascript
144            | Self::Go
145            | Self::Java
146            | Self::C
147            | Self::Cpp => &["//"],
148            Self::Python | Self::Shell | Self::Ruby => &["#"],
149            Self::Auto => &[],
150        }
151    }
152
153    /// Inner-line markers that indicate a DOC comment, not a
154    /// regular line comment. Blocks made entirely of these
155    /// are excluded from density scoring.
156    fn doc_line_markers(self) -> &'static [&'static str] {
157        // `TSDoc` / JSDoc / Javadoc live in `/** */` block comments,
158        // not line comments — they fall through to the empty default.
159        match self {
160            Self::Rust => &["///", "//!"],
161            _ => &[],
162        }
163    }
164
165    /// Block-comment delimiters: (open, close).
166    fn block_delim(self) -> Option<(&'static str, &'static str)> {
167        match self {
168            Self::Rust
169            | Self::Typescript
170            | Self::Javascript
171            | Self::Go
172            | Self::Java
173            | Self::C
174            | Self::Cpp => Some(("/*", "*/")),
175            _ => None,
176        }
177    }
178
179    /// Block-comment delimiters that mark a DOC block (Javadoc
180    /// / `TSDoc` / rustdoc inner block). Skipped, not scored.
181    fn doc_block_delim(self) -> Option<(&'static str, &'static str)> {
182        match self {
183            // /** … */ is Javadoc / `TSDoc` / rustdoc-inner.
184            Self::Rust | Self::Typescript | Self::Javascript | Self::Java | Self::Cpp => {
185                Some(("/**", "*/"))
186            }
187            _ => None,
188        }
189    }
190}
191
/// A configured instance of the `commented_out_code` rule, as
/// produced by `build`.
#[derive(Debug)]
pub struct CommentedOutCodeRule {
    /// Rule identifier, copied from the rule spec.
    id: String,
    /// Level attached to this rule, copied from the rule spec.
    level: Level,
    /// Optional policy link, copied from the rule spec.
    policy_url: Option<String>,
    /// Optional override for the generated violation message.
    message: Option<String>,
    /// Path scope; files it does not match are not scanned.
    scope: Scope,
    /// Configured language; `Auto` is resolved per file by extension.
    language: Language,
    /// Blocks with fewer lines than this are ignored.
    min_lines: usize,
    /// Normalised density at or above which a block is reported.
    threshold: f64,
    /// Blocks starting on or before this 1-based line are ignored.
    skip_leading_lines: usize,
}
204
205impl Rule for CommentedOutCodeRule {
206    fn id(&self) -> &str {
207        &self.id
208    }
209    fn level(&self) -> Level {
210        self.level
211    }
212    fn policy_url(&self) -> Option<&str> {
213        self.policy_url.as_deref()
214    }
215    fn path_scope(&self) -> Option<&Scope> {
216        Some(&self.scope)
217    }
218
219    fn evaluate(&self, ctx: &Context<'_>) -> Result<Vec<Violation>> {
220        let mut violations = Vec::new();
221        for entry in ctx.index.files() {
222            if !self.scope.matches(&entry.path) {
223                continue;
224            }
225            let lang = self.language.resolve(&entry.path);
226            if lang == Language::Auto {
227                continue; // unknown extension — skip silently
228            }
229            let full = ctx.root.join(&entry.path);
230            let Ok(bytes) = std::fs::read(&full) else {
231                continue;
232            };
233            let Ok(text) = std::str::from_utf8(&bytes) else {
234                continue;
235            };
236            for block in find_comment_blocks(text, lang) {
237                if block.lines.len() < self.min_lines {
238                    continue;
239                }
240                if block.start_line <= self.skip_leading_lines {
241                    continue;
242                }
243                if block.is_doc_comment {
244                    continue;
245                }
246                let density = score_density(&block.content);
247                if density >= self.threshold {
248                    let msg = self.message.clone().unwrap_or_else(|| {
249                        format!(
250                            "block of {} commented-out lines (density {:.2}); remove or convert to runtime-checked branch",
251                            block.lines.len(),
252                            density,
253                        )
254                    });
255                    violations.push(
256                        Violation::new(msg)
257                            .with_path(entry.path.clone())
258                            .with_location(block.start_line, 1),
259                    );
260                }
261            }
262        }
263        Ok(violations)
264    }
265}
266
267pub fn build(spec: &RuleSpec) -> Result<Box<dyn Rule>> {
268    let Some(paths) = &spec.paths else {
269        return Err(Error::rule_config(
270            &spec.id,
271            "commented_out_code requires a `paths` field",
272        ));
273    };
274    let opts: Options = spec
275        .deserialize_options()
276        .map_err(|e| Error::rule_config(&spec.id, format!("invalid options: {e}")))?;
277    if opts.min_lines < 2 {
278        return Err(Error::rule_config(
279            &spec.id,
280            "commented_out_code `min_lines` must be ≥ 2",
281        ));
282    }
283    if !(0.0..=1.0).contains(&opts.threshold) {
284        return Err(Error::rule_config(
285            &spec.id,
286            "commented_out_code `threshold` must be between 0.0 and 1.0",
287        ));
288    }
289    Ok(Box::new(CommentedOutCodeRule {
290        id: spec.id.clone(),
291        level: spec.level,
292        policy_url: spec.policy_url.clone(),
293        message: spec.message.clone(),
294        scope: Scope::from_paths_spec(paths)?,
295        language: opts.language,
296        min_lines: opts.min_lines,
297        threshold: opts.threshold,
298        skip_leading_lines: opts.skip_leading_lines,
299    }))
300}
301
302// ─── block detection ───────────────────────────────────────────
303
/// One comment block found by `find_comment_blocks`: either a
/// `/* … */` block comment or a run of consecutive line comments.
#[derive(Debug)]
struct CommentBlock {
    /// 1-based line number of the block's first line.
    start_line: usize,
    /// The block's source lines exactly as they appear in the
    /// file, comment markers included.
    lines: Vec<String>,
    /// Concatenated comment content with markers stripped, one
    /// `\n`-terminated entry per source line.
    /// This is what the density scorer sees.
    content: String,
    /// True if every comment marker in the block is a
    /// doc-comment marker (e.g. `///`, `/** */`).
    is_doc_comment: bool,
}
315
316fn find_comment_blocks(text: &str, lang: Language) -> Vec<CommentBlock> {
317    let mut blocks = Vec::new();
318    let line_markers = lang.line_markers();
319    let doc_line_markers = lang.doc_line_markers();
320    let block_delim = lang.block_delim();
321    let doc_block_delim = lang.doc_block_delim();
322
323    let lines: Vec<&str> = text.lines().collect();
324    let mut i = 0;
325    while i < lines.len() {
326        let line = lines[i];
327        let trimmed = line.trim_start();
328
329        // Block-comment open (`/* … */`) — consume until close.
330        if let Some((open, close)) = block_delim {
331            if trimmed.starts_with(open) {
332                let is_doc = doc_block_delim.is_some_and(|(d_open, _)| trimmed.starts_with(d_open));
333                let start_line = i + 1;
334                let mut block_lines = Vec::new();
335                let mut block_content = String::new();
336                let mut closed = false;
337                let mut j = i;
338                while j < lines.len() {
339                    let l = lines[j];
340                    block_lines.push(l.to_string());
341                    let stripped = strip_block_comment_markers(l, open, close);
342                    block_content.push_str(&stripped);
343                    block_content.push('\n');
344                    if l.contains(close) && (j > i || trimmed.matches(close).count() > 0) {
345                        closed = true;
346                        j += 1;
347                        break;
348                    }
349                    j += 1;
350                }
351                if closed {
352                    blocks.push(CommentBlock {
353                        start_line,
354                        lines: block_lines,
355                        content: block_content,
356                        is_doc_comment: is_doc,
357                    });
358                }
359                i = j;
360                continue;
361            }
362        }
363
364        // Line-comment run (consecutive `//` / `#` lines).
365        if line_markers.iter().any(|m| trimmed.starts_with(*m)) {
366            let start_line = i + 1;
367            let mut block_lines = Vec::new();
368            let mut block_content = String::new();
369            let mut all_doc = !doc_line_markers.is_empty();
370            let mut j = i;
371            while j < lines.len() {
372                let l = lines[j];
373                let lt = l.trim_start();
374                let Some(m) = line_markers.iter().find(|mk| lt.starts_with(*mk)).copied() else {
375                    break;
376                };
377                let is_doc_line = doc_line_markers.iter().any(|d| {
378                    lt.starts_with(d)
379                        && (lt.len() == d.len()
380                            || !lt[d.len()..].starts_with(m.chars().next().unwrap_or(' ')))
381                });
382                if !is_doc_line {
383                    all_doc = false;
384                }
385                block_lines.push(l.to_string());
386                block_content.push_str(strip_line_marker(lt, m));
387                block_content.push('\n');
388                j += 1;
389            }
390            blocks.push(CommentBlock {
391                start_line,
392                lines: block_lines,
393                content: block_content,
394                is_doc_comment: all_doc,
395            });
396            i = j;
397            continue;
398        }
399
400        i += 1;
401    }
402    blocks
403}
404
/// Remove a leading `marker` from `line` if present, then at most
/// one space. The remainder is returned as a slice of `line`.
fn strip_line_marker<'a>(line: &'a str, marker: &str) -> &'a str {
    let body = match line.strip_prefix(marker) {
        Some(rest) => rest,
        None => line,
    };
    match body.strip_prefix(' ') {
        Some(unpadded) => unpadded,
        None => body,
    }
}
409
/// Reduce one line of a block comment to its content.
///
/// The line is trimmed, then a leading `open` and a trailing
/// `close` are removed when present. If what remains begins (after
/// leading whitespace) with the `* ` continuation prefix used by
/// Javadoc / rustdoc, the text after that prefix is returned; a
/// lone `*` yields an empty string. Otherwise the remainder is
/// returned with its leading whitespace intact.
fn strip_block_comment_markers(line: &str, open: &str, close: &str) -> String {
    let body = line.trim();
    let body = body.strip_prefix(open).unwrap_or(body);
    let body = body.strip_suffix(close).unwrap_or(body);
    match body.trim_start() {
        "*" => String::new(),
        inner => inner.strip_prefix("* ").unwrap_or(body).to_owned(),
    }
}
428
429// ─── density scoring ───────────────────────────────────────────
430
431/// Characters strongly biased toward code over English prose.
432/// Brackets and assignment / comparison operators show up
433/// constantly in code and almost never in normal writing.
434/// Backticks and quotes are NOT included — backticks delimit
435/// code references in rustdoc / `TSDoc` prose
436/// (`` `foo` matches `bar` ``), double quotes appear in normal
437/// English. Either would inflate the score on legitimate prose
438/// comments.
439const STRONG_CODE_CHARS: &[char] = &[
440    '(', ')', '{', '}', '[', ']', ';', '=', '<', '>', '&', '|', '^',
441];
442
443/// `raw_density / SATURATION_POINT` is clamped to 1.0, so this
444/// is the raw-density value that maps to a normalised density
445/// of 1.0. 0.20 was chosen empirically by sampling: typical
446/// Rust / TS / Python code blocks sit at 0.18-0.30; pure
447/// English prose sits below 0.05.
448const SATURATION_POINT: f64 = 0.20;
449
450/// Punctuation-density score in [0.0, 1.0]. See module-level
451/// rustdoc for the design rationale — the short version is
452/// "count brackets / semicolons / assignment operators, ignore
453/// identifier tokens (prose has identifier-shaped words too)."
454///
455/// Pre-pass: any run of 5+ identical characters gets dropped
456/// before scoring, so ASCII-art separators
457/// (`============================================`, `----`,
458/// `####`) don't inflate the structural-char count and
459/// flag a banner comment as "looks like code."
460fn score_density(content: &str) -> f64 {
461    let collapsed = drop_long_runs(content);
462    let nonws_count = collapsed.chars().filter(|c| !c.is_whitespace()).count();
463    if nonws_count == 0 {
464        return 0.0;
465    }
466    let strong_count = collapsed
467        .chars()
468        .filter(|c| STRONG_CODE_CHARS.contains(c))
469        .count();
470    #[allow(clippy::cast_precision_loss)]
471    let raw = strong_count as f64 / nonws_count as f64;
472    (raw / SATURATION_POINT).min(1.0)
473}
474
/// Return `s` with every run of 5 or more identical characters
/// removed; shorter runs (e.g. `====`) are kept verbatim.
///
/// Used to defang ASCII-art separators / banners (`==========`,
/// `----------`) before density scoring: those are layout, not
/// code structure, and would inflate the strong-char count.
fn drop_long_runs(s: &str) -> String {
    const MIN_DROPPED_RUN: usize = 5;

    let mut kept = String::with_capacity(s.len());
    let mut rest = s.chars().peekable();
    while let Some(ch) = rest.next() {
        // Measure the run that starts at `ch`.
        let mut run_len = 1_usize;
        while rest.next_if_eq(&ch).is_some() {
            run_len += 1;
        }
        if run_len < MIN_DROPPED_RUN {
            kept.extend(std::iter::repeat(ch).take(run_len));
        }
    }
    kept
}
500
#[cfg(test)]
mod tests {
    use super::*;

    /// Run block detection and insist that exactly one block exists.
    fn sole_block(src: &str, lang: Language) -> CommentBlock {
        let mut found = find_comment_blocks(src, lang);
        assert_eq!(found.len(), 1);
        found.remove(0)
    }

    #[test]
    fn density_high_for_code_low_for_prose() {
        // Statement-heavy source lands above the default threshold.
        let d_code = score_density("let x = compute(y, z); if x > 0 { return x; }");
        assert!(d_code > 0.5, "code density {d_code} should be > 0.5");

        // A plain English sentence lands below it.
        let d_prose =
            score_density("This module parses RFC 9535 JSONPath expressions and resolves them.");
        assert!(d_prose < 0.5, "prose density {d_prose} should be < 0.5");
    }

    #[test]
    fn line_block_in_rust_detected_with_markers_stripped() {
        let block = sole_block(
            "fn main() {}\n// let x = compute(y);\n// if x > 0 { return x; }\n// log(\"unused\");\nfn other() {}",
            Language::Rust,
        );
        assert_eq!(block.lines.len(), 3);
        assert_eq!(block.start_line, 2);
        assert!(block.content.contains("let x = compute(y);"));
        assert!(!block.is_doc_comment);
    }

    #[test]
    fn rust_doc_line_comments_marked_as_doc() {
        let block = sole_block(
            "/// Documents the next item.\n/// More docs.\n/// Even more.\nfn foo() {}",
            Language::Rust,
        );
        assert!(block.is_doc_comment, "/// block must be marked as doc");
    }

    #[test]
    fn block_comment_javadoc_marked_as_doc() {
        let found = find_comment_blocks(
            "/**\n * Documented.\n * @param x foo\n */\nfunction bar() {}",
            Language::Typescript,
        );
        assert!(!found.is_empty());
        assert!(found[0].is_doc_comment, "/** … */ must be marked as doc");
    }

    #[test]
    fn python_hash_block_detected() {
        let block = sole_block(
            "x = 1\n# old = compute(x)\n# if old > 0:\n#    print(old)\nprint(x)",
            Language::Python,
        );
        assert!(block.content.contains("old = compute(x)"));
    }

    #[test]
    fn end_to_end_threshold_filters_prose() {
        // Three `//` lines of prose stay under the default threshold.
        let prose_block = sole_block(
            "fn foo() {}\n// This is a normal explanatory comment\n// describing what foo does.\n// Multiple lines of prose.",
            Language::Rust,
        );
        let d = score_density(&prose_block.content);
        assert!(d < 0.5, "prose comment density {d} should be < 0.5");

        // Three `//` lines of code reach it.
        let code_block = sole_block(
            "fn foo() {}\n// let x = compute(y);\n// if x > 0 { return x; }\n// log_metric(\"path-a\", x);",
            Language::Rust,
        );
        let d = score_density(&code_block.content);
        assert!(d >= 0.5, "code comment density {d} should be >= 0.5");
    }

    #[test]
    fn banner_separators_dont_score_as_code() {
        // A section title framed by ASCII-art rules.
        let banner = "// ============================================\n\
                      // Section Title\n\
                      // ============================================";
        let block = sole_block(banner, Language::Rust);
        let d = score_density(&block.content);
        assert!(d < 0.5, "banner density {d} should be < 0.5");
    }

    #[test]
    fn drop_long_runs_strips_banners() {
        // (input, expected): runs of up to 4 survive, 5 or more vanish.
        let cases = [
            ("foo ============= bar", "foo  bar"),
            ("a==b", "a==b"),
            ("a===b", "a===b"),
            ("a====b", "a====b"),
            ("a=====b", "ab"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(drop_long_runs(input), *expected);
        }
    }

    #[test]
    fn language_extension_resolution() {
        let cases = [
            ("foo.rs", Language::Rust),
            ("foo.py", Language::Python),
            ("foo.tsx", Language::Typescript),
            ("unknown", Language::Auto),
        ];
        for (file_name, expected) in cases.iter() {
            assert_eq!(Language::Auto.resolve(Path::new(file_name)), *expected);
        }
    }
}
603}