Skip to main content

alint_rules/
commented_out_code.rs

1//! `commented_out_code` — heuristic detector for blocks of
2//! commented-out source code (as opposed to prose comments,
3//! license headers, doc comments, or ASCII banners).
4//!
5//! Targets the "agent left dead code behind" pattern: agents
6//! tend to comment-rather-than-delete during iteration, and
7//! the leftovers accumulate. Existing primitives can ban
8//! specific phrasings but can't catch the generic
9//! "block-of-code-shaped-comments" pattern.
10//!
11//! Design doc: `docs/design/v0.7/commented_out_code.md`.
12//!
13//! ## Heuristic
14//!
15//! For each consecutive run of comment lines (≥ `min_lines`),
16//! count the fraction of non-whitespace characters that are
17//! **structural punctuation strongly biased toward code**:
18//!
19//! ```text
20//!   strong_chars = ( ) { } [ ] ; = < > & | ^
21//!   raw_density  = count(strong_chars) / non-whitespace-char-count
22//! ```
23//!
24//! Backticks and quotes are deliberately excluded — backticks
25//! show up constantly in rustdoc / `TSDoc` prose to delimit code
26//! references (`` `foo` matches `bar` ``), and double quotes
27//! appear in normal English. Including either inflates the
28//! score on legitimate prose comments.
29//!
30//! Then normalise so the user-facing `threshold` field has a
31//! useful midpoint at `0.5`:
32//!
33//! ```text
34//!   density = min(raw_density / 0.20, 1.0)
35//! ```
36//!
37//! At `raw_density = 0.20` (i.e. one-fifth of non-whitespace
38//! chars are strong-code chars), the normalised density is
39//! `1.0`. Real code blocks comfortably exceed this; English
40//! prose is well below it because everyday writing rarely
41//! uses brackets, semicolons, or assignment operators.
42//!
43//! Density ≥ `threshold` (default 0.5) marks the block as
44//! code-shaped. Doc-comment markers (`///`, `/** */`) and
45//! the file's first `skip_leading_lines` lines (license
46//! headers) are excluded by construction.
47//!
48//! The score deliberately does NOT use identifier-token
49//! density: English prose is dominated by 3+-letter words
50//! that look identifier-shaped, so identifier counts can't
51//! discriminate code from explanation. Punctuation can.
52
53use std::path::Path;
54
55use alint_core::{Context, Error, Level, PerFileRule, Result, Rule, RuleSpec, Scope, Violation};
56use serde::Deserialize;
57
58#[derive(Debug, Deserialize)]
59#[serde(deny_unknown_fields)]
60struct Options {
61    /// `auto` (default) infers the comment-marker set from
62    /// each file's extension. Explicit override useful for
63    /// embedded DSLs or cases where the extension lies.
64    #[serde(default)]
65    language: Language,
66    /// Minimum consecutive comment-line count for a block to
67    /// be considered. 1-2 line comments are almost always
68    /// prose; 3+ starts looking like dead code. Default 3.
69    #[serde(default = "default_min_lines")]
70    min_lines: usize,
71    /// Token-density floor (0.0-1.0). Higher = stricter (only
72    /// the most code-shaped blocks fire). Default 0.5.
73    #[serde(default = "default_threshold")]
74    threshold: f64,
75    /// Skip the first N lines of any file. Defaults to 30 to
76    /// pass over license headers without false-positive
77    /// flagging them as commented-out code.
78    #[serde(default = "default_skip_leading_lines")]
79    skip_leading_lines: usize,
80}
81
/// Serde default for `Options::min_lines`.
fn default_min_lines() -> usize {
    const DEFAULT_MIN_LINES: usize = 3;
    DEFAULT_MIN_LINES
}
/// Serde default for `Options::threshold`.
fn default_threshold() -> f64 {
    const DEFAULT_THRESHOLD: f64 = 0.5;
    DEFAULT_THRESHOLD
}
/// Serde default for `Options::skip_leading_lines`.
fn default_skip_leading_lines() -> usize {
    const DEFAULT_SKIP_LEADING_LINES: usize = 30;
    DEFAULT_SKIP_LEADING_LINES
}
91
92#[derive(Debug, Deserialize, Default, Clone, Copy, PartialEq, Eq)]
93#[serde(rename_all = "snake_case")]
94enum Language {
95    #[default]
96    Auto,
97    Rust,
98    Typescript,
99    Javascript,
100    Python,
101    Go,
102    Java,
103    C,
104    Cpp,
105    Ruby,
106    Shell,
107}
108
109impl Language {
110    /// Resolve a language to its concrete value (never `Auto`)
111    /// based on a file extension.
112    fn resolve(self, path: &Path) -> Self {
113        if self != Self::Auto {
114            return self;
115        }
116        let ext = path
117            .extension()
118            .and_then(|s| s.to_str())
119            .unwrap_or("")
120            .to_ascii_lowercase();
121        match ext.as_str() {
122            "rs" => Self::Rust,
123            "ts" | "tsx" => Self::Typescript,
124            "js" | "jsx" | "mjs" | "cjs" => Self::Javascript,
125            "py" => Self::Python,
126            "go" => Self::Go,
127            "java" | "kt" | "kts" | "scala" => Self::Java,
128            "c" | "h" => Self::C,
129            "cc" | "cpp" | "cxx" | "hpp" | "hh" => Self::Cpp,
130            "rb" => Self::Ruby,
131            "sh" | "bash" | "zsh" | "fish" => Self::Shell,
132            _ => Self::Auto, // unknown — caller skips
133        }
134    }
135
136    /// The set of line-comment markers for this language.
137    /// Returned in priority order; the longest-match wins.
138    fn line_markers(self) -> &'static [&'static str] {
139        match self {
140            // Doc-comment markers (`///`, `//!`) are ALSO line comments — we
141            // identify them separately below to skip rather than score.
142            Self::Rust
143            | Self::Typescript
144            | Self::Javascript
145            | Self::Go
146            | Self::Java
147            | Self::C
148            | Self::Cpp => &["//"],
149            Self::Python | Self::Shell | Self::Ruby => &["#"],
150            Self::Auto => &[],
151        }
152    }
153
154    /// Inner-line markers that indicate a DOC comment, not a
155    /// regular line comment. Blocks made entirely of these
156    /// are excluded from density scoring.
157    fn doc_line_markers(self) -> &'static [&'static str] {
158        // `TSDoc` / JSDoc / Javadoc live in `/** */` block comments,
159        // not line comments — they fall through to the empty default.
160        match self {
161            Self::Rust => &["///", "//!"],
162            _ => &[],
163        }
164    }
165
166    /// Block-comment delimiters: (open, close).
167    fn block_delim(self) -> Option<(&'static str, &'static str)> {
168        match self {
169            Self::Rust
170            | Self::Typescript
171            | Self::Javascript
172            | Self::Go
173            | Self::Java
174            | Self::C
175            | Self::Cpp => Some(("/*", "*/")),
176            _ => None,
177        }
178    }
179
180    /// Block-comment delimiters that mark a DOC block (Javadoc
181    /// / `TSDoc` / rustdoc inner block). Skipped, not scored.
182    fn doc_block_delim(self) -> Option<(&'static str, &'static str)> {
183        match self {
184            // /** … */ is Javadoc / `TSDoc` / rustdoc-inner.
185            Self::Rust | Self::Typescript | Self::Javascript | Self::Java | Self::Cpp => {
186                Some(("/**", "*/"))
187            }
188            _ => None,
189        }
190    }
191}
192
193#[derive(Debug)]
194pub struct CommentedOutCodeRule {
195    id: String,
196    level: Level,
197    policy_url: Option<String>,
198    message: Option<String>,
199    scope: Scope,
200    language: Language,
201    min_lines: usize,
202    threshold: f64,
203    skip_leading_lines: usize,
204}
205
206impl Rule for CommentedOutCodeRule {
207    fn id(&self) -> &str {
208        &self.id
209    }
210    fn level(&self) -> Level {
211        self.level
212    }
213    fn policy_url(&self) -> Option<&str> {
214        self.policy_url.as_deref()
215    }
216    fn path_scope(&self) -> Option<&Scope> {
217        Some(&self.scope)
218    }
219
220    fn evaluate(&self, ctx: &Context<'_>) -> Result<Vec<Violation>> {
221        let mut violations = Vec::new();
222        for entry in ctx.index.files() {
223            if !self.scope.matches(&entry.path, ctx.index) {
224                continue;
225            }
226            let full = ctx.root.join(&entry.path);
227            let Ok(bytes) = std::fs::read(&full) else {
228                continue;
229            };
230            violations.extend(self.evaluate_file(ctx, &entry.path, &bytes)?);
231        }
232        Ok(violations)
233    }
234
235    fn as_per_file(&self) -> Option<&dyn PerFileRule> {
236        Some(self)
237    }
238}
239
240impl PerFileRule for CommentedOutCodeRule {
241    fn path_scope(&self) -> &Scope {
242        &self.scope
243    }
244
245    fn evaluate_file(
246        &self,
247        _ctx: &Context<'_>,
248        path: &Path,
249        bytes: &[u8],
250    ) -> Result<Vec<Violation>> {
251        let lang = self.language.resolve(path);
252        if lang == Language::Auto {
253            return Ok(Vec::new()); // unknown extension — skip silently
254        }
255        let Ok(text) = std::str::from_utf8(bytes) else {
256            return Ok(Vec::new());
257        };
258        let mut violations = Vec::new();
259        for block in find_comment_blocks(text, lang) {
260            if block.lines.len() < self.min_lines {
261                continue;
262            }
263            if block.start_line <= self.skip_leading_lines {
264                continue;
265            }
266            if block.is_doc_comment {
267                continue;
268            }
269            let density = score_density(&block.content);
270            if density >= self.threshold {
271                let msg = self.message.clone().unwrap_or_else(|| {
272                    format!(
273                        "block of {} commented-out lines (density {:.2}); remove or convert to runtime-checked branch",
274                        block.lines.len(),
275                        density,
276                    )
277                });
278                violations.push(
279                    Violation::new(msg)
280                        .with_path(std::sync::Arc::<Path>::from(path))
281                        .with_location(block.start_line, 1),
282                );
283            }
284        }
285        Ok(violations)
286    }
287}
288
289pub fn build(spec: &RuleSpec) -> Result<Box<dyn Rule>> {
290    let Some(_paths) = &spec.paths else {
291        return Err(Error::rule_config(
292            &spec.id,
293            "commented_out_code requires a `paths` field",
294        ));
295    };
296    let opts: Options = spec
297        .deserialize_options()
298        .map_err(|e| Error::rule_config(&spec.id, format!("invalid options: {e}")))?;
299    if opts.min_lines < 2 {
300        return Err(Error::rule_config(
301            &spec.id,
302            "commented_out_code `min_lines` must be ≥ 2",
303        ));
304    }
305    if !(0.0..=1.0).contains(&opts.threshold) {
306        return Err(Error::rule_config(
307            &spec.id,
308            "commented_out_code `threshold` must be between 0.0 and 1.0",
309        ));
310    }
311    Ok(Box::new(CommentedOutCodeRule {
312        id: spec.id.clone(),
313        level: spec.level,
314        policy_url: spec.policy_url.clone(),
315        message: spec.message.clone(),
316        scope: Scope::from_spec(spec)?,
317        language: opts.language,
318        min_lines: opts.min_lines,
319        threshold: opts.threshold,
320        skip_leading_lines: opts.skip_leading_lines,
321    }))
322}
323
324// ─── block detection ───────────────────────────────────────────
325
/// One comment block found in a file: either a single block comment or a
/// run of consecutive line comments.
#[derive(Debug)]
struct CommentBlock {
    /// 1-based line number of the block's first line.
    start_line: usize,
    /// The block's source lines, verbatim (markers included).
    lines: Vec<String>,
    /// The block's text with comment markers stripped, one source line
    /// per `\n`-terminated line. This is what the density scorer sees.
    content: String,
    /// True when the block is a doc comment: every line of a line-comment
    /// run carries a doc marker (e.g. `///`), or the block comment opens
    /// with the doc delimiter (`/**`).
    is_doc_comment: bool,
}
337
338fn find_comment_blocks(text: &str, lang: Language) -> Vec<CommentBlock> {
339    let mut blocks = Vec::new();
340    let line_markers = lang.line_markers();
341    let doc_line_markers = lang.doc_line_markers();
342    let block_delim = lang.block_delim();
343    let doc_block_delim = lang.doc_block_delim();
344
345    let lines: Vec<&str> = text.lines().collect();
346    let mut i = 0;
347    while i < lines.len() {
348        let line = lines[i];
349        let trimmed = line.trim_start();
350
351        // Block-comment open (`/* … */`) — consume until close.
352        if let Some((open, close)) = block_delim {
353            if trimmed.starts_with(open) {
354                let is_doc = doc_block_delim.is_some_and(|(d_open, _)| trimmed.starts_with(d_open));
355                let start_line = i + 1;
356                let mut block_lines = Vec::new();
357                let mut block_content = String::new();
358                let mut closed = false;
359                let mut j = i;
360                while j < lines.len() {
361                    let l = lines[j];
362                    block_lines.push(l.to_string());
363                    let stripped = strip_block_comment_markers(l, open, close);
364                    block_content.push_str(&stripped);
365                    block_content.push('\n');
366                    if l.contains(close) && (j > i || trimmed.matches(close).count() > 0) {
367                        closed = true;
368                        j += 1;
369                        break;
370                    }
371                    j += 1;
372                }
373                if closed {
374                    blocks.push(CommentBlock {
375                        start_line,
376                        lines: block_lines,
377                        content: block_content,
378                        is_doc_comment: is_doc,
379                    });
380                }
381                i = j;
382                continue;
383            }
384        }
385
386        // Line-comment run (consecutive `//` / `#` lines).
387        if line_markers.iter().any(|m| trimmed.starts_with(*m)) {
388            let start_line = i + 1;
389            let mut block_lines = Vec::new();
390            let mut block_content = String::new();
391            let mut all_doc = !doc_line_markers.is_empty();
392            let mut j = i;
393            while j < lines.len() {
394                let l = lines[j];
395                let lt = l.trim_start();
396                let Some(m) = line_markers.iter().find(|mk| lt.starts_with(*mk)).copied() else {
397                    break;
398                };
399                let is_doc_line = doc_line_markers.iter().any(|d| {
400                    lt.starts_with(d)
401                        && (lt.len() == d.len()
402                            || !lt[d.len()..].starts_with(m.chars().next().unwrap_or(' ')))
403                });
404                if !is_doc_line {
405                    all_doc = false;
406                }
407                block_lines.push(l.to_string());
408                block_content.push_str(strip_line_marker(lt, m));
409                block_content.push('\n');
410                j += 1;
411            }
412            blocks.push(CommentBlock {
413                start_line,
414                lines: block_lines,
415                content: block_content,
416                is_doc_comment: all_doc,
417            });
418            i = j;
419            continue;
420        }
421
422        i += 1;
423    }
424    blocks
425}
426
/// Remove a leading `marker` from `line`, then at most one space after
/// it. Each step is skipped when its prefix is absent, so a line without
/// the marker only loses a single leading space (if it has one).
fn strip_line_marker<'a>(line: &'a str, marker: &str) -> &'a str {
    [marker, " "]
        .iter()
        .fold(line, |rest, prefix| rest.strip_prefix(*prefix).unwrap_or(rest))
}
431
/// Strip block-comment decoration from one line of a block comment.
///
/// The line is trimmed, then a leading `open` and a trailing `close` are
/// removed when present. If what remains starts (after indentation) with
/// the ` * ` continuation prefix used by Javadoc-style comments, that
/// prefix is dropped too; a lone `*` becomes the empty string. Otherwise
/// the text is returned as-is, including any whitespace left next to the
/// removed delimiters.
fn strip_block_comment_markers(line: &str, open: &str, close: &str) -> String {
    let body = line.trim();
    let body = body.strip_prefix(open).unwrap_or(body);
    let body = body.strip_suffix(close).unwrap_or(body);
    match body.trim_start() {
        "*" => String::new(),
        inner => inner.strip_prefix("* ").unwrap_or(body).to_owned(),
    }
}
450
451// ─── density scoring ───────────────────────────────────────────
452
/// Structural punctuation that is common in code and rare in English
/// prose: brackets, the statement terminator, and assignment /
/// comparison / bitwise operators.
///
/// Backticks and quotes are deliberately left out. Backticks delimit
/// code references in rustdoc / `TSDoc` prose
/// (`` `foo` matches `bar` ``) and double quotes appear in ordinary
/// English, so counting either would inflate the score of legitimate
/// prose comments.
const STRONG_CODE_CHARS: &[char] = &[
    // brackets
    '(', ')', '{', '}', '[', ']',
    // terminator and operators
    ';', '=', '<', '>', '&', '|', '^',
];

/// Raw density that maps to a normalised density of 1.0; the scorer
/// divides by this value and clamps the result to 1.0. 0.20 was chosen
/// empirically by sampling: typical Rust / TS / Python code blocks sit
/// at 0.18-0.30, pure English prose below 0.05.
const SATURATION_POINT: f64 = 0.20;
471
472/// Punctuation-density score in [0.0, 1.0]. See module-level
473/// rustdoc for the design rationale — the short version is
474/// "count brackets / semicolons / assignment operators, ignore
475/// identifier tokens (prose has identifier-shaped words too)."
476///
477/// Pre-pass: any run of 5+ identical characters gets dropped
478/// before scoring, so ASCII-art separators
479/// (`============================================`, `----`,
480/// `####`) don't inflate the structural-char count and
481/// flag a banner comment as "looks like code."
482fn score_density(content: &str) -> f64 {
483    let collapsed = drop_long_runs(content);
484    let nonws_count = collapsed.chars().filter(|c| !c.is_whitespace()).count();
485    if nonws_count == 0 {
486        return 0.0;
487    }
488    let strong_count = collapsed
489        .chars()
490        .filter(|c| STRONG_CODE_CHARS.contains(c))
491        .count();
492    #[allow(clippy::cast_precision_loss)]
493    let raw = strong_count as f64 / nonws_count as f64;
494    (raw / SATURATION_POINT).min(1.0)
495}
496
/// Remove every run of 5 or more identical consecutive characters from
/// `s`; shorter runs are copied through unchanged.
///
/// Used before density scoring to defang ASCII-art separators and
/// banners (`==========`): those are layout, not code structure, and
/// would otherwise inflate the strong-character count. The rule applies
/// to any character, whitespace included.
fn drop_long_runs(s: &str) -> String {
    /// Shortest run length that gets dropped.
    const MIN_DROPPED_RUN: usize = 5;

    let mut kept = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(current) = chars.next() {
        // Measure the run that starts at `current`.
        let mut run_len = 1;
        while chars.next_if_eq(&current).is_some() {
            run_len += 1;
        }
        if run_len < MIN_DROPPED_RUN {
            kept.extend(std::iter::repeat(current).take(run_len));
        }
    }
    kept
}
522
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `src` and return its single comment block, asserting that
    /// exactly one block was found.
    fn only_block(src: &str, lang: Language) -> CommentBlock {
        let mut found = find_comment_blocks(src, lang);
        assert_eq!(found.len(), 1);
        found.pop().expect("length was just asserted to be 1")
    }

    #[test]
    fn density_high_for_code_low_for_prose() {
        // Real code scores high.
        let d_code = score_density("let x = compute(y, z); if x > 0 { return x; }");
        assert!(d_code > 0.5, "code density {d_code} should be > 0.5");

        // Plain prose scores low.
        let d_prose =
            score_density("This module parses RFC 9535 JSONPath expressions and resolves them.");
        assert!(d_prose < 0.5, "prose density {d_prose} should be < 0.5");
    }

    #[test]
    fn line_block_in_rust_detected_with_markers_stripped() {
        let src = [
            "fn main() {}",
            "// let x = compute(y);",
            "// if x > 0 { return x; }",
            "// log(\"unused\");",
            "fn other() {}",
        ]
        .join("\n");
        let block = only_block(&src, Language::Rust);
        assert_eq!(block.lines.len(), 3);
        assert_eq!(block.start_line, 2);
        assert!(block.content.contains("let x = compute(y);"));
        assert!(!block.is_doc_comment);
    }

    #[test]
    fn rust_doc_line_comments_marked_as_doc() {
        let src = [
            "/// Documents the next item.",
            "/// More docs.",
            "/// Even more.",
            "fn foo() {}",
        ]
        .join("\n");
        let block = only_block(&src, Language::Rust);
        assert!(block.is_doc_comment, "/// block must be marked as doc");
    }

    #[test]
    fn block_comment_javadoc_marked_as_doc() {
        let src = [
            "/**",
            " * Documented.",
            " * @param x foo",
            " */",
            "function bar() {}",
        ]
        .join("\n");
        let found = find_comment_blocks(&src, Language::Typescript);
        assert!(!found.is_empty());
        assert!(found[0].is_doc_comment, "/** … */ must be marked as doc");
    }

    #[test]
    fn python_hash_block_detected() {
        let src = [
            "x = 1",
            "# old = compute(x)",
            "# if old > 0:",
            "#    print(old)",
            "print(x)",
        ]
        .join("\n");
        let block = only_block(&src, Language::Python);
        assert!(block.content.contains("old = compute(x)"));
    }

    #[test]
    fn end_to_end_threshold_filters_prose() {
        // A 3-line `//` block of prose stays below the default threshold.
        let prose_src = [
            "fn foo() {}",
            "// This is a normal explanatory comment",
            "// describing what foo does.",
            "// Multiple lines of prose.",
        ]
        .join("\n");
        let d = score_density(&only_block(&prose_src, Language::Rust).content);
        assert!(d < 0.5, "prose comment density {d} should be < 0.5");

        // A 3-line `//` block of code reaches the default threshold.
        let code_src = [
            "fn foo() {}",
            "// let x = compute(y);",
            "// if x > 0 { return x; }",
            "// log_metric(\"path-a\", x);",
        ]
        .join("\n");
        let d = score_density(&only_block(&code_src, Language::Rust).content);
        assert!(d >= 0.5, "code comment density {d} should be >= 0.5");
    }

    #[test]
    fn banner_separators_dont_score_as_code() {
        // Common pattern: ASCII-art banner around a section title.
        let rule = "// ============================================";
        let banner = [rule, "// Section Title", rule].join("\n");
        let d = score_density(&only_block(&banner, Language::Rust).content);
        assert!(d < 0.5, "banner density {d} should be < 0.5");
    }

    #[test]
    fn drop_long_runs_strips_banners() {
        let cases = [
            ("foo ============= bar", "foo  bar"),
            ("a==b", "a==b"),     // run of 2, kept
            ("a===b", "a===b"),   // run of 3, kept
            ("a====b", "a====b"), // run of 4, kept
            ("a=====b", "ab"),    // run of 5, dropped
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(drop_long_runs(input), *expected);
        }
    }

    #[test]
    fn language_extension_resolution() {
        let cases = [
            ("foo.rs", Language::Rust),
            ("foo.py", Language::Python),
            ("foo.tsx", Language::Typescript),
            ("unknown", Language::Auto),
        ];
        for (name, expected) in cases.iter() {
            assert_eq!(Language::Auto.resolve(Path::new(name)), *expected);
        }
    }
}