Skip to main content

alint_rules/
commented_out_code.rs

1//! `commented_out_code` — heuristic detector for blocks of
2//! commented-out source code (as opposed to prose comments,
3//! license headers, doc comments, or ASCII banners).
4//!
5//! Targets the "agent left dead code behind" pattern: agents
6//! tend to comment-rather-than-delete during iteration, and
7//! the leftovers accumulate. Existing primitives can ban
8//! specific phrasings but can't catch the generic
9//! "block-of-code-shaped-comments" pattern.
10//!
11//! Design doc: `docs/design/v0.7/commented_out_code.md`.
12//!
13//! ## Heuristic
14//!
15//! For each consecutive run of comment lines (≥ `min_lines`),
16//! count the fraction of non-whitespace characters that are
17//! **structural punctuation strongly biased toward code**:
18//!
19//! ```text
20//!   strong_chars = ( ) { } [ ] ; = < > & | ^
21//!   raw_density  = count(strong_chars) / non-whitespace-char-count
22//! ```
23//!
24//! Backticks and quotes are deliberately excluded — backticks
25//! show up constantly in rustdoc / `TSDoc` prose to delimit code
26//! references (`` `foo` matches `bar` ``), and double quotes
27//! appear in normal English. Including either inflates the
28//! score on legitimate prose comments.
29//!
30//! Then normalise so the user-facing `threshold` field has a
31//! useful midpoint at `0.5`:
32//!
33//! ```text
34//!   density = min(raw_density / 0.20, 1.0)
35//! ```
36//!
37//! At `raw_density = 0.20` (i.e. one-fifth of non-whitespace
38//! chars are strong-code chars), the normalised density is
39//! `1.0`. Real code blocks comfortably exceed this; English
40//! prose is well below it because everyday writing rarely
41//! uses brackets, semicolons, or assignment operators.
42//!
43//! Density ≥ `threshold` (default 0.5) marks the block as
44//! code-shaped. Doc-comment markers (`///`, `/** */`) and
45//! the file's first `skip_leading_lines` lines (license
46//! headers) are excluded by construction.
47//!
48//! The score deliberately does NOT use identifier-token
49//! density: English prose is dominated by 3+-letter words
50//! that look identifier-shaped, so identifier counts can't
51//! discriminate code from explanation. Punctuation can.
52
53use std::path::Path;
54
55use alint_core::{
56    Context, Error, Level, PerFileRule, Result, Rule, RuleSpec, Scope, Violation, eval_per_file,
57};
58use serde::Deserialize;
59
/// User-facing options for the `commented_out_code` rule, deserialized
/// from the rule spec. Unknown keys are rejected (`deny_unknown_fields`).
#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct Options {
    /// `auto` (default) infers the comment-marker set from
    /// each file's extension. Explicit override useful for
    /// embedded DSLs or cases where the extension lies.
    /// Under `auto`, files with an unrecognised extension are
    /// skipped without producing violations.
    #[serde(default)]
    language: Language,
    /// Minimum consecutive comment-line count for a block to
    /// be considered. 1-2 line comments are almost always
    /// prose; 3+ starts looking like dead code. Default 3.
    /// `build` rejects values below 2.
    #[serde(default = "default_min_lines")]
    min_lines: usize,
    /// Normalised punctuation-density floor (0.0-1.0). A block
    /// is reported when its score is greater than or equal to
    /// this value. Higher = stricter (only the most code-shaped
    /// blocks fire). Default 0.5. `build` rejects values
    /// outside `0.0..=1.0`.
    #[serde(default = "default_threshold")]
    threshold: f64,
    /// Skip any comment block that STARTS within the first N
    /// lines of a file. Defaults to 30 to pass over license
    /// headers without false-positive flagging them as
    /// commented-out code.
    #[serde(default = "default_skip_leading_lines")]
    skip_leading_lines: usize,
}
83
/// Serde fallback for `Options::min_lines`: a block needs at least
/// three consecutive comment lines before it is scored.
fn default_min_lines() -> usize {
    const MIN_LINES: usize = 3;
    MIN_LINES
}
/// Serde fallback for `Options::threshold`: the midpoint of the
/// normalised density scale.
fn default_threshold() -> f64 {
    const THRESHOLD: f64 = 0.5;
    THRESHOLD
}
/// Serde fallback for `Options::skip_leading_lines`: the first thirty
/// lines of a file are treated as potential license-header territory.
fn default_skip_leading_lines() -> usize {
    const LEADING_LINES: usize = 30;
    LEADING_LINES
}
93
/// Language selector that decides which comment markers are recognised.
///
/// Deserialized from snake_case names (`auto`, `rust`, `typescript`, …).
/// `Auto` is both the default option value and the "could not determine
/// a language" result of `Language::resolve`.
#[derive(Debug, Deserialize, Default, Clone, Copy, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
enum Language {
    /// Infer the language from the file extension.
    #[default]
    Auto,
    /// `.rs` — `//` line comments, `///` / `//!` doc lines, `/* */` blocks.
    Rust,
    /// `.ts`, `.tsx`.
    Typescript,
    /// `.js`, `.jsx`, `.mjs`, `.cjs`.
    Javascript,
    /// `.py` — `#` line comments only.
    Python,
    /// `.go`.
    Go,
    /// `.java`; extension inference also maps `.kt`, `.kts` and `.scala` here.
    Java,
    /// `.c`, `.h`.
    C,
    /// `.cc`, `.cpp`, `.cxx`, `.hpp`, `.hh`.
    Cpp,
    /// `.rb` — `#` line comments only.
    Ruby,
    /// `.sh`, `.bash`, `.zsh`, `.fish` — `#` line comments only.
    Shell,
}
110
111impl Language {
112    /// Resolve a language to its concrete value (never `Auto`)
113    /// based on a file extension.
114    fn resolve(self, path: &Path) -> Self {
115        if self != Self::Auto {
116            return self;
117        }
118        let ext = path
119            .extension()
120            .and_then(|s| s.to_str())
121            .unwrap_or("")
122            .to_ascii_lowercase();
123        match ext.as_str() {
124            "rs" => Self::Rust,
125            "ts" | "tsx" => Self::Typescript,
126            "js" | "jsx" | "mjs" | "cjs" => Self::Javascript,
127            "py" => Self::Python,
128            "go" => Self::Go,
129            "java" | "kt" | "kts" | "scala" => Self::Java,
130            "c" | "h" => Self::C,
131            "cc" | "cpp" | "cxx" | "hpp" | "hh" => Self::Cpp,
132            "rb" => Self::Ruby,
133            "sh" | "bash" | "zsh" | "fish" => Self::Shell,
134            _ => Self::Auto, // unknown — caller skips
135        }
136    }
137
138    /// The set of line-comment markers for this language.
139    /// Returned in priority order; the longest-match wins.
140    fn line_markers(self) -> &'static [&'static str] {
141        match self {
142            // Doc-comment markers (`///`, `//!`) are ALSO line comments — we
143            // identify them separately below to skip rather than score.
144            Self::Rust
145            | Self::Typescript
146            | Self::Javascript
147            | Self::Go
148            | Self::Java
149            | Self::C
150            | Self::Cpp => &["//"],
151            Self::Python | Self::Shell | Self::Ruby => &["#"],
152            Self::Auto => &[],
153        }
154    }
155
156    /// Inner-line markers that indicate a DOC comment, not a
157    /// regular line comment. Blocks made entirely of these
158    /// are excluded from density scoring.
159    fn doc_line_markers(self) -> &'static [&'static str] {
160        // `TSDoc` / JSDoc / Javadoc live in `/** */` block comments,
161        // not line comments — they fall through to the empty default.
162        match self {
163            Self::Rust => &["///", "//!"],
164            _ => &[],
165        }
166    }
167
168    /// Block-comment delimiters: (open, close).
169    fn block_delim(self) -> Option<(&'static str, &'static str)> {
170        match self {
171            Self::Rust
172            | Self::Typescript
173            | Self::Javascript
174            | Self::Go
175            | Self::Java
176            | Self::C
177            | Self::Cpp => Some(("/*", "*/")),
178            _ => None,
179        }
180    }
181
182    /// Block-comment delimiters that mark a DOC block (Javadoc
183    /// / `TSDoc` / rustdoc inner block). Skipped, not scored.
184    fn doc_block_delim(self) -> Option<(&'static str, &'static str)> {
185        match self {
186            // /** … */ is Javadoc / `TSDoc` / rustdoc-inner.
187            Self::Rust | Self::Typescript | Self::Javascript | Self::Java | Self::Cpp => {
188                Some(("/**", "*/"))
189            }
190            _ => None,
191        }
192    }
193}
194
/// A configured instance of the `commented_out_code` rule, as produced
/// by `build` from a `RuleSpec` plus validated `Options`.
#[derive(Debug)]
pub struct CommentedOutCodeRule {
    /// Rule identifier, copied from the spec.
    id: String,
    /// Level copied from the spec.
    level: Level,
    /// Optional policy link, copied from the spec.
    policy_url: Option<String>,
    /// Optional custom violation message; when absent a default message
    /// with the block's line count and density is generated.
    message: Option<String>,
    /// Path scope derived from the spec via `Scope::from_spec`.
    scope: Scope,
    /// Language selection; `Auto` is resolved per file from its extension.
    language: Language,
    /// Blocks with fewer lines than this are ignored.
    min_lines: usize,
    /// Minimum normalised density (inclusive) for a block to be reported.
    threshold: f64,
    /// Blocks starting on or before this 1-based line number are ignored.
    skip_leading_lines: usize,
}
207
impl Rule for CommentedOutCodeRule {
    // Shared `Rule` boilerplate from `alint_core`; presumably the accessors
    // backed by `id` / `level` / `policy_url` — confirm against the macro.
    alint_core::rule_common_impl!();
    /// This rule is always path-scoped, so the scope is always `Some`.
    fn path_scope(&self) -> Option<&Scope> {
        Some(&self.scope)
    }

    /// Whole-run evaluation: delegates to `eval_per_file`, passing this
    /// rule and the context.
    fn evaluate(&self, ctx: &Context<'_>) -> Result<Vec<Violation>> {
        eval_per_file(self, ctx)
    }

    /// Expose the per-file implementation of this rule.
    fn as_per_file(&self) -> Option<&dyn PerFileRule> {
        Some(self)
    }
}
222
223impl PerFileRule for CommentedOutCodeRule {
224    fn path_scope(&self) -> &Scope {
225        &self.scope
226    }
227
228    fn evaluate_file(
229        &self,
230        _ctx: &Context<'_>,
231        path: &Path,
232        bytes: &[u8],
233    ) -> Result<Vec<Violation>> {
234        let lang = self.language.resolve(path);
235        if lang == Language::Auto {
236            return Ok(Vec::new()); // unknown extension — skip silently
237        }
238        let Ok(text) = std::str::from_utf8(bytes) else {
239            return Ok(Vec::new());
240        };
241        let mut violations = Vec::new();
242        for block in find_comment_blocks(text, lang) {
243            if block.lines.len() < self.min_lines {
244                continue;
245            }
246            if block.start_line <= self.skip_leading_lines {
247                continue;
248            }
249            if block.is_doc_comment {
250                continue;
251            }
252            let density = score_density(&block.content);
253            if density >= self.threshold {
254                let msg = self.message.clone().unwrap_or_else(|| {
255                    format!(
256                        "block of {} commented-out lines (density {:.2}); remove or convert to runtime-checked branch",
257                        block.lines.len(),
258                        density,
259                    )
260                });
261                violations.push(
262                    Violation::new(msg)
263                        .with_path(std::sync::Arc::<Path>::from(path))
264                        .with_location(block.start_line, 1),
265                );
266            }
267        }
268        Ok(violations)
269    }
270}
271
272pub fn build(spec: &RuleSpec) -> Result<Box<dyn Rule>> {
273    let Some(_paths) = &spec.paths else {
274        return Err(Error::rule_config(
275            &spec.id,
276            "commented_out_code requires a `paths` field",
277        ));
278    };
279    let opts: Options = spec
280        .deserialize_options()
281        .map_err(|e| Error::rule_config(&spec.id, format!("invalid options: {e}")))?;
282    if opts.min_lines < 2 {
283        return Err(Error::rule_config(
284            &spec.id,
285            "commented_out_code `min_lines` must be ≥ 2",
286        ));
287    }
288    if !(0.0..=1.0).contains(&opts.threshold) {
289        return Err(Error::rule_config(
290            &spec.id,
291            "commented_out_code `threshold` must be between 0.0 and 1.0",
292        ));
293    }
294    Ok(Box::new(CommentedOutCodeRule {
295        id: spec.id.clone(),
296        level: spec.level,
297        policy_url: spec.policy_url.clone(),
298        message: spec.message.clone(),
299        scope: Scope::from_spec(spec)?,
300        language: opts.language,
301        min_lines: opts.min_lines,
302        threshold: opts.threshold,
303        skip_leading_lines: opts.skip_leading_lines,
304    }))
305}
306
307// ─── block detection ───────────────────────────────────────────
308
/// One contiguous comment region found by `find_comment_blocks`:
/// either a `/* … */` block comment or a run of consecutive
/// line comments.
#[derive(Debug)]
struct CommentBlock {
    /// 1-based line number of the block's first line.
    start_line: usize,
    /// The raw source lines making up the block, with markers and
    /// indentation still in place.
    lines: Vec<String>,
    /// Concatenated comment content with markers stripped, one
    /// `\n`-terminated entry per source line.
    /// This is what the density scorer sees.
    content: String,
    /// True if the block is a doc comment: a block comment that
    /// opens with the doc opener (`/**`), or a line-comment run
    /// in which every line uses a doc marker (e.g. `///`).
    is_doc_comment: bool,
}
320
321fn find_comment_blocks(text: &str, lang: Language) -> Vec<CommentBlock> {
322    let mut blocks = Vec::new();
323    let line_markers = lang.line_markers();
324    let doc_line_markers = lang.doc_line_markers();
325    let block_delim = lang.block_delim();
326    let doc_block_delim = lang.doc_block_delim();
327
328    let lines: Vec<&str> = text.lines().collect();
329    let mut i = 0;
330    while i < lines.len() {
331        let line = lines[i];
332        let trimmed = line.trim_start();
333
334        // Block-comment open (`/* … */`) — consume until close.
335        if let Some((open, close)) = block_delim {
336            if trimmed.starts_with(open) {
337                let is_doc = doc_block_delim.is_some_and(|(d_open, _)| trimmed.starts_with(d_open));
338                let start_line = i + 1;
339                let mut block_lines = Vec::new();
340                let mut block_content = String::new();
341                let mut closed = false;
342                let mut j = i;
343                while j < lines.len() {
344                    let l = lines[j];
345                    block_lines.push(l.to_string());
346                    let stripped = strip_block_comment_markers(l, open, close);
347                    block_content.push_str(&stripped);
348                    block_content.push('\n');
349                    if l.contains(close) && (j > i || trimmed.matches(close).count() > 0) {
350                        closed = true;
351                        j += 1;
352                        break;
353                    }
354                    j += 1;
355                }
356                if closed {
357                    blocks.push(CommentBlock {
358                        start_line,
359                        lines: block_lines,
360                        content: block_content,
361                        is_doc_comment: is_doc,
362                    });
363                }
364                i = j;
365                continue;
366            }
367        }
368
369        // Line-comment run (consecutive `//` / `#` lines).
370        if line_markers.iter().any(|m| trimmed.starts_with(*m)) {
371            let start_line = i + 1;
372            let mut block_lines = Vec::new();
373            let mut block_content = String::new();
374            let mut all_doc = !doc_line_markers.is_empty();
375            let mut j = i;
376            while j < lines.len() {
377                let l = lines[j];
378                let lt = l.trim_start();
379                let Some(m) = line_markers.iter().find(|mk| lt.starts_with(*mk)).copied() else {
380                    break;
381                };
382                let is_doc_line = doc_line_markers.iter().any(|d| {
383                    lt.starts_with(d)
384                        && (lt.len() == d.len()
385                            || !lt[d.len()..].starts_with(m.chars().next().unwrap_or(' ')))
386                });
387                if !is_doc_line {
388                    all_doc = false;
389                }
390                block_lines.push(l.to_string());
391                block_content.push_str(strip_line_marker(lt, m));
392                block_content.push('\n');
393                j += 1;
394            }
395            blocks.push(CommentBlock {
396                start_line,
397                lines: block_lines,
398                content: block_content,
399                is_doc_comment: all_doc,
400            });
401            i = j;
402            continue;
403        }
404
405        i += 1;
406    }
407    blocks
408}
409
/// Remove a leading `marker` from `line`, then at most one space that
/// follows it. A line that does not start with `marker` is left intact
/// apart from that single optional leading space.
fn strip_line_marker<'a>(line: &'a str, marker: &str) -> &'a str {
    let body = match line.strip_prefix(marker) {
        Some(rest) => rest,
        None => line,
    };
    match body.strip_prefix(' ') {
        Some(rest) => rest,
        None => body,
    }
}
414
/// Reduce one line of a block comment to its content.
///
/// The line is trimmed, a leading `open` and a trailing `close` are
/// removed when present, and a Javadoc / rustdoc style continuation
/// prefix is dropped: `* text` becomes `text`, and a lone `*` becomes
/// the empty string. Lines without such a prefix keep whatever
/// whitespace remained after the delimiters were removed.
fn strip_block_comment_markers(line: &str, open: &str, close: &str) -> String {
    let core = line.trim();
    let core = core.strip_prefix(open).unwrap_or(core);
    let core = core.strip_suffix(close).unwrap_or(core);

    let content = match core.trim_start() {
        "*" => "",
        lead => lead.strip_prefix("* ").unwrap_or(core),
    };
    content.to_string()
}
433
434// ─── density scoring ───────────────────────────────────────────
435
436/// Characters strongly biased toward code over English prose.
437/// Brackets and assignment / comparison operators show up
438/// constantly in code and almost never in normal writing.
439/// Backticks and quotes are NOT included — backticks delimit
440/// code references in rustdoc / `TSDoc` prose
441/// (`` `foo` matches `bar` ``), double quotes appear in normal
442/// English. Either would inflate the score on legitimate prose
443/// comments.
444const STRONG_CODE_CHARS: &[char] = &[
445    '(', ')', '{', '}', '[', ']', ';', '=', '<', '>', '&', '|', '^',
446];
447
448/// `raw_density / SATURATION_POINT` is clamped to 1.0, so this
449/// is the raw-density value that maps to a normalised density
450/// of 1.0. 0.20 was chosen empirically by sampling: typical
451/// Rust / TS / Python code blocks sit at 0.18-0.30; pure
452/// English prose sits below 0.05.
453const SATURATION_POINT: f64 = 0.20;
454
455/// Punctuation-density score in [0.0, 1.0]. See module-level
456/// rustdoc for the design rationale — the short version is
457/// "count brackets / semicolons / assignment operators, ignore
458/// identifier tokens (prose has identifier-shaped words too)."
459///
460/// Pre-pass: any run of 5+ identical characters gets dropped
461/// before scoring, so ASCII-art separators
462/// (`============================================`, `----`,
463/// `####`) don't inflate the structural-char count and
464/// flag a banner comment as "looks like code."
465fn score_density(content: &str) -> f64 {
466    let collapsed = drop_long_runs(content);
467    let nonws_count = collapsed.chars().filter(|c| !c.is_whitespace()).count();
468    if nonws_count == 0 {
469        return 0.0;
470    }
471    let strong_count = collapsed
472        .chars()
473        .filter(|c| STRONG_CODE_CHARS.contains(c))
474        .count();
475    #[allow(clippy::cast_precision_loss)]
476    let raw = strong_count as f64 / nonws_count as f64;
477    (raw / SATURATION_POINT).min(1.0)
478}
479
/// Return `s` with every run of five or more identical consecutive
/// characters removed entirely; shorter runs are copied through as-is.
///
/// This defangs ASCII-art separators / banners (`==========`,
/// `----------`, `##########`) before density scoring: they are layout,
/// not code structure.
fn drop_long_runs(s: &str) -> String {
    /// Longest run that is still kept.
    const MAX_KEPT_RUN: usize = 4;

    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(ch) = chars.next() {
        // Measure the full run that starts at `ch`.
        let mut run_len = 1usize;
        while chars.next_if_eq(&ch).is_some() {
            run_len += 1;
        }
        if run_len <= MAX_KEPT_RUN {
            out.extend(std::iter::repeat(ch).take(run_len));
        }
    }
    out
}
505
// Unit tests for block detection, marker stripping, banner handling,
// density scoring and extension-based language resolution.
#[cfg(test)]
mod tests {
    use super::*;

    /// The scorer separates a line of real code from a prose sentence
    /// around the default 0.5 threshold.
    #[test]
    fn density_high_for_code_low_for_prose() {
        // Real code: high density.
        let code = "let x = compute(y, z); if x > 0 { return x; }";
        let d_code = score_density(code);
        assert!(d_code > 0.5, "code density {d_code} should be > 0.5");

        // Prose: low density.
        let prose = "This module parses RFC 9535 JSONPath expressions and resolves them.";
        let d_prose = score_density(prose);
        assert!(d_prose < 0.5, "prose density {d_prose} should be < 0.5");
    }

    /// A run of `//` lines between code lines forms one block whose
    /// content has the markers removed and whose start line is 1-based.
    #[test]
    fn line_block_in_rust_detected_with_markers_stripped() {
        let src = "fn main() {}\n// let x = compute(y);\n// if x > 0 { return x; }\n// log(\"unused\");\nfn other() {}";
        let blocks = find_comment_blocks(src, Language::Rust);
        assert_eq!(blocks.len(), 1);
        let b = &blocks[0];
        assert_eq!(b.lines.len(), 3);
        assert_eq!(b.start_line, 2);
        assert!(b.content.contains("let x = compute(y);"));
        assert!(!b.is_doc_comment);
    }

    /// A run made entirely of `///` lines is flagged as a doc comment.
    #[test]
    fn rust_doc_line_comments_marked_as_doc() {
        let src = "/// Documents the next item.\n/// More docs.\n/// Even more.\nfn foo() {}";
        let blocks = find_comment_blocks(src, Language::Rust);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].is_doc_comment, "/// block must be marked as doc");
    }

    /// A `/** … */` block in TypeScript is flagged as a doc comment.
    #[test]
    fn block_comment_javadoc_marked_as_doc() {
        let src = "/**\n * Documented.\n * @param x foo\n */\nfunction bar() {}";
        let blocks = find_comment_blocks(src, Language::Typescript);
        assert!(!blocks.is_empty());
        assert!(blocks[0].is_doc_comment, "/** … */ must be marked as doc");
    }

    /// `#` line comments are grouped into a block for Python.
    #[test]
    fn python_hash_block_detected() {
        let src = "x = 1\n# old = compute(x)\n# if old > 0:\n#    print(old)\nprint(x)";
        let blocks = find_comment_blocks(src, Language::Python);
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].content.contains("old = compute(x)"));
    }

    /// Detection plus scoring together: a prose block stays under the
    /// default threshold while a code-shaped block reaches it.
    #[test]
    fn end_to_end_threshold_filters_prose() {
        // A 3-line // block of prose: should NOT score above default.
        let prose_src = "fn foo() {}\n// This is a normal explanatory comment\n// describing what foo does.\n// Multiple lines of prose.";
        let blocks = find_comment_blocks(prose_src, Language::Rust);
        assert_eq!(blocks.len(), 1);
        let d = score_density(&blocks[0].content);
        assert!(d < 0.5, "prose comment density {d} should be < 0.5");

        // A 3-line // block of code: should score above default.
        let code_src = "fn foo() {}\n// let x = compute(y);\n// if x > 0 { return x; }\n// log_metric(\"path-a\", x);";
        let blocks = find_comment_blocks(code_src, Language::Rust);
        assert_eq!(blocks.len(), 1);
        let d = score_density(&blocks[0].content);
        assert!(d >= 0.5, "code comment density {d} should be >= 0.5");
    }

    /// Long `=` separator lines around a title must not push the block
    /// over the default threshold.
    #[test]
    fn banner_separators_dont_score_as_code() {
        // Common pattern: ASCII-art banner around a section title.
        let banner = "// ============================================\n\
                      // Section Title\n\
                      // ============================================";
        let blocks = find_comment_blocks(banner, Language::Rust);
        assert_eq!(blocks.len(), 1);
        let d = score_density(&blocks[0].content);
        assert!(d < 0.5, "banner density {d} should be < 0.5");
    }

    /// Pins the run-length cutoff: runs of up to four characters are
    /// kept, runs of five or more are removed.
    #[test]
    fn drop_long_runs_strips_banners() {
        assert_eq!(drop_long_runs("foo ============= bar"), "foo  bar");
        assert_eq!(drop_long_runs("a==b"), "a==b"); // run of 2, kept
        assert_eq!(drop_long_runs("a===b"), "a===b"); // run of 3, kept
        assert_eq!(drop_long_runs("a====b"), "a====b"); // run of 4, kept
        assert_eq!(drop_long_runs("a=====b"), "ab"); // run of 5, dropped
    }

    /// `Auto` resolves through the file extension and stays `Auto`
    /// when there is no extension to go by.
    #[test]
    fn language_extension_resolution() {
        let path = Path::new("foo.rs");
        assert_eq!(Language::Auto.resolve(path), Language::Rust);
        let path = Path::new("foo.py");
        assert_eq!(Language::Auto.resolve(path), Language::Python);
        let path = Path::new("foo.tsx");
        assert_eq!(Language::Auto.resolve(path), Language::Typescript);
        let path = Path::new("unknown");
        assert_eq!(Language::Auto.resolve(path), Language::Auto);
    }
}