// comment_stripper_rs/lib.rs

use anyhow::{Context, Result};
use globset::{GlobBuilder, GlobSet, GlobSetBuilder};
use ra_ap_rustc_lexer::{strip_shebang, tokenize, FrontmatterAllowed, TokenKind};
use std::collections::HashSet;
use std::ffi::OsStr;
use std::fs;
use std::path::{Path, PathBuf};
use walkdir::{DirEntry, WalkDir};

/// Settings shared by the strip, backup-removal and backup-restore passes.
#[derive(Debug, Clone)]
pub struct Config {
    /// One or more files or directories to walk. Each is traversed independently;
    /// a file that is reachable from more than one root is processed only once.
    pub roots: Vec<PathBuf>,
    /// Dry run: report the files that would change without writing anything.
    /// The strip pass returns an error when at least one file would change.
    pub check: bool,
    /// Print the path of every file that is changed, removed, or restored.
    pub verbose: bool,
    /// When false, directories below a root whose name starts with `.` are
    /// skipped. A root itself is always walked.
    pub hidden: bool,
    /// Follow symbolic links while walking.
    pub follow_links: bool,
    /// When true, files are overwritten without first being copied to a backup.
    pub no_backup: bool,
    /// When true, rustdoc comments (`///`, `//!`, `/** */`, `/*! */`) are stripped
    /// as well; otherwise they are preserved.
    pub strip_doc_comments: bool,
    /// Text appended to a file's name to form the name of its backup copy. Must
    /// be non-empty unless `no_backup` is set.
    pub backup_suffix: String,
    /// Directory names to prune from the walk. Each entry is compared for exact
    /// equality with a directory's own name (not its full path).
    pub exclude_dirs: Vec<String>,
    /// Glob patterns. When non-empty, only files matching at least one pattern
    /// are stripped. A pattern may match the file's path relative to the root it
    /// was found under, its path as walked, or its canonical absolute path.
    /// Excluded directories are pruned before this check, so they always win
    /// over includes.
    pub include_globs: Vec<String>,
}

/// Counters returned by each pass over the tree.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct RunStats {
    /// Number of files the pass selected and acted on.
    pub files_seen: usize,
    /// Number of those files that were changed: rewritten (or, in check mode,
    /// would be rewritten) by the strip pass, or removed / restored by the
    /// backup passes.
    pub files_changed: usize,
}

37impl Config {
38    pub fn validate(&self) -> Result<()> {
39        anyhow::ensure!(
40            self.no_backup || !self.backup_suffix.is_empty(),
41            "backup suffix must not be empty"
42        );
43        Ok(())
44    }
45}
46
47pub fn run(cfg: &Config) -> Result<RunStats> {
48    cfg.validate()?;
49    let include = build_include_set(&cfg.include_globs)?;
50    let mut stats = RunStats::default();
51    let mut seen = HashSet::new();
52
53    for root in &cfg.roots {
54        let walker = WalkDir::new(root)
55            .follow_links(cfg.follow_links)
56            .into_iter()
57            .filter_entry(|e| !should_skip_dir(e, cfg.hidden, &cfg.exclude_dirs));
58
59        for entry in walker {
60            let entry =
61                entry.with_context(|| format!("failed while walking {}", root.display()))?;
62            if entry.file_type().is_file()
63                && is_rust_file(entry.path())
64                && is_included(entry.path(), root, &include)
65                && seen.insert(dedup_key(entry.path()))
66            {
67                stats.files_seen += 1;
68                if process_file(entry.path(), cfg)? {
69                    stats.files_changed += 1;
70                }
71            }
72        }
73    }
74
75    if cfg.check && stats.files_changed > 0 {
76        anyhow::bail!("{} file(s) would change", stats.files_changed);
77    }
78
79    Ok(stats)
80}
81
82/// Walk the tree and delete every file whose name ends with the backup suffix.
83pub fn remove_backups(cfg: &Config) -> Result<RunStats> {
84    anyhow::ensure!(
85        !cfg.backup_suffix.is_empty(),
86        "backup suffix must not be empty"
87    );
88    let mut stats = RunStats::default();
89    let mut seen = HashSet::new();
90
91    for root in &cfg.roots {
92        let walker = WalkDir::new(root)
93            .follow_links(cfg.follow_links)
94            .into_iter()
95            .filter_entry(|e| !should_skip_dir(e, cfg.hidden, &cfg.exclude_dirs));
96
97        for entry in walker {
98            let entry =
99                entry.with_context(|| format!("failed while walking {}", root.display()))?;
100            if entry.file_type().is_file()
101                && is_backup_file(entry.path(), &cfg.backup_suffix)
102                && seen.insert(dedup_key(entry.path()))
103            {
104                stats.files_seen += 1;
105                if cfg.verbose {
106                    println!("{}", entry.path().display());
107                }
108                fs::remove_file(entry.path())
109                    .with_context(|| format!("failed to remove {}", entry.path().display()))?;
110                stats.files_changed += 1;
111            }
112        }
113    }
114
115    Ok(stats)
116}
117
118/// Walk the tree and restore every backup file over its original, then delete the
119/// backup. A backup named `foo.rs<suffix>` is moved back to `foo.rs`, undoing a
120/// previous strip.
121pub fn restore_backups(cfg: &Config) -> Result<RunStats> {
122    anyhow::ensure!(
123        !cfg.backup_suffix.is_empty(),
124        "backup suffix must not be empty"
125    );
126    let mut stats = RunStats::default();
127    let mut seen = HashSet::new();
128
129    for root in &cfg.roots {
130        let walker = WalkDir::new(root)
131            .follow_links(cfg.follow_links)
132            .into_iter()
133            .filter_entry(|e| !should_skip_dir(e, cfg.hidden, &cfg.exclude_dirs));
134
135        for entry in walker {
136            let entry =
137                entry.with_context(|| format!("failed while walking {}", root.display()))?;
138            if entry.file_type().is_file()
139                && is_backup_file(entry.path(), &cfg.backup_suffix)
140                && seen.insert(dedup_key(entry.path()))
141            {
142                let backup = entry.path();
143                let original = original_path(backup, &cfg.backup_suffix)?;
144                stats.files_seen += 1;
145                if cfg.verbose {
146                    println!("{} -> {}", backup.display(), original.display());
147                }
148                fs::rename(backup, &original).with_context(|| {
149                    format!(
150                        "failed to restore {} -> {}",
151                        backup.display(),
152                        original.display()
153                    )
154                })?;
155                stats.files_changed += 1;
156            }
157        }
158    }
159
160    Ok(stats)
161}
162
163/// Strip all non-rustdoc comments, preserving rustdoc comments (`///`, `//!`,
164/// `/** */`, `/*! */`). Equivalent to `strip_comments(input, false)`.
165pub fn strip_non_doc_comments(input: &str) -> Result<String> {
166    strip_comments(input, false)
167}
168
169/// Strip comments from Rust source. When `strip_docs` is true, rustdoc comments
170/// are removed as well; otherwise they are preserved.
171pub fn strip_comments(input: &str, strip_docs: bool) -> Result<String> {
172    let mut output = String::with_capacity(input.len());
173
174    let mut protected: Vec<(usize, usize)> = Vec::new();
175    let mut offset = 0usize;
176
177    if let Some(shebang_len) = strip_shebang(input) {
178        output.push_str(&input[..shebang_len]);
179        offset = shebang_len;
180    }
181
182    let rest = &input[offset..];
183    let mut pos = 0usize;
184
185    // Set after removing a comment that was alone on its line; tells the next
186    // whitespace token to drop the newline that terminated that line, so the
187    // whole line disappears instead of being left blank.
188    let mut swallow_newline = false;
189
190    for token in tokenize(rest, FrontmatterAllowed::Yes) {
191        let len = token.len as usize;
192        let end = pos + len;
193        let text = &rest[pos..end];
194        pos = end;
195
196        let swallow = swallow_newline;
197        swallow_newline = false;
198
199        match token.kind {
200            TokenKind::LineComment { doc_style } | TokenKind::BlockComment { doc_style, .. } => {
201                if doc_style.is_some() && !strip_docs {
202                    push_protected(&mut output, &mut protected, text);
203                } else if let Some(line_start) = blank_line_start(&output) {
204                    // The comment is the only content on its line. Drop the
205                    // leading indentation we already emitted and swallow the
206                    // trailing newline so no blank line is left behind.
207                    output.truncate(line_start);
208                    swallow_newline = true;
209                } else {
210                    preserve_removed_comment(text, &mut output);
211                }
212            }
213            TokenKind::Literal { .. } | TokenKind::Frontmatter { .. } => {
214                push_protected(&mut output, &mut protected, text);
215            }
216            _ => {
217                let text = if swallow {
218                    strip_one_leading_newline(text)
219                } else {
220                    text
221                };
222                output.push_str(text);
223            }
224        }
225    }
226
227    anyhow::ensure!(pos == rest.len(), "lexer did not consume full input");
228    Ok(strip_trailing_whitespace(&output, &protected))
229}
230
/// Append `text` to `output` and record the byte range it occupies in
/// `protected`, marking it as content that later cleanup must not alter.
/// Ranges are half-open `(start, end)` and are appended in increasing order.
fn push_protected(output: &mut String, protected: &mut Vec<(usize, usize)>, text: &str) {
    let start = output.len();
    output.push_str(text);
    protected.push((start, output.len()));
}

/// Remove spaces and tabs that sit at the end of a line (before a newline) or at
/// end of input. Skips bytes inside `protected` ranges so whitespace that lives
/// inside a string literal, frontmatter, or rustdoc comment is preserved exactly.
///
/// `protected` holds half-open `(start, end)` byte ranges into `output` and must
/// be sorted by start, because membership is decided by binary search.
fn strip_trailing_whitespace(output: &str, protected: &[(usize, usize)]) -> String {
    let bytes = output.as_bytes();
    let n = bytes.len();

    // Find the last range starting at or before `idx` and test whether it
    // extends past `idx`.
    let is_protected = |idx: usize| {
        let i = protected.partition_point(|&(s, _)| s <= idx);
        i > 0 && protected[i - 1].1 > idx
    };

    let mut result = String::with_capacity(n);
    // Start of the pending slice of `output` not yet copied into `result`.
    let mut seg_start = 0usize;
    let mut i = 0usize;
    while i < n {
        if (bytes[i] == b' ' || bytes[i] == b'\t') && !is_protected(i) {
            // Extend over the whole run of unprotected blanks.
            let mut j = i;
            while j < n && (bytes[j] == b' ' || bytes[j] == b'\t') && !is_protected(j) {
                j += 1;
            }
            let ends_line = j >= n || bytes[j] == b'\n' || bytes[j] == b'\r';
            if ends_line {
                // Copy everything before the run and skip the run itself.
                result.push_str(&output[seg_start..i]);
                seg_start = j;
            }
            i = j;
        } else {
            i += 1;
        }
    }
    result.push_str(&output[seg_start..]);
    result
}

/// If everything after the last newline in `output` is blank (spaces/tabs or
/// nothing), return the byte index where that trailing run begins. A comment
/// reached in this state has nothing but indentation before it on its line.
/// Returns `None` when there is other content on the line.
fn blank_line_start(output: &str) -> Option<usize> {
    // With no newline at all, the current line starts at the beginning.
    let start = output.rfind('\n').map(|i| i + 1).unwrap_or(0);
    if output[start..].bytes().all(|b| b == b' ' || b == b'\t') {
        Some(start)
    } else {
        None
    }
}

/// Drop a single leading newline (`\n` or `\r\n`) from `text`, leaving any
/// further whitespace intact so originally-blank lines are preserved. Text that
/// does not start with a newline is returned unchanged.
fn strip_one_leading_newline(text: &str) -> &str {
    text.strip_prefix("\r\n")
        .or_else(|| text.strip_prefix('\n'))
        .unwrap_or(text)
}

/// Emit the line terminators that a removed comment contained, so that deleting
/// the comment does not join lines or change the file's line-ending style.
///
/// - Line comment (`//…`): emits the terminator characters found at the very end
///   of the token, if any. The lexer is expected to end a line comment at `\n`,
///   so in CRLF input the token ends with a lone `\r` that must be kept for the
///   following `\n` to remain a `\r\n` pair.
/// - Block comment: emits one terminator per newline inside the comment, as
///   `\r\n` when the newline was preceded by `\r` and as `\n` otherwise.
fn preserve_removed_comment(comment: &str, out: &mut String) {
    if comment.starts_with("//") {
        let terminator = if comment.ends_with("\r\n") {
            "\r\n"
        } else if comment.ends_with('\n') {
            "\n"
        } else if comment.ends_with('\r') {
            "\r"
        } else {
            ""
        };
        out.push_str(terminator);
        return;
    }

    let bytes = comment.as_bytes();
    for (i, &b) in bytes.iter().enumerate() {
        if b == b'\n' {
            if i > 0 && bytes[i - 1] == b'\r' {
                out.push('\r');
            }
            out.push('\n');
        }
    }
}

308fn process_file(path: &Path, cfg: &Config) -> Result<bool> {
309    let original =
310        fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))?;
311    let stripped = strip_comments(&original, cfg.strip_doc_comments)
312        .with_context(|| format!("failed to strip comments in {}", path.display()))?;
313
314    if stripped == original {
315        return Ok(false);
316    }
317
318    if cfg.verbose || cfg.check {
319        println!("{}", path.display());
320    }
321
322    if !cfg.check {
323        if !cfg.no_backup {
324            let backup = backup_path(path, &cfg.backup_suffix)?;
325            fs::copy(path, &backup).with_context(|| {
326                format!(
327                    "failed to create backup {} -> {}",
328                    path.display(),
329                    backup.display()
330                )
331            })?;
332        }
333        fs::write(path, stripped).with_context(|| format!("failed to write {}", path.display()))?;
334    }
335
336    Ok(true)
337}
338
339fn backup_path(path: &Path, suffix: &str) -> Result<PathBuf> {
340    let file_name = path
341        .file_name()
342        .and_then(|s| s.to_str())
343        .context("invalid UTF-8 file name")?;
344    Ok(path.with_file_name(format!("{file_name}{suffix}")))
345}
346
347/// Reverse of [`backup_path`]: strip `suffix` from a backup file's name to recover
348/// the original path. Callers ensure `path` ends with `suffix` (via
349/// [`is_backup_file`]).
350fn original_path(path: &Path, suffix: &str) -> Result<PathBuf> {
351    let file_name = path
352        .file_name()
353        .and_then(|s| s.to_str())
354        .context("invalid UTF-8 file name")?;
355    let stem = file_name
356        .strip_suffix(suffix)
357        .with_context(|| format!("{file_name} does not end with backup suffix {suffix}"))?;
358    Ok(path.with_file_name(stem))
359}
360
361fn is_hidden(entry: &DirEntry) -> bool {
362    entry
363        .file_name()
364        .to_str()
365        .map(|s| s.starts_with('.'))
366        .unwrap_or(false)
367}
368
369fn should_skip_dir(entry: &DirEntry, hidden: bool, excluded: &[String]) -> bool {
370    if !entry.file_type().is_dir() {
371        return false;
372    }
373    if entry.depth() > 0 && !hidden && is_hidden(entry) {
374        return true;
375    }
376    let name = entry.file_name().to_string_lossy();
377    excluded.iter().any(|x| x == &name)
378}
379
/// True when the path's extension is exactly `rs` (case-sensitive).
fn is_rust_file(path: &Path) -> bool {
    path.extension() == Some(OsStr::new("rs"))
}

384/// Compile the include patterns into a matcher. Returns `Ok(None)` when no
385/// patterns are given, which the caller treats as "match everything". Patterns
386/// use `literal_separator` semantics, so `*` stops at `/` and `**` is required to
387/// cross directory boundaries (e.g. `src/**/*.rs`).
388fn build_include_set(globs: &[String]) -> Result<Option<GlobSet>> {
389    if globs.is_empty() {
390        return Ok(None);
391    }
392    let mut builder = GlobSetBuilder::new();
393    for pattern in globs {
394        let glob = GlobBuilder::new(pattern)
395            .literal_separator(true)
396            .build()
397            .with_context(|| format!("invalid include glob: {pattern}"))?;
398        builder.add(glob);
399    }
400    Ok(Some(
401        builder.build().context("failed to compile include globs")?,
402    ))
403}
404
405/// A file is included when there are no patterns, or when at least one pattern
406/// matches the file's path relative to `root`, its raw walked path, or its
407/// canonical absolute path. Matching the absolute form lets an include like
408/// `-i /abs/path/foo.rs` work even though the walker yields relative paths.
409/// Excluded directories are pruned by the walker before this is reached, so an
410/// exclude always beats an include.
411fn is_included(path: &Path, root: &Path, include: &Option<GlobSet>) -> bool {
412    match include {
413        None => true,
414        Some(set) => {
415            let rel = path.strip_prefix(root).unwrap_or(path);
416            set.is_match(rel)
417                || set.is_match(path)
418                || fs::canonicalize(path)
419                    .map(|abs| set.is_match(abs))
420                    .unwrap_or(false)
421        }
422    }
423}
424
/// Identity used to skip files reachable from more than one root. Canonicalizing
/// collapses `.`, `./src`, symlinks, etc. to the same key; if that fails (e.g. the
/// file vanished mid-walk) we fall back to the raw path.
fn dedup_key(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf())
}

/// True when the path's file name is some non-empty original name followed by
/// `suffix`.
///
/// A name that consists of the suffix alone is rejected: backups are created by
/// appending the suffix to an existing file name, so such a file has no
/// original to go back to. An empty suffix never matches, since it would
/// otherwise classify every file as a backup. Paths without a file name, or
/// with a non-UTF-8 name, are not backups.
fn is_backup_file(path: &Path, suffix: &str) -> bool {
    if suffix.is_empty() {
        return false;
    }
    path.file_name()
        .and_then(|s| s.to_str())
        .and_then(|name| name.strip_suffix(suffix))
        .map(|stem| !stem.is_empty())
        .unwrap_or(false)
}

#[cfg(test)]
mod tests {
    use super::*;

    // ---- Pure string-level tests of the comment stripper ----

    #[test]
    fn keeps_doc_comments_and_removes_normal_comments() {
        let src = "/// docs\nfn main() { // kill\n    let x = 1; /* gone */\n}\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert!(out.contains("/// docs"));
        assert!(!out.contains("kill"));
        assert!(!out.contains("gone"));
    }

    #[test]
    fn strips_doc_comments_when_requested() {
        let src = "/// docs\nfn main() { // kill\n    let x = 1; /* gone */\n}\n/*! crate */\n";
        let out = strip_comments(src, true).unwrap();
        assert!(!out.contains("docs"));
        assert!(!out.contains("crate"));
        assert!(!out.contains("kill"));
        assert!(!out.contains("gone"));

        // Doc comments that sat alone on a line are removed line and all, rather
        // than left behind as blank lines.
        assert_eq!(out, "fn main() {\n    let x = 1;\n}\n");
    }

    #[test]
    fn removes_blank_line_left_by_standalone_comment() {
        let src = "fn main() {\n    // explain\n    let x = 1;\n}\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert_eq!(out, "fn main() {\n    let x = 1;\n}\n");
    }

    #[test]
    fn keeps_originally_blank_line_after_standalone_comment() {
        let src = "// header\n\nfn main() {}\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert_eq!(out, "\nfn main() {}\n");
    }

    #[test]
    fn removes_standalone_block_comment_line() {
        let src = "fn main() {\n    /* a\n       b */\n    let x = 1;\n}\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert_eq!(out, "fn main() {\n    let x = 1;\n}\n");
    }

    #[test]
    fn keeps_raw_string_comment_like_text() {
        let src = "fn main() { let s = r#\"// not a comment /* no */\"#; }\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert_eq!(src, out);
    }

    #[test]
    fn keeps_block_doc_comments() {
        let src = "/** docs */\nfn main() {}\n/*! crate docs */\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert_eq!(src, out);
    }

    #[test]
    fn preserves_line_count_for_block_comments() {
        let src = "fn main() { /* a\n b\n c */ let x = 1; }\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert_eq!(src.lines().count(), out.lines().count());
    }

    #[test]
    fn preserves_shebang() {
        let src = "#!/usr/bin/env rust-script\n// hi\nfn main() {}\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert!(out.starts_with("#!/usr/bin/env rust-script\n"));
    }

    #[test]
    fn no_trailing_whitespace_left_after_removing_comments() {
        let src = "let x = 1; // foo\nlet y = 2; /* bar */   \n    // indented\n";
        let out = strip_non_doc_comments(src).unwrap();
        // The trailing `// indented` line is removed entirely, not left blank.
        assert_eq!(out, "let x = 1;\nlet y = 2;\n");
        for line in out.lines() {
            assert_eq!(
                line,
                line.trim_end(),
                "line has trailing whitespace: {line:?}"
            );
        }
    }

    #[test]
    fn keeps_trailing_whitespace_inside_string_literal() {
        let src = "let s = \"foo   \nbar\"; // gone\n";
        let out = strip_non_doc_comments(src).unwrap();
        assert_eq!(out, "let s = \"foo   \nbar\";\n");
    }

    // ---- Filesystem tests of the walker and include/exclude handling ----

    /// Build a single-root config that writes in place without backups.
    fn cfg_for(root: &Path, include: Vec<String>, exclude: Vec<String>) -> Config {
        Config {
            roots: vec![root.to_path_buf()],
            check: false,
            verbose: false,
            hidden: false,
            follow_links: false,
            no_backup: true,
            strip_doc_comments: false,
            backup_suffix: ".bak".into(),
            exclude_dirs: exclude,
            include_globs: include,
        }
    }

    /// Minimal source whose stripped form differs from the original.
    const COMMENTED: &str = "fn f() { // strip me\n}\n";

    #[test]
    fn include_globs_limit_processed_files() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::create_dir_all(root.join("examples")).unwrap();
        fs::write(root.join("src/a.rs"), COMMENTED).unwrap();
        fs::write(root.join("examples/b.rs"), COMMENTED).unwrap();

        let cfg = cfg_for(root, vec!["src/**/*.rs".into()], vec![]);
        let stats = run(&cfg).unwrap();

        assert_eq!(stats.files_seen, 1);
        assert_eq!(stats.files_changed, 1);
        assert!(!fs::read_to_string(root.join("src/a.rs"))
            .unwrap()
            .contains("strip me"));

        assert_eq!(
            fs::read_to_string(root.join("examples/b.rs")).unwrap(),
            COMMENTED
        );
    }

    #[test]
    fn empty_include_processes_everything() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        fs::write(root.join("a.rs"), COMMENTED).unwrap();

        let cfg = cfg_for(root, vec![], vec![]);
        let stats = run(&cfg).unwrap();
        assert_eq!(stats.files_seen, 1);
        assert_eq!(stats.files_changed, 1);
    }

    #[test]
    fn include_matches_absolute_path() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(root.join("src/a.rs"), COMMENTED).unwrap();
        fs::write(root.join("src/b.rs"), COMMENTED).unwrap();

        let target = fs::canonicalize(root.join("src/a.rs")).unwrap();
        let cfg = cfg_for(root, vec![target.to_string_lossy().into_owned()], vec![]);
        let stats = run(&cfg).unwrap();

        assert_eq!(stats.files_seen, 1);
        assert_eq!(stats.files_changed, 1);
        assert!(!fs::read_to_string(root.join("src/a.rs"))
            .unwrap()
            .contains("strip me"));
        assert_eq!(
            fs::read_to_string(root.join("src/b.rs")).unwrap(),
            COMMENTED
        );
    }

    #[test]
    fn exclude_wins_over_include() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        fs::create_dir_all(root.join("vendor")).unwrap();
        fs::write(root.join("vendor/c.rs"), COMMENTED).unwrap();

        let cfg = cfg_for(root, vec!["**/*.rs".into()], vec!["vendor".into()]);
        let stats = run(&cfg).unwrap();

        assert_eq!(stats.files_seen, 0);
        assert_eq!(
            fs::read_to_string(root.join("vendor/c.rs")).unwrap(),
            COMMENTED
        );
    }

    #[test]
    fn invalid_include_glob_is_reported() {
        let err = build_include_set(&["src/[".into()]).unwrap_err();
        assert!(err.to_string().contains("invalid include glob"));
    }
}