use crate::ast::visitor::{CommentInfo, CommentVisitor};
use crate::config::{ConfigManager, ResolvedConfig};
use crate::grammar::GrammarManager;
use crate::languages::registry::LanguageRegistry;
use crate::rules::preservation::PreservationRule;
use anyhow::{Context, Result};
use std::path::Path;
use tree_sitter::Parser;
/// Command-line switches layered on top of the file-based configuration.
///
/// `Processor::process_file_with_config` applies these to the resolved per-file
/// configuration. `dry_run` and `show_diff` are not read by the processor in this file.
#[derive(Debug, Clone)]
pub struct ProcessingOptions {
/// When `true`, switches TODO removal on in the resolved config; `false` leaves the
/// configured value untouched.
pub remove_todo: bool,
/// When `true`, switches FIXME removal on in the resolved config; `false` leaves the
/// configured value untouched.
pub remove_fixme: bool,
/// When `true`, switches documentation-comment removal on in the resolved config;
/// `false` leaves the configured value untouched.
pub remove_doc: bool,
/// Extra preserve patterns, appended to the ones coming from the configuration.
pub custom_preserve_patterns: Vec<String>,
/// Always replaces the configured `use_default_ignores` value.
pub use_default_ignores: bool,
/// NOTE(review): not read in this file — presumably forwarded to `OutputWriter` by the caller.
pub dry_run: bool,
/// NOTE(review): not read in this file — presumably forwarded to `OutputWriter` by the caller.
pub show_diff: bool,
/// Always replaces the configured `respect_gitignore` value.
pub respect_gitignore: bool,
/// Always replaces the configured `traverse_git_repos` value.
pub traverse_git_repos: bool,
}
/// Removes comments from source files using tree-sitter parse trees.
pub struct Processor {
/// Parser instance; its language is set again for every piece of content processed.
parser: Parser,
/// Used to detect the language configuration for a file path.
registry: LanguageRegistry,
/// Supplies the tree-sitter language when the resolved config carries a `grammar_config`.
grammar_manager: GrammarManager,
}
impl Default for Processor {
fn default() -> Self {
Self::new()
}
}
impl Processor {
/// Creates a processor with a fresh parser, a new `LanguageRegistry` and a new
/// `GrammarManager`.
///
/// # Panics
/// Panics with "Failed to initialize GrammarManager" if `GrammarManager::new()` fails.
pub fn new() -> Self {
    let parser = Parser::new();
    let registry = LanguageRegistry::new();
    let grammar_manager = GrammarManager::new().expect("Failed to initialize GrammarManager");
    Self {
        parser,
        registry,
        grammar_manager,
    }
}
/// Creates a processor whose registry additionally knows every language returned by
/// `config_manager.get_all_languages()`.
///
/// # Panics
/// Panics with "Failed to initialize GrammarManager" if `GrammarManager::new()` fails.
pub fn new_with_config(config_manager: &ConfigManager) -> Self {
    let mut registry = LanguageRegistry::new();
    registry.register_configured_languages(&config_manager.get_all_languages());

    let parser = Parser::new();
    let grammar_manager = GrammarManager::new().expect("Failed to initialize GrammarManager");
    Self {
        parser,
        registry,
        grammar_manager,
    }
}
/// Reads `path`, strips its comments according to the resolved configuration and returns
/// both the original and the processed text. Nothing is written to disk here.
///
/// The language is detected from `path` through the registry, and the per-file
/// configuration is resolved by `config_manager` using the lower-cased language name.
/// When `cli_overrides` is given it is layered on top:
/// - `remove_doc`, `remove_todo` and `remove_fixme` can only switch removal on;
/// - `custom_preserve_patterns` are appended to the configured patterns;
/// - `use_default_ignores`, `respect_gitignore` and `traverse_git_repos` always replace
///   the configured values.
///
/// The returned `ProcessedFile::modified` flag is `true` exactly when the processed
/// content differs from the original content.
///
/// # Errors
/// Fails if the file cannot be read, if no language is detected for `path`, or if
/// processing the content fails.
pub fn process_file_with_config(
    &mut self,
    path: &Path,
    config_manager: &ConfigManager,
    cli_overrides: Option<&ProcessingOptions>,
) -> Result<ProcessedFile> {
    let content = std::fs::read_to_string(path)
        .with_context(|| format!("Failed to read file: {}", path.display()))?;

    // Cloned so the registry borrow ends before `self` is borrowed mutably below.
    let language_config = self
        .registry
        .detect_language(path)
        .with_context(|| format!("Unsupported file type: {}", path.display()))?
        .clone();
    let language_name = language_config.name.to_lowercase();

    let mut resolved_config =
        config_manager.get_config_for_file_with_language(path, &language_name);
    if let Some(overrides) = cli_overrides {
        // Removal flags are additive: the CLI may enable removal but never disable it.
        resolved_config.remove_docs |= overrides.remove_doc;
        resolved_config.remove_todos |= overrides.remove_todo;
        resolved_config.remove_fixme |= overrides.remove_fixme;
        resolved_config
            .preserve_patterns
            .extend(overrides.custom_preserve_patterns.iter().cloned());
        // These three are taken from the CLI unconditionally.
        resolved_config.use_default_ignores = overrides.use_default_ignores;
        resolved_config.respect_gitignore = overrides.respect_gitignore;
        resolved_config.traverse_git_repos = overrides.traverse_git_repos;
    }

    let (processed_content, comments_removed, important_removals) =
        self.process_content_with_config(&content, &language_config, &resolved_config)?;

    // Report the real outcome instead of a hard-coded `false`.
    let modified = processed_content != content;

    Ok(ProcessedFile {
        path: path.to_path_buf(),
        original_content: content,
        processed_content,
        modified,
        comments_removed,
        important_removals,
    })
}
/// Parses `content` and removes every comment that no preservation rule protects.
///
/// The tree-sitter language comes from the grammar manager when `resolved_config` carries
/// a `grammar_config`, and from `language_config` otherwise.
///
/// Returns the processed text, the number of comments selected for removal, and the
/// removals flagged by `detect_important_removals`.
///
/// # Errors
/// Fails if the dynamic grammar cannot be loaded, if the parser rejects the language, or
/// if parsing yields no tree.
fn process_content_with_config(
    &mut self,
    content: &str,
    language_config: &crate::languages::config::LanguageConfig,
    resolved_config: &ResolvedConfig,
) -> Result<(String, usize, Vec<ImportantRemoval>)> {
    let language = match &resolved_config.grammar_config {
        Some(grammar_config) => self
            .grammar_manager
            .get_language(&language_config.name, grammar_config)
            .with_context(|| {
                format!(
                    "Failed to load dynamic grammar for {}",
                    language_config.name
                )
            })?,
        None => language_config.tree_sitter_language(),
    };

    // The parser is shared between calls, so the language must be set every time.
    self.parser
        .set_language(&language)
        .context("Failed to set parser language")?;
    let tree = self
        .parser
        .parse(content, None)
        .context("Failed to parse source code")?;

    let rules = self.create_preservation_rules_from_config(resolved_config);
    let mut visitor = CommentVisitor::new_with_language(
        content,
        &rules,
        language_config.comment_types.clone(),
        language_config.doc_comment_types.clone(),
        language_config.name.clone(),
    );
    visitor.visit_node(tree.root_node());

    let removable = visitor.get_comments_to_remove();
    let important_removals = detect_important_removals(&removable);
    let stripped = self.remove_comments_from_content(content, &removable);
    Ok((stripped, removable.len(), important_removals))
}
/// Builds the ordered list of rules that decide which comments are kept.
///
/// Order of the result:
/// 1. the shebang rule and the `~keep` pattern (always present);
/// 2. `TODO`/`todo` patterns unless `remove_todos` is set;
/// 3. `FIXME`/`fixme` patterns unless `remove_fixme` is set;
/// 4. the documentation rule unless `remove_docs` is set;
/// 5. every pattern from `config.preserve_patterns`;
/// 6. when `use_default_ignores` is set, `PreservationRule::comprehensive_rules()` minus
///    the entries that would contradict an enabled removal flag.
fn create_preservation_rules_from_config(
    &self,
    config: &ResolvedConfig,
) -> Vec<PreservationRule> {
    let mut rules = vec![
        PreservationRule::shebang(),
        PreservationRule::pattern("~keep"),
    ];
    if !config.remove_todos {
        rules.extend([
            PreservationRule::pattern("TODO"),
            PreservationRule::pattern("todo"),
        ]);
    }
    if !config.remove_fixme {
        rules.extend([
            PreservationRule::pattern("FIXME"),
            PreservationRule::pattern("fixme"),
        ]);
    }
    if !config.remove_docs {
        rules.push(PreservationRule::documentation());
    }
    rules.extend(
        config
            .preserve_patterns
            .iter()
            .map(|pattern| PreservationRule::pattern(pattern)),
    );

    if !config.use_default_ignores {
        return rules;
    }

    // Drop built-in rules that would re-protect what the user asked to remove.
    let mut defaults = PreservationRule::comprehensive_rules();
    if config.remove_todos {
        defaults.retain(|rule| !(rule.pattern_matches("TODO") || rule.pattern_matches("todo")));
    }
    if config.remove_fixme {
        defaults
            .retain(|rule| !(rule.pattern_matches("FIXME") || rule.pattern_matches("fixme")));
    }
    if config.remove_docs {
        defaults.retain(|rule| !matches!(rule, PreservationRule::Documentation));
    }
    rules.extend(defaults);
    rules
}
fn remove_comments_from_content(
&self,
content: &str,
comments_to_remove: &[CommentInfo],
) -> String {
if comments_to_remove.is_empty() {
return content.to_string();
}
let mut bytes = content.as_bytes().to_vec();
let mut deduped = comments_to_remove.to_vec();
deduped.sort_by(|a, b| {
a.start_byte
.cmp(&b.start_byte)
.then(b.end_byte.cmp(&a.end_byte))
});
let mut filtered: Vec<CommentInfo> = Vec::new();
for comment in deduped {
if let Some(previous) = filtered.last()
&& comment.start_byte >= previous.start_byte
&& comment.end_byte <= previous.end_byte
{
continue;
}
filtered.push(comment);
}
let mut sorted_comments = filtered;
sorted_comments.sort_by(|a, b| b.start_byte.cmp(&a.start_byte));
for comment in sorted_comments {
if comment.start_byte >= bytes.len() {
continue;
}
let start = comment.start_byte;
let end = comment.end_byte.min(bytes.len());
if start >= end {
continue;
}
let mut line_start = start;
while line_start > 0 && bytes[line_start - 1] != b'\n' {
line_start -= 1;
}
let mut line_end = end;
while line_end < bytes.len() && bytes[line_end] != b'\n' {
line_end += 1;
}
if line_end < bytes.len() {
line_end += 1;
}
let before = &bytes[line_start..start];
let after = &bytes[end..line_end];
let before_ws = before.iter().all(|b| b.is_ascii_whitespace());
let after_ws = after.iter().all(|b| b.is_ascii_whitespace());
if before_ws && after_ws {
bytes.drain(line_start..line_end);
} else {
bytes.drain(start..end);
}
}
String::from_utf8(bytes).unwrap_or_default()
}
}
/// Result of running the processor over one file; holds both versions of the text.
#[derive(Debug)]
pub struct ProcessedFile {
/// Path the content was read from; `OutputWriter::write_file` writes back to it.
pub path: std::path::PathBuf,
/// File content exactly as read from disk.
pub original_content: String,
/// Content after comment removal.
pub processed_content: String,
/// Change flag. NOTE(review): `OutputWriter::write_file` does not read this field; it
/// compares `original_content` with `processed_content` itself.
pub modified: bool,
/// Number of comments that were selected for removal.
pub comments_removed: usize,
/// Removed comments that looked like tool directives (see `ImportantRemoval`).
pub important_removals: Vec<ImportantRemoval>,
}
/// A removed comment that matched one of the directive checks in
/// `detect_important_removals`.
#[derive(Debug, Clone)]
pub struct ImportantRemoval {
/// The comment's `start_row + 1` (presumably a 1-based line number — confirm that
/// `CommentInfo::start_row` is 0-based).
pub line: usize,
/// Category label, e.g. "shebang" or "go directive".
pub reason: String,
/// Comment text with leading whitespace trimmed and newlines replaced by spaces,
/// shortened to at most 120 bytes plus a trailing `…` when longer.
pub preview: String,
}
/// Writes processed files back to disk (or reports what would be written) and prints
/// progress to stdout.
pub struct OutputWriter {
    /// When set, files are never written; changes are only reported.
    dry_run: bool,
    /// Enables the more detailed per-file messages.
    verbose: bool,
    /// In dry-run mode, also prints a line-by-line comparison.
    show_diff: bool,
}

impl OutputWriter {
    /// Creates a writer with the given flags.
    pub fn new(dry_run: bool, verbose: bool, show_diff: bool) -> Self {
        Self {
            dry_run,
            verbose,
            show_diff,
        }
    }

    /// Persists or reports the outcome for one file.
    ///
    /// A file counts as changed when its processed content differs from its original
    /// content. Unchanged files are only mentioned in verbose mode. In dry-run mode the
    /// change is reported (optionally with a diff) and nothing is written; otherwise the
    /// processed content overwrites the file at `processed_file.path`.
    ///
    /// # Errors
    /// Fails if writing the file fails.
    pub fn write_file(&self, processed_file: &ProcessedFile) -> Result<()> {
        let target = processed_file.path.display();

        if processed_file.original_content == processed_file.processed_content {
            if self.verbose {
                println!("✓ No changes needed: {target}");
            }
            return Ok(());
        }

        if self.dry_run {
            println!("[DRY RUN] Would modify: {target}");
            if self.verbose {
                println!(" Removed {} comment(s)", processed_file.comments_removed);
            }
            if self.show_diff {
                self.show_diff(processed_file)?;
            }
            return Ok(());
        }

        std::fs::write(&processed_file.path, &processed_file.processed_content).with_context(
            || format!("Failed to write file: {}", processed_file.path.display()),
        )?;
        if self.verbose {
            println!(
                "✓ Modified: {} (removed {} comment(s))",
                target, processed_file.comments_removed
            );
        } else {
            println!("Modified: {target}");
        }
        Ok(())
    }

    /// Prints a positional, line-by-line comparison of the original and processed text.
    ///
    /// Lines are compared index by index (this is not an aligned diff). A line that
    /// exists on only one side is printed only when it is non-empty.
    fn show_diff(&self, processed_file: &ProcessedFile) -> Result<()> {
        println!("\n--- {}", processed_file.path.display());
        println!("+++ {} (processed)", processed_file.path.display());

        let before: Vec<&str> = processed_file.original_content.lines().collect();
        let after: Vec<&str> = processed_file.processed_content.lines().collect();

        for index in 0..before.len().max(after.len()) {
            match (before.get(index).copied(), after.get(index).copied()) {
                (Some(old), Some(new)) => {
                    if old != new {
                        println!("-{old}");
                        println!("+{new}");
                    }
                }
                (Some(old), None) => {
                    if !old.is_empty() {
                        println!("-{old}");
                    }
                }
                (None, Some(new)) => {
                    if !new.is_empty() {
                        println!("+{new}");
                    }
                }
                (None, None) => {}
            }
        }
        Ok(())
    }

    /// Prints the closing summary line, plus a hint when no file needed changes.
    pub fn print_summary(&self, total_files: usize, modified_files: usize) {
        let (prefix, outcome) = if self.dry_run {
            ("[DRY RUN] ", "would be modified")
        } else {
            ("", "modified")
        };
        println!("\n{prefix}Summary: {total_files} files processed, {modified_files} {outcome}");

        if total_files > 0 && modified_files == 0 {
            println!("All files were already comment-free or only contained preserved comments.");
        }
    }
}
/// Returns the category of tool directive that `text` looks like, if any.
///
/// `text` is expected to have its leading whitespace already trimmed. The checks run in
/// a fixed order and the first match wins.
fn important_removal_reason(text: &str) -> Option<&'static str> {
    if text.starts_with("#!") {
        return Some("shebang");
    }
    if text.starts_with("//go:")
        || text.starts_with("/*go:")
        || text.starts_with("//+build")
        || text.starts_with("// +build")
        || text.starts_with("//line ")
        || text.starts_with("/*line ")
    {
        return Some("go directive");
    }
    if text.contains("shellcheck") {
        return Some("shellcheck directive");
    }
    if text.contains("eslint-")
        || text.contains("prettier-")
        || text.contains("@ts-")
        || text.contains("biome-")
        || text.contains("deno-")
        || text.contains("nolint")
    {
        return Some("linter/formatter directive");
    }
    if text.starts_with("#pragma") || text.contains("NOLINT") || text.contains("clang-format") {
        return Some("compiler/formatter directive");
    }
    if text.starts_with("# frozen_string_literal:")
        || text.starts_with("# encoding:")
        || text.starts_with("# coding:")
        || text.starts_with("# typed:")
    {
        return Some("language magic comment");
    }
    None
}

/// Flags removed comments that look like tool directives rather than ordinary prose.
///
/// Every comment whose trimmed text matches one of the directive checks yields an
/// `ImportantRemoval` carrying `start_row + 1` as its line, the matched category and a
/// single-line preview. Previews longer than 120 bytes are shortened and suffixed with
/// `…`; the cut is moved back to the nearest character boundary so that non-ASCII text
/// cannot cause a panic.
fn detect_important_removals(comments_to_remove: &[CommentInfo]) -> Vec<ImportantRemoval> {
    const PREVIEW_MAX_BYTES: usize = 120;

    comments_to_remove
        .iter()
        .filter_map(|comment| {
            let trimmed = comment.content.trim_start();
            let reason = important_removal_reason(trimmed)?;

            let mut preview = trimmed.replace('\n', " ");
            if preview.len() > PREVIEW_MAX_BYTES {
                // `String::truncate` panics off a char boundary; index 0 is always a
                // boundary, so this loop terminates.
                let mut cut = PREVIEW_MAX_BYTES;
                while !preview.is_char_boundary(cut) {
                    cut -= 1;
                }
                preview.truncate(cut);
                preview.push('…');
            }

            Some(ImportantRemoval {
                line: comment.start_row + 1,
                reason: reason.to_string(),
                preview,
            })
        })
        .collect()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::config::{Config, ConfigManager, ResolvedConfig};
use crate::languages::config::LanguageConfig;
use tempfile::tempdir;
/// Baseline configuration shared by the tests: no removal flag is set, there are no
/// extra preserve patterns, default ignores and gitignore handling are on, and no
/// language or grammar override is present.
fn default_resolved_config() -> ResolvedConfig {
    let preserve_patterns = Vec::new();
    ResolvedConfig {
        preserve_patterns,
        language_config: None,
        grammar_config: None,
        use_default_ignores: true,
        respect_gitignore: true,
        traverse_git_repos: false,
        remove_docs: false,
        remove_fixme: false,
        remove_todos: false,
    }
}
/// Runs `source` through a fresh processor as Rust with the baseline configuration and
/// returns only the processed text.
fn process_rust(source: &str) -> String {
    Processor::new()
        .process_content_with_config(source, &LanguageConfig::rust(), &default_resolved_config())
        .expect("processing rust source")
        .0
}
/// Runs `source` through a fresh processor as Go, with `use_default_ignores` and
/// `remove_docs` set as given, and returns only the processed text.
fn process_go(source: &str, use_default_ignores: bool, remove_docs: bool) -> String {
    let mut processor = Processor::new();
    let go = LanguageConfig::go();

    let mut config = default_resolved_config();
    config.remove_docs = remove_docs;
    config.use_default_ignores = use_default_ignores;

    processor
        .process_content_with_config(source, &go, &config)
        .expect("processing go source")
        .0
}
/// Runs `source` through a fresh processor for `language_config` with the baseline
/// configuration and returns only the processed text.
fn process_language(source: &str, language_config: LanguageConfig) -> String {
    let mut processor = Processor::new();
    let config = default_resolved_config();
    let outcome = processor.process_content_with_config(source, &language_config, &config);
    outcome.expect("processing source").0
}
/// Like `process_language`, but with `use_default_ignores` set explicitly.
fn process_language_with_default_ignores(
    source: &str,
    language_config: LanguageConfig,
    use_default_ignores: bool,
) -> String {
    let mut processor = Processor::new();

    let mut config = default_resolved_config();
    config.use_default_ignores = use_default_ignores;

    let outcome = processor.process_content_with_config(source, &language_config, &config);
    outcome.expect("processing source").0
}
// A string literal that merely looks like a comment must survive, while the real
// trailing line comment with the same text is removed.
#[test]
fn preserves_strings_matching_comment_text() {
let source = r#"fn main() {
let pattern = "// comment";
println!("{}", pattern); // comment
}
"#;
let processed = process_rust(source);
assert!(processed.contains("\"// comment\""));
assert!(!processed.contains("; // comment"));
}
// A comment-like string passed to a macro invocation stays intact, while the
// `// keep` line comment inside the macro definition is removed.
#[test]
fn preserves_macro_invocations_with_comment_like_strings() {
let source = r#"macro_rules! announce {
($msg:expr) => {{
println!("{}", $msg); // keep
}};
}
fn main() {
announce!("// keep");
}
"#;
let processed = process_rust(source);
assert!(processed.contains("announce!(\"// keep\");"));
assert!(!processed.contains("// keep\n"));
}
// With doc-comment removal enabled, the `///` line disappears but the attribute on
// the following line is left untouched.
#[test]
fn preserves_attributes_when_removing_doc_comments() {
let source = r#"#[derive(Subcommand, Debug)]
pub enum Commands {
/// Create smart config
#[command(about = "Create a template configuration file")]
Init,
}
"#;
let mut processor = Processor::new();
let language_config = LanguageConfig::rust();
let mut config = default_resolved_config();
// The only deviation from the baseline configuration.
config.remove_docs = true;
let (processed, _, _) = processor
.process_content_with_config(source, &language_config, &config)
.expect("process doc comments");
assert!(processed.contains("#[command(about = \"Create a template configuration file\")]"));
assert!(!processed.contains("Create smart config"));
}
// End-to-end check of the `use_default_ignores` CLI override through
// `process_file_with_config`: with the default ignores on, the NOTE comment and the
// `#![feature` doc line are kept even though all removal flags are set; with them off,
// both are removed while the code itself remains.
#[test]
fn respects_no_default_ignores_override() {
let dir = tempdir().expect("create temp dir");
let file_path = dir.path().join("sample.rs");
let source = r#"/// #![feature(never_type)]
// NOTE: this would normally be preserved
fn main() {}
"#;
std::fs::write(&file_path, source).expect("write test file");
let config_manager = ConfigManager::from_single_config(dir.path(), Config::default())
.expect("config manager");
let mut processor = Processor::new();
// First pass: every removal flag set, default ignores still enabled.
let overrides_with_defaults = ProcessingOptions {
remove_todo: true,
remove_fixme: true,
remove_doc: true,
custom_preserve_patterns: Vec::new(),
use_default_ignores: true,
dry_run: true,
show_diff: false,
respect_gitignore: true,
traverse_git_repos: false,
};
let with_defaults = processor
.process_file_with_config(&file_path, &config_manager, Some(&overrides_with_defaults))
.expect("process with defaults");
assert!(with_defaults.processed_content.contains("NOTE"));
assert!(with_defaults.processed_content.contains("#![feature"));
// Second pass: identical options except that the default ignores are disabled.
let overrides_without_defaults = ProcessingOptions {
use_default_ignores: false,
..overrides_with_defaults
};
let without_defaults = processor
.process_file_with_config(
&file_path,
&config_manager,
Some(&overrides_without_defaults),
)
.expect("process without defaults");
assert!(!without_defaults.processed_content.contains("NOTE"));
assert!(!without_defaults.processed_content.contains("#![feature"));
assert!(without_defaults.processed_content.contains("fn main()"));
}
// `//go:embed` must be kept even with default ignores off and doc removal on, while
// an ordinary block comment is removed.
#[test]
fn preserves_go_embed_directives_even_without_default_ignores() {
let source = r#"package main
//go:embed hello.txt
var embedded string
func main() { /* regular comment should be removed */ }
"#;
let processed = process_go(source, false, true);
assert!(processed.contains("//go:embed hello.txt"));
assert!(!processed.contains("regular comment should be removed"));
}
// The comment lines directly above `import "C"` (the cgo preamble) must be kept with
// default ignores both on and off; the ordinary block comment must still be removed.
#[test]
fn preserves_go_cgo_preamble_comments() {
let source = r#"package htmltomarkdown
// #cgo LDFLAGS: -lhtml_to_markdown_ffi
// #include <stdlib.h>
// extern const char* html_to_markdown_version();
import "C"
func Version() string { return C.GoString(C.html_to_markdown_version()) /* regular comment should be removed */ }
"#;
// Doc removal is enabled in both runs; only the default-ignores flag varies.
for use_default_ignores in [true, false] {
let processed = process_go(source, use_default_ignores, true);
assert!(
processed.contains("// #cgo LDFLAGS: -lhtml_to_markdown_ffi"),
"expected to preserve cgo preamble with use_default_ignores={use_default_ignores}"
);
assert!(
processed.contains("// #include <stdlib.h>"),
"expected to preserve cgo preamble with use_default_ignores={use_default_ignores}"
);
assert!(
processed.contains("// extern const char* html_to_markdown_version();"),
"expected to preserve cgo preamble with use_default_ignores={use_default_ignores}"
);
assert!(processed.contains("import \"C\""));
assert!(!processed.contains("regular comment should be removed"));
}
}
// Ruby: a `#` comment is removed, a `#` inside a string literal is not.
#[test]
fn removes_ruby_comments_without_touching_strings() {
let source = r#"# remove me
puts "Hello # not a comment"
"#;
let processed = process_language(source, LanguageConfig::ruby());
assert!(!processed.contains("# remove me"));
assert!(processed.contains("Hello # not a comment"));
}
// Ruby: the `# frozen_string_literal: true` magic comment is kept under the baseline
// configuration, while an ordinary comment next to it is removed.
#[test]
fn preserves_ruby_frozen_string_literal_magic_comment() {
let source = r#"# frozen_string_literal: true
# remove me
puts "ok"
"#;
let processed = process_language(source, LanguageConfig::ruby());
assert!(processed.contains("# frozen_string_literal: true"));
assert!(!processed.contains("# remove me"));
}
// Shell: the shebang stays as the first line even with default ignores disabled; the
// ordinary comment is removed and the code is untouched.
#[test]
fn preserves_shebangs_even_without_default_ignores() {
let source = r#"#!/usr/bin/env bash
# remove me
echo "ok"
"#;
let processed =
process_language_with_default_ignores(source, LanguageConfig::shell(), false);
assert!(processed.starts_with("#!/usr/bin/env bash\n"));
assert!(!processed.contains("# remove me"));
assert!(processed.contains("echo \"ok\""));
}
// Ruby: a YARD-style `# @param` comment is kept under the baseline configuration.
#[test]
fn preserves_ruby_yard_doc_comments_by_default() {
let source = r#"# @param x [Integer]
def foo(x)
x + 1
end
"#;
let processed = process_language(source, LanguageConfig::ruby());
assert!(processed.contains("# @param x [Integer]"));
}
// PHP: a `//` comment is removed, a `//` inside a string literal is not.
#[test]
fn removes_php_comments_without_touching_strings() {
let source = r#"<?php
// remove me
$s = "// not a comment";
echo $s;
"#;
let processed = process_language(source, LanguageConfig::php());
assert!(!processed.contains("// remove me"));
assert!(processed.contains("\"// not a comment\""));
}
// C: the comment trailing `#endif` of a header guard is kept, while an ordinary line
// comment is removed and the declaration stays.
#[test]
fn preserves_c_header_guard_trailing_comments() {
let source = r#"#ifndef HTML_TO_MARKDOWN_H
#define HTML_TO_MARKDOWN_H
// remove me
int x;
#endif /* HTML_TO_MARKDOWN_H */
"#;
let processed = process_language(source, LanguageConfig::c());
assert!(processed.contains("#endif /* HTML_TO_MARKDOWN_H */"));
assert!(!processed.contains("remove me"));
assert!(processed.contains("int x;"));
}
// Elixir: a `#` comment is removed, a `#` inside a string literal is not.
#[test]
fn removes_elixir_comments_without_touching_strings() {
let source = r##"# remove me
IO.puts("# not a comment")
"##;
let processed = process_language(source, LanguageConfig::elixir());
assert!(!processed.contains("# remove me"));
assert!(processed.contains("\"# not a comment\""));
}
// TOML: a `#` comment is removed, a `#` inside a string value is not.
#[test]
fn removes_toml_comments_without_touching_strings() {
let source = r##"# remove me
key = "# not a comment"
"##;
let processed = process_language(source, LanguageConfig::toml());
assert!(!processed.contains("# remove me"));
assert!(processed.contains("\"# not a comment\""));
}
// C#: a `//` comment is removed, a `//` inside a string literal is not.
#[test]
fn removes_csharp_comments_without_touching_strings() {
let source = r#"// remove me
class C { void M() { var s = "// not a comment"; } }
"#;
let processed = process_language(source, LanguageConfig::csharp());
assert!(!processed.contains("// remove me"));
assert!(processed.contains("\"// not a comment\""));
}
}