codestats 0.3.1

A CLI tool for analyzing code statistics across different programming languages
#![warn(clippy::all, clippy::pedantic, clippy::nursery)]

use std::{
	collections::{HashMap, HashSet},
	env,
	error::Error,
	fmt::Write as _,
	fs,
	path::Path,
	result,
};

use indexmap::IndexMap;
use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(deny_unknown_fields)]
struct LanguageConfig {
	file_patterns: Vec<String>,
	#[serde(default)]
	line_comments: Vec<String>,
	#[serde(default)]
	block_comments: Vec<Vec<String>>,
	#[serde(default)]
	nested_blocks: bool,
	#[serde(default)]
	shebangs: Vec<String>,
	#[serde(default)]
	keywords: Vec<String>,
}

#[derive(Debug, Clone)]
struct ProcessedLanguage {
	name: String,
	file_patterns: Vec<String>,
	line_comments: Vec<String>,
	block_comments: Vec<(String, String)>,
	nested_blocks: bool,
	shebangs: Vec<String>,
	keywords: Vec<String>,
}

impl ProcessedLanguage {
	fn from_config(name: String, config: LanguageConfig) -> Self {
		let LanguageConfig { file_patterns, line_comments, block_comments, nested_blocks, shebangs, keywords } = config;
		let block_comments = block_comments
			.into_iter()
			.filter_map(|pair| {
				let mut iter = pair.into_iter();
				let start = iter.next()?;
				let end = iter.next()?;
				Some((start, end))
			})
			.collect();
		Self { name, file_patterns, line_comments, block_comments, nested_blocks, shebangs, keywords }
	}
}

fn get_language_schema() -> Vec<(&'static str, &'static str)> {
	vec![
		("index", "usize"),
		("name", "&'static str"),
		("file_patterns", "&'static [&'static str]"),
		("line_comments", "&'static [&'static str]"),
		("block_comments", "&'static [(&'static str, &'static str)]"),
		("nested_blocks", "bool"),
		("shebangs", "&'static [&'static str]"),
		("keywords", "&'static [&'static str]"),
	]
}

fn render_languages(languages: &[ProcessedLanguage]) -> String {
	let mut output = String::new();
	output.push_str("#[derive(Debug, Clone, PartialEq, Eq)]\n");
	output.push_str("/// Holds information about a single programming language.\n");
	output.push_str("pub struct Language {\n");
	for (field, ty) in get_language_schema() {
		let _ = writeln!(output, "\tpub {field}: {ty},");
	}
	output.push_str("}\n\n");
	output.push_str("pub static LANGUAGES: &[Language] = &[\n");
	for (idx, lang) in languages.iter().enumerate() {
		output.push_str("\tLanguage {\n");
		let _ = writeln!(output, "\t\tindex: {idx},");
		let _ = writeln!(output, "\t\tname: {},", render_str(&lang.name));
		let _ = writeln!(output, "\t\tfile_patterns: {},", render_str_slice(&lang.file_patterns));
		let _ = writeln!(output, "\t\tline_comments: {},", render_str_slice(&lang.line_comments));
		let _ = writeln!(output, "\t\tblock_comments: {},", render_block_comments(&lang.block_comments));
		let _ = writeln!(output, "\t\tnested_blocks: {},", lang.nested_blocks);
		let _ = writeln!(output, "\t\tshebangs: {},", render_str_slice(&lang.shebangs));
		let _ = writeln!(output, "\t\tkeywords: {},", render_str_slice(&lang.keywords));
		output.push_str("\t},\n");
	}
	output.push_str("];\n\n");
	output
}

fn render_str(value: &str) -> String {
	format!("{value:?}")
}

fn render_str_slice(values: &[String]) -> String {
	if values.is_empty() {
		"&[]".to_string()
	} else {
		let joined = values.iter().map(|value| render_str(value)).collect::<Vec<String>>().join(", ");
		format!("&[{joined}]")
	}
}

fn render_block_comments(values: &[(String, String)]) -> String {
	if values.is_empty() {
		"&[]".to_string()
	} else {
		let joined = values
			.iter()
			.map(|(start, end)| format!("({}, {})", render_str(start), render_str(end)))
			.collect::<Vec<String>>()
			.join(", ");
		format!("&[{joined}]")
	}
}

fn normalize_languages(entries: IndexMap<String, LanguageConfig>) -> Result<Vec<ProcessedLanguage>> {
	let mut errors = Vec::new();
	let mut prev_name: Option<String> = None;
	let languages: Vec<ProcessedLanguage> = entries
		.into_iter()
		.enumerate()
		.filter_map(|(index, (name, config))| {
			if name.trim().is_empty() {
				errors.push(format!("Language at position {}: name cannot be empty", index + 1));
				return None;
			}
			if let Some(prev) = &prev_name {
				if name.to_lowercase() < prev.to_lowercase() {
					errors
						.push(format!("Language '{name}' is not in alphabetical order (should come before '{prev}')"));
				}
			}
			prev_name = Some(name.clone());
			Some(ProcessedLanguage::from_config(name, config))
		})
		.collect();
	if errors.is_empty() { Ok(languages) } else { Err(errors.join("\n").into()) }
}

type Result<T> = result::Result<T, Box<dyn Error>>;

fn main() -> Result<()> {
	let manifest_dir = env::var("CARGO_MANIFEST_DIR")?;
	let json_path = Path::new(&manifest_dir).join("languages.json5");
	println!("cargo:rerun-if-changed={}", json_path.display());
	let json_content = fs::read_to_string(&json_path)?;
	let language_map: IndexMap<String, LanguageConfig> =
		json5::from_str(&json_content).map_err(|e| format!("Failed to parse languages.json5: {e}"))?;
	let languages = normalize_languages(language_map)?;
	validate_languages(&languages)?;
	let rendered = render_languages(&languages);
	let out_dir = env::var("OUT_DIR")?;
	let dest_path = Path::new(&out_dir).join("languages.rs");
	fs::write(dest_path, rendered)?;
	println!("Generated language data for {} languages", languages.len());
	Ok(())
}

fn validate_languages(languages: &[ProcessedLanguage]) -> Result<()> {
	let mut errors = Vec::new();
	let seen_patterns = validate_language_fields(languages, &mut errors);
	validate_pattern_disambiguation(languages, seen_patterns, &mut errors);
	if !errors.is_empty() {
		let error_message =
			format!("Language validation failed with {} error(s):\n{}", errors.len(), errors.join("\n"));
		return Err(error_message.into());
	}
	Ok(())
}

fn validate_language_fields(languages: &[ProcessedLanguage], errors: &mut Vec<String>) -> HashMap<String, Vec<String>> {
	let mut seen_names: HashSet<String> = HashSet::new();
	let mut seen_patterns: HashMap<String, Vec<String>> = HashMap::new();
	for (index, lang) in languages.iter().enumerate() {
		let position = format!("Language at position {}", index + 1);
		validate_basic_fields(lang, &position, &mut seen_names, errors);
		validate_file_patterns(lang, &position, &mut seen_patterns, errors);
		validate_comment_fields(lang, &position, errors);
	}
	seen_patterns
}

fn validate_basic_fields(
	lang: &ProcessedLanguage,
	position: &str,
	seen_names: &mut HashSet<String>,
	errors: &mut Vec<String>,
) {
	if lang.name.is_empty() {
		errors.push(format!("{position}: 'name' field cannot be empty"));
	}
	if lang.file_patterns.is_empty() {
		errors.push(format!("{} ('{}'): 'file_patterns' field cannot be empty", position, lang.name));
	}
	if !seen_names.insert(lang.name.clone()) {
		errors.push(format!("{position}: Duplicate language name '{}'", lang.name));
	}
	if lang.name.trim() != lang.name {
		errors.push(format!("{} ('{}'): Language name has leading/trailing whitespace", position, lang.name));
	}
}

fn validate_file_patterns(
	lang: &ProcessedLanguage,
	position: &str,
	seen_patterns: &mut HashMap<String, Vec<String>>,
	errors: &mut Vec<String>,
) {
	for (pattern_idx, pattern) in lang.file_patterns.iter().enumerate() {
		if pattern.is_empty() {
			errors.push(format!(
				"{} ('{}'), pattern {}: File pattern cannot be empty",
				position,
				lang.name,
				pattern_idx + 1
			));
		}
		if pattern.trim() != pattern {
			errors.push(format!(
				"{} ('{}'), pattern {}: File pattern '{}' has leading/trailing whitespace",
				position,
				lang.name,
				pattern_idx + 1,
				pattern
			));
		}
		seen_patterns.entry(pattern.clone()).or_default().push(lang.name.clone());
	}
}

fn validate_comment_fields(lang: &ProcessedLanguage, position: &str, errors: &mut Vec<String>) {
	for (comment_idx, comment) in lang.line_comments.iter().enumerate() {
		if comment.is_empty() {
			errors.push(format!(
				"{} ('{}'), line comment {}: Line comment cannot be empty",
				position,
				lang.name,
				comment_idx + 1
			));
		}
	}
	for (comment_idx, block_comment) in lang.block_comments.iter().enumerate() {
		if block_comment.0.is_empty() {
			errors.push(format!(
				"{} ('{}'), block comment {}: Block comment start cannot be empty",
				position,
				lang.name,
				comment_idx + 1
			));
		}
		if block_comment.1.is_empty() {
			errors.push(format!(
				"{} ('{}'), block comment {}: Block comment end cannot be empty",
				position,
				lang.name,
				comment_idx + 1
			));
		}
	}
}

fn validate_pattern_disambiguation(
	languages: &[ProcessedLanguage],
	seen_patterns: HashMap<String, Vec<String>>,
	errors: &mut Vec<String>,
) {
	for (pattern, language_names) in seen_patterns {
		if language_names.len() > 1 {
			let all_have_keywords = language_names
				.iter()
				.all(|name| languages.iter().find(|l| l.name == *name).is_some_and(|l| !l.keywords.is_empty()));
			if !all_have_keywords {
				errors.push(format!(
					"Duplicate pattern '{}' in [{}] - all must have 'keywords' for disambiguation",
					pattern,
					language_names.join(", ")
				));
			}
		}
	}
}