mod config;
mod count;
// Install mimalloc as the process-wide allocator for every heap allocation in the binary.
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
use clap::{CommandFactory, Parser};
use clap_complete::generate;
use globset::{Glob, GlobSetBuilder};
use memmap2::MmapOptions;
use rayon::prelude::*;
use serde::Serialize;
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, Read};
use std::path::Path;
use walkdir::WalkDir;
/// Counters gathered for one input; `Counts::add` folds several of them into a total.
///
/// Serialized with serde for the JSON output. Field names and order are therefore part of
/// the emitted format. `statistics` and `histogram` are omitted from the output when `None`.
#[derive(Serialize)]
struct Counts {
/// Line count, set from `count::count_lines` when `args.lines` is enabled.
lines: usize,
/// Word count, set from `count::count_all_words` when `args.words` is enabled.
words: usize,
/// Length in bytes of the processed data, or the size reported by file metadata on the
/// bytes-only fast path of `process_file`.
bytes: usize,
/// Character count from `count::count_chars`, or the plain byte length when `args.fast` is set.
chars: usize,
/// Result of `count::max_line_length`; `add` keeps the maximum rather than a sum.
max_line_length: usize,
/// Result of `count::count_pattern` for `args.pattern`.
pattern: usize,
/// Result of `count::count_unique_words`. Note that `add` sums this value across inputs.
unique_words: usize,
/// Line-length statistics, populated only when `args.stats` is enabled.
#[serde(skip_serializing_if = "Option::is_none")]
statistics: Option<Statistics>,
/// Result of `count::generate_histogram`, populated only when `args.histogram` is enabled.
/// `format_histogram` prints each key as the start of a range ending at `key + 9`.
#[serde(skip_serializing_if = "Option::is_none")]
histogram: Option<HashMap<usize, usize>>,
}
/// Serializable copy of the values returned by `count::calculate_statistics`.
///
/// `process_data` copies every field one-to-one from that result; how each value is
/// computed is defined in the `count` module, not here.
#[derive(Serialize)]
struct Statistics {
// Printed by `format_stats` with two decimal places.
mean_line_length: f64,
median_line_length: usize,
// Printed by `format_stats` with two decimal places.
std_dev: f64,
min_line_length: usize,
max_line_length: usize,
empty_lines: usize,
}
impl Counts {
fn new() -> Self {
Self {
lines: 0,
words: 0,
bytes: 0,
chars: 0,
max_line_length: 0,
pattern: 0,
unique_words: 0,
statistics: None,
histogram: None,
}
}
fn add(&mut self, other: &Counts) {
self.lines += other.lines;
self.words += other.words;
self.bytes += other.bytes;
self.chars += other.chars;
self.max_line_length = self.max_line_length.max(other.max_line_length);
self.pattern += other.pattern;
self.unique_words += other.unique_words;
}
fn max_value(&self, args: &config::Args) -> usize {
let mut max = 0;
if args.lines {
max = max.max(self.lines);
}
if args.words {
max = max.max(self.words);
}
if args.chars {
max = max.max(self.chars);
}
if args.bytes {
max = max.max(self.bytes);
}
if args.max_line_length {
max = max.max(self.max_line_length);
}
if args.unique {
max = max.max(self.unique_words);
}
if args.pattern.is_some() {
max = max.max(self.pattern);
}
max
}
fn format(&self, args: &config::Args, name: &str, width: usize) -> String {
let mut values = Vec::new();
if args.lines {
values.push(self.lines);
}
if args.words {
values.push(self.words);
}
if args.chars {
values.push(self.chars);
}
if args.bytes {
values.push(self.bytes);
}
if args.max_line_length {
values.push(self.max_line_length);
}
if args.unique {
values.push(self.unique_words);
}
if args.pattern.is_some() {
values.push(self.pattern);
}
let formatted: Vec<String> = values
.iter()
.enumerate()
.map(|(i, v)| {
if i == 0 {
v.to_string()
} else {
format!("{:width$}", v)
}
})
.collect();
if name.is_empty() {
formatted.join(" ")
} else {
format!("{} {}", formatted.join(" "), name)
}
}
fn format_stats(&self) -> String {
if let Some(ref stats) = self.statistics {
format!(
"Statistics:\n Lines: {}\n Words: {}\n Bytes: {}\n Mean line length: {:.2}\n Median line length: {}\n Std deviation: {:.2}\n Min line length: {}\n Max line length: {}\n Empty lines: {}",
self.lines,
self.words,
self.bytes,
stats.mean_line_length,
stats.median_line_length,
stats.std_dev,
stats.min_line_length,
stats.max_line_length,
stats.empty_lines
)
} else {
String::new()
}
}
fn format_histogram(&self) -> String {
if let Some(ref hist) = self.histogram {
let mut sorted: Vec<_> = hist.iter().collect();
sorted.sort_by_key(|(k, _)| **k);
let max_count = *hist.values().max().unwrap_or(&1);
let max_bar_width = 50;
let mut result = String::from("Line Length Histogram:\n");
for (bucket, count) in sorted {
let bar_width =
((*count as f64 / max_count as f64) * max_bar_width as f64) as usize;
let bar = "â–ˆ".repeat(bar_width);
result.push_str(&format!(
" {:4}-{:4}: {:6} {}\n",
bucket,
bucket + 9,
count,
bar
));
}
result
} else {
String::new()
}
}
}
/// Runs every counter requested in `args` over `data` and returns the collected results.
///
/// When `args.code` is set the input is first passed through `count::filter_code_comments`;
/// otherwise, when `args.markdown` is set, through `count::filter_markdown_code`. All
/// counters then operate on the filtered bytes. Counters that are not requested keep the
/// defaults from `Counts::new()`.
fn process_data(data: &[u8], args: &config::Args) -> Counts {
    // Declared before the `if` so the filtered copy outlives the borrow taken from it.
    let stripped;
    let input = if args.code {
        stripped = count::filter_code_comments(data);
        &stripped
    } else if args.markdown {
        stripped = count::filter_markdown_code(data);
        &stripped
    } else {
        data
    };

    let mut result = Counts::new();

    if args.lines {
        result.lines = count::count_lines(input);
    }
    if args.words {
        result.words = count::count_all_words(input);
    }
    if args.chars {
        // With `args.fast` the byte length stands in for the character count.
        result.chars = if args.fast {
            input.len()
        } else {
            count::count_chars(input)
        };
    }
    if args.bytes {
        result.bytes = input.len();
    }
    if args.max_line_length {
        result.max_line_length = count::max_line_length(input);
    }
    if args.unique {
        result.unique_words = count::count_unique_words(input);
    }
    if let Some(needle) = args.pattern.as_ref() {
        result.pattern = count::count_pattern(input, needle.as_bytes());
    }
    if args.stats {
        // Copy field by field into the serializable mirror type.
        let computed = count::calculate_statistics(input);
        result.statistics = Some(Statistics {
            mean_line_length: computed.mean_line_length,
            median_line_length: computed.median_line_length,
            std_dev: computed.std_dev,
            min_line_length: computed.min_line_length,
            max_line_length: computed.max_line_length,
            empty_lines: computed.empty_lines,
        });
    }
    if args.histogram {
        result.histogram = Some(count::generate_histogram(input));
    }

    result
}
/// Runs `process_data` on `data` unless `count::is_binary` flags it.
///
/// Flagged input is reported on stderr and yields empty counts.
fn count_unless_binary(path: &str, data: &[u8], args: &config::Args) -> Counts {
    if count::is_binary(data) {
        eprintln!("kz: {}: binary file detected, skipping", path);
        return Counts::new();
    }
    process_data(data, args)
}

/// Counts the contents of the file at `path` according to `args`.
///
/// * When only the byte count is requested, the length reported by the file's metadata is
///   used without reading the file. If that length is 0 the file is drained instead.
/// * Regular files with a non-zero length are memory-mapped.
/// * Everything else is read into memory.
///
/// A reported length of 0 is never trusted on its own: FIFOs, character devices and
/// procfs entries all report 0 while still producing data when read.
///
/// Empty input yields `Counts::new()`. Input flagged by `count::is_binary` is reported on
/// stderr and also yields `Counts::new()`.
///
/// # Errors
///
/// Returns any I/O error raised while inspecting, opening, mapping or reading the file.
fn process_file(path: &str, args: &config::Args) -> io::Result<Counts> {
    let needs_only_bytes = args.bytes
        && !args.lines
        && !args.words
        && !args.chars
        && !args.max_line_length
        && !args.unique
        && args.pattern.is_none()
        && !args.stats
        && !args.histogram
        && !args.code
        && !args.markdown;

    if needs_only_bytes {
        let reported = std::fs::metadata(path)?.len();
        let mut counts = Counts::new();
        counts.bytes = if reported > 0 {
            reported as usize
        } else {
            // Length 0 may just mean "unknown" (pipe, device, procfs), so count the bytes
            // by draining the file without buffering it.
            io::copy(&mut File::open(path)?, &mut io::sink())? as usize
        };
        return Ok(counts);
    }

    let mut file = File::open(path)?;
    let metadata = file.metadata()?;

    if metadata.is_file() && metadata.len() > 0 {
        // SAFETY: the mapping is read-only and is dropped before this function returns.
        // As with any file-backed mapping, soundness relies on no other process
        // truncating the file while it is mapped; this tool accepts that risk in
        // exchange for not copying the file into memory.
        let mmap = unsafe { MmapOptions::new().map(&file)? };
        return Ok(count_unless_binary(path, &mmap, args));
    }

    // Zero-length regular files cannot be mapped, and non-regular files have no reliable
    // length, so both are read into memory.
    let mut buffer = Vec::new();
    file.read_to_end(&mut buffer)?;
    if buffer.is_empty() {
        return Ok(Counts::new());
    }
    Ok(count_unless_binary(path, &buffer, args))
}
/// Reads standard input to the end and counts it according to `args`.
///
/// Input flagged by `count::is_binary` is reported on stderr and yields empty counts.
///
/// # Errors
///
/// Returns the I/O error if reading standard input fails.
fn process_stdin(args: &config::Args) -> io::Result<Counts> {
    let mut input = Vec::new();
    io::stdin().read_to_end(&mut input)?;

    let counts = if count::is_binary(&input) {
        eprintln!("kz: stdin: binary data detected, skipping");
        Counts::new()
    } else {
        process_data(&input, args)
    };
    Ok(counts)
}
/// Reads a NUL-separated list of file names from `path`, or from standard input when
/// `path` is `"-"`.
///
/// Empty entries (for example from a trailing or doubled NUL) are dropped. Entries that
/// are not valid UTF-8 are skipped without a diagnostic, because the rest of the program
/// handles paths as `String`.
///
/// # Errors
///
/// Returns the I/O error if the list cannot be opened or read.
fn read_files_from_file(path: &str) -> io::Result<Vec<String>> {
    let mut raw = Vec::new();
    match path {
        "-" => io::stdin().read_to_end(&mut raw)?,
        _ => File::open(path)?.read_to_end(&mut raw)?,
    };

    let mut names = Vec::new();
    for entry in raw.split(|byte| *byte == 0) {
        if entry.is_empty() {
            continue;
        }
        if let Ok(name) = std::str::from_utf8(entry) {
            names.push(name.to_owned());
        }
    }
    Ok(names)
}
/// Builds the list of paths to process from `args.files0_from` and `args.files`.
///
/// * Names read from `args.files0_from` are taken as-is, without existence checks or
///   exclusion filtering.
/// * Every entry of `args.files` must exist. Anything that is not a directory is queued
///   directly: regular files, but also FIFOs, devices and sockets, which `process_file`
///   reads as streams.
/// * Directories require `args.recursive` and are walked with symlinks followed. Only
///   regular files that match none of the `args.exclude` globs are queued. Walked paths
///   that are not valid UTF-8 are skipped, because paths are handled as `String`.
///
/// # Errors
///
/// * `InvalidInput` for a malformed exclude glob, or a directory given without
///   `args.recursive`.
/// * `NotFound` for a missing path in `args.files`.
/// * Any I/O error from reading the `files0_from` list or walking a directory.
fn collect_files(args: &config::Args) -> io::Result<Vec<String>> {
    let mut exclude_builder = GlobSetBuilder::new();
    for pattern in &args.exclude {
        let glob =
            Glob::new(pattern).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
        exclude_builder.add(glob);
    }
    let exclude_set = exclude_builder
        .build()
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;

    let mut all_files = Vec::new();
    if let Some(ref files0_path) = args.files0_from {
        all_files.extend(read_files_from_file(files0_path)?);
    }

    for path_str in &args.files {
        let path = Path::new(path_str);
        if !path.exists() {
            return Err(io::Error::new(
                io::ErrorKind::NotFound,
                format!("{}: No such file or directory", path_str),
            ));
        }

        if !path.is_dir() {
            // Previously only `is_file()` paths were queued, so an explicitly named FIFO or
            // device was silently dropped even though `process_file` can read it.
            all_files.push(path_str.clone());
            continue;
        }

        if !args.recursive {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("{}: Is a directory (use -r for recursive)", path_str),
            ));
        }

        for entry in WalkDir::new(path).follow_links(true) {
            let entry = entry?;
            let entry_path = entry.path();
            // While recursing, only regular files are picked up; special files found in
            // the tree are ignored.
            if !entry_path.is_file() {
                continue;
            }
            if !args.exclude.is_empty() && exclude_set.is_match(entry_path) {
                continue;
            }
            if let Some(entry_str) = entry_path.to_str() {
                all_files.push(entry_str.to_string());
            }
        }
    }

    Ok(all_files)
}
fn main() {
let mut args = config::Args::parse();
if let Some(shell) = args.generate_completion {
let mut cmd = config::Args::command();
generate(shell, &mut cmd, "kz", &mut io::stdout());
return;
}
args.normalize();
if args.files.is_empty() && args.files0_from.is_none() {
if atty::is(atty::Stream::Stdin) {
eprintln!("kz: no input provided (use --help for usage)");
std::process::exit(1);
}
match process_stdin(&args) {
Ok(counts) => {
if args.json {
println!("{}", serde_json::to_string_pretty(&counts).unwrap());
} else if args.stats {
println!("{}", counts.format_stats());
} else if args.histogram {
println!("{}", counts.format_histogram());
} else {
let width = counts.max_value(&args).to_string().len().max(1);
println!("{}", counts.format(&args, "", width));
}
}
Err(e) => {
eprintln!("kz: stdin: {}", e);
std::process::exit(1);
}
}
return;
}
let files = match collect_files(&args) {
Ok(f) => f,
Err(e) => {
eprintln!("kz: {}", e);
std::process::exit(1);
}
};
if files.is_empty() {
eprintln!("kz: no files to process");
std::process::exit(1);
}
let show_total = files.len() > 1;
let file_results: Vec<_> = if files.len() == 1 {
files
.iter()
.map(|path| (path.clone(), process_file(path, &args)))
.collect()
} else {
files
.par_iter()
.map(|path| (path.clone(), process_file(path, &args)))
.collect()
};
let mut total = Counts::new();
let mut had_error = false;
let mut json_results = Vec::new();
for (path, result) in &file_results {
match result {
Ok(counts) => {
total.add(counts);
}
Err(e) => {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("kz: {}: {}", path, e);
had_error = true;
}
}
}
}
let width = total.max_value(&args).to_string().len().max(1);
for (path, result) in &file_results {
if let Ok(counts) = result {
if args.json {
continue;
} else if args.stats {
println!("\n{}", path);
println!("{}", counts.format_stats());
} else if args.histogram {
println!("\n{}", path);
println!("{}", counts.format_histogram());
} else {
println!("{}", counts.format(&args, path, width));
}
}
}
if args.json {
for (path, result) in &file_results {
if let Ok(counts) = result {
let mut json_obj = serde_json::Map::new();
json_obj.insert("file".to_string(), serde_json::Value::String(path.clone()));
json_obj.insert("counts".to_string(), serde_json::to_value(&counts).unwrap());
json_results.push(serde_json::Value::Object(json_obj));
}
}
if show_total {
let mut json_obj = serde_json::Map::new();
json_obj.insert(
"file".to_string(),
serde_json::Value::String("total".to_string()),
);
json_obj.insert("counts".to_string(), serde_json::to_value(&total).unwrap());
json_results.push(serde_json::Value::Object(json_obj));
}
println!(
"{}",
serde_json::to_string_pretty(&serde_json::Value::Array(json_results)).unwrap()
);
} else if show_total && !args.stats && !args.histogram {
println!("{}", total.format(&args, "total", width));
}
if had_error {
std::process::exit(1);
}
}