use crate::cancellation::CancellationToken;
use crate::config::{Config, ProcessingConfig};
use crate::core_types::FileInfo;
use crate::errors::{io_error_with_path, Error, Result};
use crate::filtering::is_likely_text_from_buffer;
use log::debug;
use rayon::prelude::*;
use crate::core_types::FileContent;
pub mod counter;
pub mod filters;
pub use counter::calculate_counts;
use filters::ContentFilter;
use std::fs;
/// Borrowed, cheaply copyable view of the settings used when processing
/// in-memory content.
///
/// It holds two flags by value and a borrowed slice of filters, so it can be
/// passed around by value without cloning the filters themselves.
#[doc(hidden)]
#[derive(Debug, Clone, Copy)]
pub struct ProcessingOptions<'a> {
    /// When `false`, content detected as binary is dropped from the output.
    pub include_binary: bool,
    /// When `true`, counts are computed and attached to each result.
    pub counts: bool,
    /// Filters applied, in slice order, to non-binary content.
    pub content_filters: &'a [Box<dyn ContentFilter>],
}
/// Builds a [`ProcessingOptions`] view that borrows from a [`Config`].
impl<'a> From<&'a Config> for ProcessingOptions<'a> {
    /// Copies the two boolean flags out of `config.processing` and borrows its
    /// filter list for the lifetime of `config`.
    fn from(config: &'a Config) -> Self {
        let processing = &config.processing;
        Self {
            include_binary: processing.include_binary,
            counts: processing.counts,
            content_filters: &processing.content_filters,
        }
    }
}
/// Processes already-loaded file contents in parallel.
///
/// Each incoming [`FileContent`] is handled as follows:
///
/// 1. If `token` is cancelled, the item becomes `Err(Error::Interrupted)`.
/// 2. The bytes are classified as text or binary; binary items are dropped
///    unless `opts.include_binary` is set.
/// 3. The bytes are decoded lossily as UTF-8.
/// 4. If `opts.counts` is set, counts are attached. Binary items get zero
///    lines/words and the byte length as `characters`.
/// 5. For non-binary items, every filter in `opts.content_filters` is applied
///    in order; binary items keep the lossily decoded text unchanged.
///
/// The resulting `FileInfo::absolute_path` is a clone of the relative path,
/// since the content is not read from disk here.
///
/// All results are collected into a `Vec` before the iterator is returned, so
/// the work is finished by the time this function returns. The input goes
/// through `par_bridge`, which rayon documents as not preserving source order.
pub fn process_content<'a>(
    files_content: impl Iterator<Item = FileContent> + Send + 'a,
    opts: ProcessingOptions<'a>,
    token: &'a CancellationToken,
) -> impl Iterator<Item = Result<FileInfo>> {
    let results = files_content
        .par_bridge()
        .filter_map(move |entry| {
            if token.is_cancelled() {
                return Some(Err(Error::Interrupted));
            }
            debug!(
                "Processing content for: {}",
                entry.relative_path.display()
            );

            let bytes = &entry.content;
            let is_binary = !is_likely_text_from_buffer(bytes);
            if !opts.include_binary && is_binary {
                debug!(
                    "Skipping binary content: {}",
                    entry.relative_path.display()
                );
                return None;
            }

            let size = bytes.len() as u64;
            let text = String::from_utf8_lossy(bytes).into_owned();

            // Counts are taken from the unfiltered text, before any filter runs.
            let counts = if opts.counts {
                Some(if is_binary {
                    crate::core_types::FileCounts {
                        lines: 0,
                        characters: bytes.len(),
                        words: 0,
                    }
                } else {
                    calculate_counts(&text)
                })
            } else {
                None
            };

            // Filters only make sense for text; binary content passes through.
            let mut processed = text;
            if !is_binary {
                for filter in opts.content_filters {
                    processed = filter.apply(&processed);
                }
            }

            Some(Ok(FileInfo {
                absolute_path: entry.relative_path.clone(),
                relative_path: entry.relative_path,
                size,
                processed_content: Some(processed),
                counts,
                is_process_last: entry.is_process_last,
                process_last_order: entry.process_last_order,
                is_binary,
            }))
        })
        .collect::<Vec<_>>();

    results.into_iter()
}
/// Reads each file from disk and processes its content, as a parallel iterator.
///
/// For every incoming [`FileInfo`]:
///
/// * if `token` is cancelled, yields `Err(Error::Interrupted)`;
/// * reads `absolute_path`; a read failure is yielded as an error built by
///   `io_error_with_path`;
/// * records `is_binary` and drops binary files unless `config.include_binary`
///   is set;
/// * when `config.counts` is set, attaches counts (binary files get zero
///   lines/words and the byte length as `characters`);
/// * applies `config.content_filters` in order to non-binary files and stores
///   the result in `processed_content`. Binary files keep the lossily decoded
///   text unchanged.
///
/// The returned iterator borrows `config` and `token` for `'a`.
pub(crate) fn process_and_filter_files_internal<'a>(
    files: impl ParallelIterator<Item = FileInfo> + 'a,
    config: &'a ProcessingConfig,
    token: &'a CancellationToken,
) -> impl ParallelIterator<Item = Result<FileInfo>> + 'a {
    files.filter_map(move |mut file_info| {
        if token.is_cancelled() {
            return Some(Err(Error::Interrupted));
        }
        debug!("Processing file: {}", file_info.absolute_path.display());

        let raw = match fs::read(&file_info.absolute_path) {
            Ok(data) => data,
            Err(source) => {
                return Some(Err(io_error_with_path(source, &file_info.absolute_path)));
            }
        };

        let is_binary = !is_likely_text_from_buffer(&raw);
        file_info.is_binary = is_binary;
        if !config.include_binary && is_binary {
            debug!(
                "Skipping binary file: {}",
                file_info.relative_path.display()
            );
            return None;
        }

        let text = String::from_utf8_lossy(&raw).into_owned();

        if config.counts {
            // Counts describe the unfiltered content.
            let counts = if is_binary {
                crate::core_types::FileCounts {
                    lines: 0,
                    characters: raw.len(),
                    words: 0,
                }
            } else {
                calculate_counts(&text)
            };
            file_info.counts = Some(counts);
            debug!(
                "Calculated counts for {}: {:?}",
                file_info.relative_path.display(),
                file_info.counts
            );
        }

        let processed = if is_binary {
            debug!(
                "Skipping content filters for binary file {}",
                file_info.relative_path.display()
            );
            text
        } else {
            config.content_filters.iter().fold(text, |current, filter| {
                let next = filter.apply(&current);
                debug!(
                    "Applied filter '{}' to {}",
                    filter.name(),
                    file_info.relative_path.display()
                );
                next
            })
        };

        file_info.processed_content = Some(processed);
        Some(Ok(file_info))
    })
}
/// Processes `files` in parallel and returns the results as a plain iterator.
///
/// The sequential input is bridged onto rayon with `par_bridge`, run through
/// [`process_and_filter_files_internal`], and collected into a `Vec` before
/// returning. All file reads and filtering have therefore finished by the time
/// the caller receives the iterator.
pub fn process_files<'a>(
    files: impl Iterator<Item = FileInfo> + Send + 'a,
    config: &'a ProcessingConfig,
    token: &'a CancellationToken,
) -> impl Iterator<Item = Result<FileInfo>> {
    let parallel_input = files.par_bridge();
    let results: Vec<_> =
        process_and_filter_files_internal(parallel_input, config, token).collect();
    results.into_iter()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cancellation::CancellationToken;
    use crate::config::Config;
    use crate::core_types::FileInfo;
    use crate::processing::filters::{RemoveCommentsFilter, RemoveEmptyLinesFilter};
    use std::fs;
    use std::path::PathBuf;
    use tempfile::tempdir;

    /// Writes `content` to `test.rs` inside a fresh temporary directory and
    /// returns the directory guard together with a `FileInfo` pointing at it.
    /// The guard must be kept alive for as long as the file is needed.
    fn setup_test_file(content: &[u8]) -> (tempfile::TempDir, FileInfo) {
        let dir = tempdir().unwrap();
        let absolute_path = dir.path().join("test.rs");
        fs::write(&absolute_path, content).unwrap();
        (
            dir,
            FileInfo {
                absolute_path,
                relative_path: PathBuf::from("test.rs"),
                size: content.len() as u64,
                processed_content: None,
                counts: None,
                is_process_last: false,
                process_last_order: None,
                is_binary: false,
            },
        )
    }

    /// Runs a single file through the internal pipeline with a fresh,
    /// non-cancelled token and gathers the results.
    fn run_pipeline(file_info: FileInfo, config: &Config) -> Result<Vec<FileInfo>> {
        let token = CancellationToken::new();
        process_and_filter_files_internal(
            vec![file_info].into_par_iter(),
            &config.processing,
            &token,
        )
        .collect()
    }

    #[test]
    fn test_process_files_applies_filters_sequentially() -> Result<()> {
        let (_dir, file_info) = setup_test_file(b"// comment\n\nfn main() {}\n");

        let mut config = Config::new_for_test();
        let filters = &mut config.processing.content_filters;
        filters.push(Box::new(RemoveCommentsFilter));
        filters.push(Box::new(RemoveEmptyLinesFilter));

        let processed = run_pipeline(file_info, &config)?;

        assert_eq!(processed.len(), 1);
        assert_eq!(
            processed[0].processed_content.as_deref(),
            Some("fn main() {}")
        );
        Ok(())
    }

    #[test]
    fn test_process_files_no_filters() -> Result<()> {
        let original_content = b"// comment\n\nfn main() {}\n";
        let (_dir, file_info) = setup_test_file(original_content);

        let config = Config::new_for_test();
        assert!(config.processing.content_filters.is_empty());

        let processed = run_pipeline(file_info, &config)?;

        assert_eq!(processed.len(), 1);
        let expected = std::str::from_utf8(original_content).unwrap();
        assert_eq!(processed[0].processed_content.as_deref(), Some(expected));
        Ok(())
    }

    #[test]
    fn test_process_files_skips_filters_for_binary() -> Result<()> {
        let original_content = b"binary\0content";
        let (_dir, file_info) = setup_test_file(original_content);

        let mut config = Config::new_for_test();
        config.processing.include_binary = true;
        config
            .processing
            .content_filters
            .push(Box::new(RemoveCommentsFilter));

        let processed = run_pipeline(file_info, &config)?;

        assert_eq!(processed.len(), 1);
        assert!(processed[0].is_binary);
        // Binary content is expected to pass through as its lossy decoding.
        let expected_lossy = String::from_utf8_lossy(original_content);
        assert_eq!(
            processed[0].processed_content.as_deref(),
            Some(expected_lossy.as_ref())
        );
        Ok(())
    }
}