use anyhow::Result;
use std::path::{Path, PathBuf};
/// Namespace type for the input validation routines in this file.
///
/// It holds no data; every validator is an associated function invoked as
/// `InputValidator::validate_...(..)`.
pub struct InputValidator;
/// A rejected input, described for the end user.
///
/// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can copy an
/// error or compare it against an expected value with `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ValidationError {
    /// What was wrong with the input.
    pub message: String,
    /// Hint telling the user how to correct the input.
    pub suggestion: String,
    /// Numeric code attached to the failure; `InputValidator::get_exit_code`
    /// returns this value unchanged.
    pub error_code: i32,
}
/// User-facing rendering: the message, a line break, then the suggestion.
impl std::fmt::Display for ValidationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)?;
        f.write_str("\n")?;
        f.write_str(&self.suggestion)
    }
}
// Empty impl: the trait's default methods are used as-is. The `Debug` derive and the
// `Display` impl on `ValidationError` satisfy the trait's supertrait requirements.
impl std::error::Error for ValidationError {}
impl InputValidator {
/// Runs every file-level check on `file_path`, stopping at the first failure.
///
/// # Errors
///
/// * code `2` when the path does not exist (the suggestion comes from
///   `generate_file_suggestions`);
/// * code `21` when the path is a directory;
/// * otherwise whatever the extension, permission, or size check reports,
///   tried in that order.
pub fn validate_file_input(file_path: &Path) -> Result<(), ValidationError> {
    // Existence is tested before the directory check, so a missing path is
    // always reported as "not found".
    let path_problem = if !file_path.exists() {
        Some(ValidationError {
            message: format!("File not found: {}", file_path.display()),
            suggestion: Self::generate_file_suggestions(file_path),
            error_code: 2,
        })
    } else if file_path.is_dir() {
        Some(ValidationError {
            message: format!("Path is a directory, not a file: {}", file_path.display()),
            suggestion: String::from(
                "Use --recursive flag to process directories, or specify a file path",
            ),
            error_code: 21,
        })
    } else {
        None
    };

    match path_problem {
        Some(problem) => Err(problem),
        None => Self::validate_file_extension(file_path)
            .and_then(|()| Self::validate_file_permissions(file_path))
            .and_then(|()| Self::validate_file_size(file_path)),
    }
}
/// Checks that `config_path` points at an existing `.toml` file.
///
/// The extension comparison ignores ASCII case, matching how
/// `validate_file_extension` treats data files. A path with no extension at
/// all is rejected as well, since the contract is that the file *must* carry
/// a `.toml` extension.
///
/// # Errors
///
/// * code `2` when the path does not exist;
/// * code `22` when the extension is missing, not valid UTF-8, or not `toml`.
pub fn validate_config_file(config_path: &Path) -> Result<(), ValidationError> {
    if !config_path.exists() {
        return Err(ValidationError {
            message: format!("Configuration file not found: {}", config_path.display()),
            suggestion: format!(
                "Create a config file at {} or use 'dataprof --help' to see configuration options",
                config_path.display()
            ),
            error_code: 2,
        });
    }

    // `extension()` is `None` for names such as `config` or `.dataprof`; those
    // must fail the check rather than slip through it.
    let has_toml_extension = config_path
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("toml"));

    if !has_toml_extension {
        return Err(ValidationError {
            message: "Configuration file must have .toml extension".to_string(),
            suggestion: "Rename your config file to have .toml extension (e.g., .dataprof.toml)"
                .to_string(),
            error_code: 22,
        });
    }

    Ok(())
}
/// Accepts chunk sizes from 10 to 10,000,000 rows inclusive.
///
/// # Errors
///
/// Returns code `22` when `chunk_size` is zero, below 10, or above
/// 10,000,000; each case carries its own message and suggestion.
pub fn validate_chunk_size(chunk_size: usize) -> Result<(), ValidationError> {
    let (message, suggestion) = match chunk_size {
        0 => (
            String::from("Chunk size cannot be zero"),
            "Use a positive chunk size (e.g., --chunk-size 1000) or omit for adaptive sizing",
        ),
        1..=9 => (
            format!("Chunk size too small: {}", chunk_size),
            "Use at least 10 rows per chunk for efficient processing",
        ),
        oversized if oversized > 10_000_000 => (
            format!("Chunk size very large: {}", oversized),
            "Consider using smaller chunks (< 10M rows) to avoid memory issues",
        ),
        _ => return Ok(()),
    };

    Err(ValidationError {
        message,
        suggestion: String::from(suggestion),
        error_code: 22,
    })
}
/// Accepts sample sizes of at least 100.
///
/// # Errors
///
/// Returns code `22` when `sample_size` is zero or below 100; zero gets its
/// own message.
pub fn validate_sample_size(sample_size: usize) -> Result<(), ValidationError> {
    const MIN_SAMPLE_SIZE: usize = 100;

    let reject = |message: String, suggestion: &str| -> Result<(), ValidationError> {
        Err(ValidationError {
            message,
            suggestion: suggestion.to_owned(),
            error_code: 22,
        })
    };

    if sample_size >= MIN_SAMPLE_SIZE {
        Ok(())
    } else if sample_size == 0 {
        reject(
            String::from("Sample size cannot be zero"),
            "Use a positive sample size (e.g., --sample 10000) or omit for full analysis",
        )
    } else {
        reject(
            format!("Sample size very small: {}", sample_size),
            "Use at least 100 samples for meaningful statistical analysis",
        )
    }
}
/// Rejects flag combinations that cannot be used together.
///
/// # Errors
///
/// Returns code `22` for the first rule violated, checked in this order:
/// `progress` without `streaming`; `benchmark` with `streaming`;
/// `benchmark` with a `sample` value.
pub fn validate_argument_combinations(
    streaming: bool,
    sample: Option<usize>,
    progress: bool,
    benchmark: bool,
) -> Result<(), ValidationError> {
    // (violated?, message, suggestion) — listed in reporting priority.
    let rules: [(bool, &str, &str); 3] = [
        (
            progress && !streaming,
            "Progress display requires streaming mode",
            "Add --streaming flag when using --progress",
        ),
        (
            benchmark && streaming,
            "Benchmark mode conflicts with streaming",
            "Use either --benchmark OR --streaming, not both",
        ),
        (
            benchmark && sample.is_some(),
            "Benchmark mode conflicts with sampling",
            "Use either --benchmark OR --sample, not both",
        ),
    ];

    match rules.iter().find(|(violated, _, _)| *violated) {
        Some((_, message, suggestion)) => Err(ValidationError {
            message: String::from(*message),
            suggestion: String::from(*suggestion),
            error_code: 22,
        }),
        None => Ok(()),
    }
}
/// Builds the hint text shown when `file_path` does not exist.
///
/// The hint lists up to three files with a supported extension (`csv`,
/// `json`, `jsonl`, compared without regard to ASCII case) found in the
/// directory that should have contained the file, or says that this directory
/// is missing. Relative paths additionally get a reminder to check the
/// current directory. When nothing specific can be said, a generic hint is
/// returned. Lines are joined with `\n`.
fn generate_file_suggestions(file_path: &Path) -> String {
    let mut suggestions = Vec::new();

    if let Some(parent) = file_path.parent() {
        // For a bare relative name such as `data.csv`, `Path::parent` yields an
        // empty path. That stands for the current directory, but `exists` and
        // `read_dir` only understand it when spelled `.`.
        let search_dir = if parent.as_os_str().is_empty() {
            Path::new(".")
        } else {
            parent
        };

        if search_dir.exists() {
            // An unreadable directory is skipped silently: this is best-effort help.
            if let Ok(entries) = std::fs::read_dir(search_dir) {
                let similar_files: Vec<PathBuf> = entries
                    .filter_map(|entry| entry.ok())
                    .map(|entry| entry.path())
                    .filter(|path| {
                        path.extension()
                            .and_then(|ext| ext.to_str())
                            .is_some_and(|ext| {
                                matches!(
                                    ext.to_ascii_lowercase().as_str(),
                                    "csv" | "json" | "jsonl"
                                )
                            })
                    })
                    .take(3)
                    .collect();

                if !similar_files.is_empty() {
                    suggestions.push(format!(
                        "Similar files found in {}:",
                        search_dir.display()
                    ));
                    suggestions.extend(
                        similar_files
                            .iter()
                            .map(|file| format!("  • {}", file.display())),
                    );
                }
            }
        } else {
            suggestions.push(format!(
                "Parent directory does not exist: {}",
                search_dir.display()
            ));
        }
    }

    if file_path.is_relative() {
        suggestions
            .push("Try using an absolute path or check your current directory".to_string());
    }

    if suggestions.is_empty() {
        "Check the file path and make sure the file exists".to_string()
    } else {
        suggestions.join("\n")
    }
}
/// Accepts paths whose extension is `csv`, `json`, or `jsonl` in any letter case.
///
/// # Errors
///
/// Returns code `22` when the extension is absent, is not valid UTF-8, or is
/// not one of the supported formats. The unsupported-format message echoes
/// the extension exactly as written in the path.
fn validate_file_extension(file_path: &Path) -> Result<(), ValidationError> {
    let Some(ext) = file_path.extension().and_then(|raw| raw.to_str()) else {
        return Err(ValidationError {
            message: String::from("File has no extension or unrecognizable format"),
            suggestion: String::from("Use files with extensions: .csv, .json, or .jsonl"),
            error_code: 22,
        });
    };

    let normalized = ext.to_lowercase();
    if matches!(normalized.as_str(), "csv" | "json" | "jsonl") {
        return Ok(());
    }

    Err(ValidationError {
        message: format!("Unsupported file format: .{}", ext),
        suggestion: String::from("Supported formats: .csv, .json, .jsonl"),
        error_code: 22,
    })
}
/// Verifies that `file_path` can actually be opened for reading.
///
/// Querying metadata alone does not prove readability (it can succeed on a
/// file the process is not allowed to read), so the file is opened read-only
/// and the handle dropped again. A read-only file is fine for our purposes
/// and is only noted at debug level.
///
/// # Errors
///
/// Returns code `13` when the file cannot be opened or its metadata cannot
/// be queried.
fn validate_file_permissions(file_path: &Path) -> Result<(), ValidationError> {
    let file = std::fs::File::open(file_path).map_err(|e| ValidationError {
        message: format!("Cannot open file for reading: {}", e),
        suggestion: "Check file permissions and try again".to_string(),
        error_code: 13,
    })?;

    match file.metadata() {
        Ok(metadata) => {
            if metadata.permissions().readonly() {
                log::debug!("File is read-only: {}", file_path.display());
            }
            Ok(())
        }
        Err(e) => Err(ValidationError {
            message: format!("Cannot access file metadata: {}", e),
            suggestion: "Check file permissions and try again".to_string(),
            error_code: 13,
        }),
    }
}
/// Checks the size of `file_path`, measured in units of 1,048,576 bytes (MB).
///
/// Above 1000 MB a warning recommending `--streaming` is logged; above
/// 10,000 MB the file is rejected (the warning is logged in that case too).
///
/// # Errors
///
/// * code `13` when the file's metadata cannot be read;
/// * code `27` when the file is larger than 10,000 MB.
fn validate_file_size(file_path: &Path) -> Result<(), ValidationError> {
    let metadata = std::fs::metadata(file_path).map_err(|e| ValidationError {
        message: format!("Cannot check file size: {}", e),
        suggestion: String::from("Ensure file is accessible and try again"),
        error_code: 13,
    })?;

    let size_mb = metadata.len() as f64 / 1_048_576.0;
    let worth_a_warning = size_mb > 1000.0;
    let too_large = size_mb > 10_000.0;

    if worth_a_warning {
        log::warn!(
            "Large file detected ({:.1} MB). Consider using --streaming for better performance",
            size_mb
        );
    }

    if too_large {
        Err(ValidationError {
            message: format!("File very large: {:.1} GB", size_mb / 1024.0),
            suggestion: String::from(
                "Use --streaming --sample for very large files, or ensure sufficient memory",
            ),
            error_code: 27,
        })
    } else {
        Ok(())
    }
}
/// Performs a shallow syntax check on a database connection string.
///
/// The string must be non-empty, contain `://`, and the text before the
/// first `://` must be exactly one of `postgresql`, `postgres`, `mysql`, or
/// `sqlite`. Nothing after the separator is inspected.
///
/// # Errors
///
/// Returns code `22` for each of the three failure cases.
#[cfg(feature = "database")]
pub fn validate_database_connection(connection_string: &str) -> Result<(), ValidationError> {
    const SUPPORTED_PROTOCOLS: [&str; 4] = ["postgresql", "postgres", "mysql", "sqlite"];

    let invalid = |message: String, suggestion: String| ValidationError {
        message,
        suggestion,
        error_code: 22,
    };

    if connection_string.is_empty() {
        return Err(invalid(
            String::from("Database connection string is empty"),
            String::from(
                "Provide a valid connection string (e.g., postgresql://user:pass@host/db)",
            ),
        ));
    }

    // Splitting at the first `://` both detects a missing separator and
    // isolates the protocol in one step.
    let Some((protocol, _rest)) = connection_string.split_once("://") else {
        return Err(invalid(
            String::from("Invalid connection string format"),
            String::from("Use format: protocol://[user:password@]host[:port]/database"),
        ));
    };

    if SUPPORTED_PROTOCOLS.iter().any(|known| *known == protocol) {
        return Ok(());
    }

    Err(invalid(
        format!("Unsupported database protocol: {}", protocol),
        format!("Supported protocols: {}", SUPPORTED_PROTOCOLS.join(", ")),
    ))
}
/// Checks that `pattern` is non-empty and is accepted by `glob::Pattern::new`.
///
/// # Errors
///
/// Returns code `22` for an empty pattern or for one `glob::Pattern::new`
/// rejects; in the latter case the message includes the parser's own error.
pub fn validate_glob_pattern(pattern: &str) -> Result<(), ValidationError> {
    if pattern.is_empty() {
        return Err(ValidationError {
            message: String::from("Glob pattern is empty"),
            suggestion: String::from("Provide a valid glob pattern (e.g., \"data/**/*.csv\")"),
            error_code: 22,
        });
    }

    // Only the verdict matters; the compiled pattern itself is discarded.
    glob::Pattern::new(pattern)
        .map(|_compiled| ())
        .map_err(|e| ValidationError {
            message: format!("Invalid glob pattern: {}", e),
            suggestion: String::from("Use valid glob syntax with *, **, ?, [abc], etc."),
            error_code: 22,
        })
}
pub fn get_exit_code(error: &ValidationError) -> i32 {
error.error_code
}
}
/// Named numeric codes for the ways a run can end.
///
/// The validators in this file attach the values 2, 13, 22 and 27 to their
/// errors as integer literals; those literals match `FILE_NOT_FOUND`,
/// `PERMISSION_DENIED`, `INVALID_ARGUMENT` and `FILE_TOO_LARGE` below.
/// NOTE(review): `validate_file_input` also uses 21 for "path is a directory",
/// which has no named constant here — consider adding one.
/// NOTE(review): the remaining constants are not referenced in the visible part
/// of this file; their meaning is taken from their names only.
pub mod exit_codes {
    /// Run completed without error.
    pub const SUCCESS: i32 = 0;
    /// Failure with no more specific code.
    pub const GENERAL_ERROR: i32 = 1;
    /// A required file does not exist.
    pub const FILE_NOT_FOUND: i32 = 2;
    /// A file could not be accessed.
    pub const PERMISSION_DENIED: i32 = 13;
    /// An argument value or combination was rejected.
    pub const INVALID_ARGUMENT: i32 = 22;
    /// An input file exceeds the accepted size.
    pub const FILE_TOO_LARGE: i32 = 27;
    /// Presumably: the output device is full — confirm against callers.
    pub const NO_SPACE_LEFT: i32 = 28;
    /// Presumably: the reader of our output went away — confirm against callers.
    pub const BROKEN_PIPE: i32 = 32;
    /// Presumably: input content could not be parsed — confirm against callers.
    pub const INVALID_DATA_FORMAT: i32 = 65;
    /// Presumably: failure while processing data — confirm against callers.
    pub const PROCESSING_ERROR: i32 = 66;
    /// Presumably: configuration problem — confirm against callers.
    pub const CONFIG_ERROR: i32 = 67;
    /// Presumably: database failure — confirm against callers.
    pub const DATABASE_ERROR: i32 = 68;
    /// Presumably: network failure — confirm against callers.
    pub const NETWORK_ERROR: i32 = 69;
}
// Unit tests for `InputValidator`. File-based cases create real temporary files
// through the `tempfile` crate so that the existence and extension checks run
// against the actual filesystem.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // ---- validate_chunk_size ----

    #[test]
    fn test_chunk_size_zero_rejected() {
        assert!(InputValidator::validate_chunk_size(0).is_err());
    }
    #[test]
    fn test_chunk_size_too_small_rejected() {
        assert!(InputValidator::validate_chunk_size(5).is_err());
    }
    #[test]
    fn test_chunk_size_too_large_rejected() {
        assert!(InputValidator::validate_chunk_size(20_000_000).is_err());
    }
    // Both ends of the accepted range (10 and 10_000_000) are inclusive.
    #[test]
    fn test_chunk_size_valid() {
        assert!(InputValidator::validate_chunk_size(1000).is_ok());
        assert!(InputValidator::validate_chunk_size(10).is_ok());
        assert!(InputValidator::validate_chunk_size(10_000_000).is_ok());
    }

    // ---- validate_sample_size ----

    #[test]
    fn test_sample_size_zero_rejected() {
        assert!(InputValidator::validate_sample_size(0).is_err());
    }
    #[test]
    fn test_sample_size_too_small_rejected() {
        assert!(InputValidator::validate_sample_size(50).is_err());
    }
    // 100 is the smallest accepted sample size.
    #[test]
    fn test_sample_size_valid() {
        assert!(InputValidator::validate_sample_size(100).is_ok());
        assert!(InputValidator::validate_sample_size(10_000).is_ok());
    }

    // ---- validate_argument_combinations ----
    // Argument order: (streaming, sample, progress, benchmark).

    #[test]
    fn test_progress_without_streaming_rejected() {
        let result = InputValidator::validate_argument_combinations(false, None, true, false);
        assert!(result.is_err());
    }
    #[test]
    fn test_benchmark_with_streaming_rejected() {
        let result = InputValidator::validate_argument_combinations(true, None, false, true);
        assert!(result.is_err());
    }
    #[test]
    fn test_benchmark_with_sample_rejected() {
        let result = InputValidator::validate_argument_combinations(false, Some(1000), false, true);
        assert!(result.is_err());
    }
    // Streaming with progress, no flags at all, and benchmark alone are all allowed.
    #[test]
    fn test_valid_argument_combinations() {
        assert!(InputValidator::validate_argument_combinations(true, None, true, false).is_ok());
        assert!(InputValidator::validate_argument_combinations(false, None, false, false).is_ok());
        assert!(InputValidator::validate_argument_combinations(false, None, false, true).is_ok());
    }

    // ---- validate_file_input ----

    // A missing file is reported with error code 2.
    #[test]
    fn test_validate_file_nonexistent() {
        let result = InputValidator::validate_file_input(Path::new("/nonexistent/file.csv"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.error_code, 2); }
    // A directory is reported with error code 21.
    #[test]
    fn test_validate_file_directory_rejected() {
        let dir = tempfile::tempdir().unwrap();
        let result = InputValidator::validate_file_input(dir.path());
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.error_code, 21); }
    // An existing file with an unsupported extension is reported with error code 22.
    #[test]
    fn test_validate_file_unsupported_extension() {
        let mut f = NamedTempFile::with_suffix(".xlsx").unwrap();
        write!(f, "data").unwrap();
        f.flush().unwrap();
        let result = InputValidator::validate_file_input(f.path());
        assert!(result.is_err());
        assert_eq!(result.unwrap_err().error_code, 22); }
    #[test]
    fn test_validate_file_valid_csv() {
        let mut f = NamedTempFile::with_suffix(".csv").unwrap();
        write!(f, "a,b\n1,2\n").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_file_input(f.path()).is_ok());
    }
    #[test]
    fn test_validate_file_valid_json() {
        let mut f = NamedTempFile::with_suffix(".json").unwrap();
        write!(f, "[]").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_file_input(f.path()).is_ok());
    }

    // ---- validate_glob_pattern ----

    #[test]
    fn test_glob_pattern_empty_rejected() {
        assert!(InputValidator::validate_glob_pattern("").is_err());
    }
    #[test]
    fn test_glob_pattern_valid() {
        assert!(InputValidator::validate_glob_pattern("*.csv").is_ok());
        assert!(InputValidator::validate_glob_pattern("data/**/*.json").is_ok());
    }

    // ---- validate_config_file ----

    #[test]
    fn test_config_file_nonexistent() {
        assert!(InputValidator::validate_config_file(Path::new("/no/config.toml")).is_err());
    }
    // The file exists, so the failure here comes from the extension check.
    #[test]
    fn test_config_file_wrong_extension() {
        let mut f = NamedTempFile::with_suffix(".yaml").unwrap();
        write!(f, "key: value").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_config_file(f.path()).is_err());
    }
    #[test]
    fn test_config_file_valid_toml() {
        let mut f = NamedTempFile::with_suffix(".toml").unwrap();
        write!(f, "[settings]").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_config_file(f.path()).is_ok());
    }
}