1use anyhow::Result;
2use std::path::{Path, PathBuf};
3
/// Namespace type for the input-validation helpers defined in this file.
///
/// It has no fields; every validator is an associated function that is called
/// as `InputValidator::validate_*(..)` without an instance.
pub struct InputValidator;
6
/// A failed validation: what went wrong, how to fix it, and a numeric code.
#[derive(Debug)]
pub struct ValidationError {
    /// Description of the problem.
    pub message: String,
    /// Remediation hint; the `Display` impl prints it on the line after `message`.
    pub suggestion: String,
    /// Numeric code, returned unchanged by `InputValidator::get_exit_code`.
    pub error_code: i32,
}
13
14impl std::fmt::Display for ValidationError {
15 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
16 write!(f, "{}\n{}", self.message, self.suggestion)
17 }
18}
19
// Marker impl with no overrides: the default `std::error::Error` methods are used,
// which lets `ValidationError` be used wherever a `std::error::Error` is expected.
impl std::error::Error for ValidationError {}
21
22impl InputValidator {
23 pub fn validate_file_input(file_path: &Path) -> Result<(), ValidationError> {
25 if !file_path.exists() {
27 return Err(ValidationError {
28 message: format!("File not found: {}", file_path.display()),
29 suggestion: Self::generate_file_suggestions(file_path),
30 error_code: 2, });
32 }
33
34 if file_path.is_dir() {
36 return Err(ValidationError {
37 message: format!("Path is a directory, not a file: {}", file_path.display()),
38 suggestion: "Use --recursive flag to process directories, or specify a file path"
39 .to_string(),
40 error_code: 21, });
42 }
43
44 Self::validate_file_extension(file_path)?;
46
47 Self::validate_file_permissions(file_path)?;
49
50 Self::validate_file_size(file_path)?;
52
53 Ok(())
54 }
55
56 pub fn validate_config_file(config_path: &Path) -> Result<(), ValidationError> {
58 if !config_path.exists() {
59 return Err(ValidationError {
60 message: format!("Configuration file not found: {}", config_path.display()),
61 suggestion: format!(
62 "Create a config file at {} or use 'dataprof --help' to see configuration options",
63 config_path.display()
64 ),
65 error_code: 2,
66 });
67 }
68
69 if let Some(ext) = config_path.extension()
71 && ext != "toml"
72 {
73 return Err(ValidationError {
74 message: "Configuration file must have .toml extension".to_string(),
75 suggestion:
76 "Rename your config file to have .toml extension (e.g., .dataprof.toml)"
77 .to_string(),
78 error_code: 22, });
80 }
81
82 Ok(())
83 }
84
85 pub fn validate_chunk_size(chunk_size: usize) -> Result<(), ValidationError> {
87 if chunk_size == 0 {
88 return Err(ValidationError {
89 message: "Chunk size cannot be zero".to_string(),
90 suggestion: "Use a positive chunk size (e.g. `ChunkSize::Fixed(1000)`) or `ChunkSize::Adaptive` for automatic sizing".to_string(),
91 error_code: 22,
92 });
93 }
94
95 if chunk_size < 10 {
96 return Err(ValidationError {
97 message: format!("Chunk size too small: {}", chunk_size),
98 suggestion: "Use at least 10 rows per chunk for efficient processing".to_string(),
99 error_code: 22,
100 });
101 }
102
103 if chunk_size > 10_000_000 {
104 return Err(ValidationError {
105 message: format!("Chunk size very large: {}", chunk_size),
106 suggestion: "Consider using smaller chunks (< 10M rows) to avoid memory issues"
107 .to_string(),
108 error_code: 22,
109 });
110 }
111
112 Ok(())
113 }
114
115 pub fn validate_sample_size(sample_size: usize) -> Result<(), ValidationError> {
117 if sample_size == 0 {
118 return Err(ValidationError {
119 message: "Sample size cannot be zero".to_string(),
120 suggestion:
121 "Use a positive sample size (e.g., --sample 10000) or omit for full analysis"
122 .to_string(),
123 error_code: 22,
124 });
125 }
126
127 if sample_size < 100 {
128 return Err(ValidationError {
129 message: format!("Sample size very small: {}", sample_size),
130 suggestion: "Use at least 100 samples for meaningful statistical analysis"
131 .to_string(),
132 error_code: 22,
133 });
134 }
135
136 Ok(())
137 }
138
139 pub fn validate_argument_combinations(
141 streaming: bool,
142 sample: Option<usize>,
143 progress: bool,
144 benchmark: bool,
145 ) -> Result<(), ValidationError> {
146 if progress && !streaming {
148 return Err(ValidationError {
149 message: "Progress display requires streaming mode".to_string(),
150 suggestion: "Add --streaming flag when using --progress".to_string(),
151 error_code: 22,
152 });
153 }
154
155 if benchmark && streaming {
157 return Err(ValidationError {
158 message: "Benchmark mode conflicts with streaming".to_string(),
159 suggestion: "Use either --benchmark OR --streaming, not both".to_string(),
160 error_code: 22,
161 });
162 }
163
164 if benchmark && sample.is_some() {
165 return Err(ValidationError {
166 message: "Benchmark mode conflicts with sampling".to_string(),
167 suggestion: "Use either --benchmark OR --sample, not both".to_string(),
168 error_code: 22,
169 });
170 }
171
172 Ok(())
173 }
174
175 fn generate_file_suggestions(file_path: &Path) -> String {
177 let mut suggestions = Vec::new();
178
179 if let Some(parent) = file_path.parent() {
181 if parent.exists() {
182 if let Ok(entries) = std::fs::read_dir(parent) {
184 let similar_files: Vec<PathBuf> = entries
185 .filter_map(|entry| entry.ok())
186 .filter(|entry| {
187 if let Some(ext) = entry.path().extension() {
188 matches!(ext.to_str(), Some("csv") | Some("json") | Some("jsonl"))
189 } else {
190 false
191 }
192 })
193 .take(3)
194 .map(|entry| entry.path())
195 .collect();
196
197 if !similar_files.is_empty() {
198 suggestions.push(format!("Similar files found in {}:", parent.display()));
199 for file in similar_files {
200 suggestions.push(format!(" • {}", file.display()));
201 }
202 }
203 }
204 } else {
205 suggestions.push(format!(
206 "Parent directory does not exist: {}",
207 parent.display()
208 ));
209 }
210 }
211
212 if file_path.is_relative() {
214 suggestions
215 .push("Try using an absolute path or check your current directory".to_string());
216 }
217
218 if suggestions.is_empty() {
219 "Check the file path and make sure the file exists".to_string()
220 } else {
221 suggestions.join("\n")
222 }
223 }
224
225 fn validate_file_extension(file_path: &Path) -> Result<(), ValidationError> {
227 if let Some(ext) = file_path.extension().and_then(|e| e.to_str()) {
228 match ext.to_lowercase().as_str() {
229 "csv" | "json" | "jsonl" => Ok(()),
230 _ => Err(ValidationError {
231 message: format!("Unsupported file format: .{}", ext),
232 suggestion: "Supported formats: .csv, .json, .jsonl".to_string(),
233 error_code: 22,
234 }),
235 }
236 } else {
237 Err(ValidationError {
238 message: "File has no extension or unrecognizable format".to_string(),
239 suggestion: "Use files with extensions: .csv, .json, or .jsonl".to_string(),
240 error_code: 22,
241 })
242 }
243 }
244
245 fn validate_file_permissions(file_path: &Path) -> Result<(), ValidationError> {
247 match std::fs::metadata(file_path) {
248 Ok(metadata) => {
249 if metadata.permissions().readonly() {
250 log::debug!("File is read-only: {}", file_path.display());
252 }
253 Ok(())
254 }
255 Err(e) => Err(ValidationError {
256 message: format!("Cannot access file metadata: {}", e),
257 suggestion: "Check file permissions and try again".to_string(),
258 error_code: 13,
259 }),
260 }
261 }
262
263 fn validate_file_size(file_path: &Path) -> Result<(), ValidationError> {
265 match std::fs::metadata(file_path) {
266 Ok(metadata) => {
267 let size_mb = metadata.len() as f64 / 1_048_576.0;
268
269 if size_mb > 1000.0 {
270 log::warn!(
272 "Large file detected ({:.1} MB). Consider using --streaming for better performance",
273 size_mb
274 );
275 }
276
277 if size_mb > 10_000.0 {
278 return Err(ValidationError {
279 message: format!("File very large: {:.1} GB", size_mb / 1024.0),
280 suggestion: "Use --streaming --sample for very large files, or ensure sufficient memory".to_string(),
281 error_code: 27, });
283 }
284
285 Ok(())
286 }
287 Err(e) => Err(ValidationError {
288 message: format!("Cannot check file size: {}", e),
289 suggestion: "Ensure file is accessible and try again".to_string(),
290 error_code: 13,
291 }),
292 }
293 }
294
295 #[cfg(feature = "database")]
297 pub fn validate_database_connection(connection_string: &str) -> Result<(), ValidationError> {
298 if connection_string.is_empty() {
299 return Err(ValidationError {
300 message: "Database connection string is empty".to_string(),
301 suggestion:
302 "Provide a valid connection string (e.g., postgresql://user:pass@host/db)"
303 .to_string(),
304 error_code: 22,
305 });
306 }
307
308 if !connection_string.contains("://") {
310 return Err(ValidationError {
311 message: "Invalid connection string format".to_string(),
312 suggestion: "Use format: protocol://[user:password@]host[:port]/database"
313 .to_string(),
314 error_code: 22,
315 });
316 }
317
318 let supported_protocols = ["postgresql", "postgres", "mysql", "sqlite"];
320 let protocol = connection_string.split("://").next().unwrap_or("");
321
322 if !supported_protocols.contains(&protocol) {
323 return Err(ValidationError {
324 message: format!("Unsupported database protocol: {}", protocol),
325 suggestion: format!("Supported protocols: {}", supported_protocols.join(", ")),
326 error_code: 22,
327 });
328 }
329
330 Ok(())
331 }
332
333 pub fn validate_glob_pattern(pattern: &str) -> Result<(), ValidationError> {
335 if pattern.is_empty() {
336 return Err(ValidationError {
337 message: "Glob pattern is empty".to_string(),
338 suggestion: "Provide a valid glob pattern (e.g., \"data/**/*.csv\")".to_string(),
339 error_code: 22,
340 });
341 }
342
343 match glob::Pattern::new(pattern) {
345 Ok(_) => Ok(()),
346 Err(e) => Err(ValidationError {
347 message: format!("Invalid glob pattern: {}", e),
348 suggestion: "Use valid glob syntax with *, **, ?, [abc], etc.".to_string(),
349 error_code: 22,
350 }),
351 }
352 }
353
354 pub fn get_exit_code(error: &ValidationError) -> i32 {
356 error.error_code
357 }
358}
359
/// Named numeric codes.
///
/// The validators in this file store matching values in
/// `ValidationError::error_code` (2, 13, 21, 22 and 27).
///
/// NOTE(review): the values below 64 look like they mirror Linux `errno`
/// numbers — confirm before relying on that.
pub mod exit_codes {
    /// No error.
    pub const SUCCESS: i32 = 0;
    /// Unclassified failure.
    pub const GENERAL_ERROR: i32 = 1;
    /// A required file (input or configuration) does not exist.
    pub const FILE_NOT_FOUND: i32 = 2;
    /// File metadata could not be read.
    pub const PERMISSION_DENIED: i32 = 13;
    /// A directory was given where a file was expected. This value was already
    /// emitted by `InputValidator::validate_file_input` but had no named constant.
    pub const IS_A_DIRECTORY: i32 = 21;
    /// An argument value or combination of arguments was rejected.
    pub const INVALID_ARGUMENT: i32 = 22;
    /// The input file exceeds the size limit.
    pub const FILE_TOO_LARGE: i32 = 27;
    pub const NO_SPACE_LEFT: i32 = 28;
    pub const BROKEN_PIPE: i32 = 32;

    // Codes from 65 upward are not produced by any validator in this file.
    pub const INVALID_DATA_FORMAT: i32 = 65;
    pub const PROCESSING_ERROR: i32 = 66;
    pub const CONFIG_ERROR: i32 = 67;
    pub const DATABASE_ERROR: i32 = 68;
    pub const NETWORK_ERROR: i32 = 69;
}
378
// Unit tests for `InputValidator`. File-based cases create real temporary files
// so that the existence and extension checks run against the file system.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // ---- validate_chunk_size ----

    #[test]
    fn test_chunk_size_zero_rejected() {
        assert!(InputValidator::validate_chunk_size(0).is_err());
    }

    #[test]
    fn test_chunk_size_too_small_rejected() {
        assert!(InputValidator::validate_chunk_size(5).is_err());
    }

    #[test]
    fn test_chunk_size_too_large_rejected() {
        assert!(InputValidator::validate_chunk_size(20_000_000).is_err());
    }

    #[test]
    fn test_chunk_size_valid() {
        assert!(InputValidator::validate_chunk_size(1000).is_ok());
        // Both ends of the accepted range are inclusive.
        assert!(InputValidator::validate_chunk_size(10).is_ok());
        assert!(InputValidator::validate_chunk_size(10_000_000).is_ok());
    }

    // ---- validate_sample_size ----

    #[test]
    fn test_sample_size_zero_rejected() {
        assert!(InputValidator::validate_sample_size(0).is_err());
    }

    #[test]
    fn test_sample_size_too_small_rejected() {
        assert!(InputValidator::validate_sample_size(50).is_err());
    }

    #[test]
    fn test_sample_size_valid() {
        // 100 is the smallest accepted value.
        assert!(InputValidator::validate_sample_size(100).is_ok());
        assert!(InputValidator::validate_sample_size(10_000).is_ok());
    }

    // ---- validate_argument_combinations ----
    // Argument order: (streaming, sample, progress, benchmark).

    #[test]
    fn test_progress_without_streaming_rejected() {
        let result = InputValidator::validate_argument_combinations(false, None, true, false);
        assert!(result.is_err());
    }

    #[test]
    fn test_benchmark_with_streaming_rejected() {
        let result = InputValidator::validate_argument_combinations(true, None, false, true);
        assert!(result.is_err());
    }

    #[test]
    fn test_benchmark_with_sample_rejected() {
        let result = InputValidator::validate_argument_combinations(false, Some(1000), false, true);
        assert!(result.is_err());
    }

    #[test]
    fn test_valid_argument_combinations() {
        // Streaming together with progress.
        assert!(InputValidator::validate_argument_combinations(true, None, true, false).is_ok());
        // No flags at all.
        assert!(InputValidator::validate_argument_combinations(false, None, false, false).is_ok());
        // Benchmark on its own.
        assert!(InputValidator::validate_argument_combinations(false, None, false, true).is_ok());
    }

    // ---- validate_file_input ----

    #[test]
    fn test_validate_file_nonexistent() {
        let result = InputValidator::validate_file_input(Path::new("/nonexistent/file.csv"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        // Same value as `exit_codes::FILE_NOT_FOUND`.
        assert_eq!(err.error_code, 2);
    }

    #[test]
    fn test_validate_file_directory_rejected() {
        let dir = tempfile::tempdir().unwrap();
        let result = InputValidator::validate_file_input(dir.path());
        assert!(result.is_err());
        let err = result.unwrap_err();
        // Code that `validate_file_input` uses for directories.
        assert_eq!(err.error_code, 21);
    }

    #[test]
    fn test_validate_file_unsupported_extension() {
        let mut f = NamedTempFile::with_suffix(".xlsx").unwrap();
        write!(f, "data").unwrap();
        f.flush().unwrap();
        let result = InputValidator::validate_file_input(f.path());
        assert!(result.is_err());
        // Same value as `exit_codes::INVALID_ARGUMENT`.
        assert_eq!(result.unwrap_err().error_code, 22);
    }

    #[test]
    fn test_validate_file_valid_csv() {
        let mut f = NamedTempFile::with_suffix(".csv").unwrap();
        write!(f, "a,b\n1,2\n").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_file_input(f.path()).is_ok());
    }

    #[test]
    fn test_validate_file_valid_json() {
        let mut f = NamedTempFile::with_suffix(".json").unwrap();
        write!(f, "[]").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_file_input(f.path()).is_ok());
    }

    // ---- validate_glob_pattern ----

    #[test]
    fn test_glob_pattern_empty_rejected() {
        assert!(InputValidator::validate_glob_pattern("").is_err());
    }

    #[test]
    fn test_glob_pattern_valid() {
        assert!(InputValidator::validate_glob_pattern("*.csv").is_ok());
        assert!(InputValidator::validate_glob_pattern("data/**/*.json").is_ok());
    }

    // ---- validate_config_file ----

    #[test]
    fn test_config_file_nonexistent() {
        assert!(InputValidator::validate_config_file(Path::new("/no/config.toml")).is_err());
    }

    #[test]
    fn test_config_file_wrong_extension() {
        // The file exists, so only the extension check can fail here.
        let mut f = NamedTempFile::with_suffix(".yaml").unwrap();
        write!(f, "key: value").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_config_file(f.path()).is_err());
    }

    #[test]
    fn test_config_file_valid_toml() {
        let mut f = NamedTempFile::with_suffix(".toml").unwrap();
        write!(f, "[settings]").unwrap();
        f.flush().unwrap();
        assert!(InputValidator::validate_config_file(f.path()).is_ok());
    }
}