Skip to main content

dataprof_core/
validation.rs

1use anyhow::Result;
2use std::path::{Path, PathBuf};
3
/// Enhanced input validation with helpful error messages and suggestions.
///
/// This is a stateless unit type: it only serves as a namespace for the
/// associated validation functions, so it is cheap to copy and trivially
/// constructible via [`Default`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct InputValidator;
6
/// A failed validation: what went wrong, how to fix it, and a numeric code.
///
/// The type is `Clone` and comparable so callers can store, duplicate and
/// assert on errors without destructuring them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ValidationError {
    /// Description of the problem.
    pub message: String,
    /// Hint telling the user how to resolve the problem.
    pub suggestion: String,
    /// Numeric code attached to the failure (the validators in this file use
    /// errno-style values such as `2` and `22`).
    pub error_code: i32,
}

impl std::fmt::Display for ValidationError {
    /// Writes the message, a newline, then the suggestion.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}\n{}", self.message, self.suggestion)
    }
}

impl std::error::Error for ValidationError {}
21
22impl InputValidator {
23    /// Validate file input with helpful suggestions
24    pub fn validate_file_input(file_path: &Path) -> Result<(), ValidationError> {
25        // Check if file exists
26        if !file_path.exists() {
27            return Err(ValidationError {
28                message: format!("File not found: {}", file_path.display()),
29                suggestion: Self::generate_file_suggestions(file_path),
30                error_code: 2, // ENOENT
31            });
32        }
33
34        // Check if it's actually a file (not directory)
35        if file_path.is_dir() {
36            return Err(ValidationError {
37                message: format!("Path is a directory, not a file: {}", file_path.display()),
38                suggestion: "Use --recursive flag to process directories, or specify a file path"
39                    .to_string(),
40                error_code: 21, // EISDIR
41            });
42        }
43
44        // Check file extension
45        Self::validate_file_extension(file_path)?;
46
47        // Check file permissions
48        Self::validate_file_permissions(file_path)?;
49
50        // Check file size (warn for very large files)
51        Self::validate_file_size(file_path)?;
52
53        Ok(())
54    }
55
56    /// Validate configuration file
57    pub fn validate_config_file(config_path: &Path) -> Result<(), ValidationError> {
58        if !config_path.exists() {
59            return Err(ValidationError {
60                message: format!("Configuration file not found: {}", config_path.display()),
61                suggestion: format!(
62                    "Create a config file at {} or use 'dataprof --help' to see configuration options",
63                    config_path.display()
64                ),
65                error_code: 2,
66            });
67        }
68
69        // Check if it's a TOML file
70        if let Some(ext) = config_path.extension()
71            && ext != "toml"
72        {
73            return Err(ValidationError {
74                message: "Configuration file must have .toml extension".to_string(),
75                suggestion:
76                    "Rename your config file to have .toml extension (e.g., .dataprof.toml)"
77                        .to_string(),
78                error_code: 22, // EINVAL
79            });
80        }
81
82        Ok(())
83    }
84
85    /// Validate chunk size parameter
86    pub fn validate_chunk_size(chunk_size: usize) -> Result<(), ValidationError> {
87        if chunk_size == 0 {
88            return Err(ValidationError {
89                message: "Chunk size cannot be zero".to_string(),
90                suggestion: "Use a positive chunk size (e.g. `ChunkSize::Fixed(1000)`) or `ChunkSize::Adaptive` for automatic sizing".to_string(),
91                error_code: 22,
92            });
93        }
94
95        if chunk_size < 10 {
96            return Err(ValidationError {
97                message: format!("Chunk size too small: {}", chunk_size),
98                suggestion: "Use at least 10 rows per chunk for efficient processing".to_string(),
99                error_code: 22,
100            });
101        }
102
103        if chunk_size > 10_000_000 {
104            return Err(ValidationError {
105                message: format!("Chunk size very large: {}", chunk_size),
106                suggestion: "Consider using smaller chunks (< 10M rows) to avoid memory issues"
107                    .to_string(),
108                error_code: 22,
109            });
110        }
111
112        Ok(())
113    }
114
115    /// Validate sample size parameter
116    pub fn validate_sample_size(sample_size: usize) -> Result<(), ValidationError> {
117        if sample_size == 0 {
118            return Err(ValidationError {
119                message: "Sample size cannot be zero".to_string(),
120                suggestion:
121                    "Use a positive sample size (e.g., --sample 10000) or omit for full analysis"
122                        .to_string(),
123                error_code: 22,
124            });
125        }
126
127        if sample_size < 100 {
128            return Err(ValidationError {
129                message: format!("Sample size very small: {}", sample_size),
130                suggestion: "Use at least 100 samples for meaningful statistical analysis"
131                    .to_string(),
132                error_code: 22,
133            });
134        }
135
136        Ok(())
137    }
138
139    /// Validate conflicting arguments
140    pub fn validate_argument_combinations(
141        streaming: bool,
142        sample: Option<usize>,
143        progress: bool,
144        benchmark: bool,
145    ) -> Result<(), ValidationError> {
146        // Progress requires streaming
147        if progress && !streaming {
148            return Err(ValidationError {
149                message: "Progress display requires streaming mode".to_string(),
150                suggestion: "Add --streaming flag when using --progress".to_string(),
151                error_code: 22,
152            });
153        }
154
155        // Benchmark conflicts with other modes
156        if benchmark && streaming {
157            return Err(ValidationError {
158                message: "Benchmark mode conflicts with streaming".to_string(),
159                suggestion: "Use either --benchmark OR --streaming, not both".to_string(),
160                error_code: 22,
161            });
162        }
163
164        if benchmark && sample.is_some() {
165            return Err(ValidationError {
166                message: "Benchmark mode conflicts with sampling".to_string(),
167                suggestion: "Use either --benchmark OR --sample, not both".to_string(),
168                error_code: 22,
169            });
170        }
171
172        Ok(())
173    }
174
175    /// Generate helpful file suggestions
176    fn generate_file_suggestions(file_path: &Path) -> String {
177        let mut suggestions = Vec::new();
178
179        // Check parent directory
180        if let Some(parent) = file_path.parent() {
181            if parent.exists() {
182                // Look for similar files
183                if let Ok(entries) = std::fs::read_dir(parent) {
184                    let similar_files: Vec<PathBuf> = entries
185                        .filter_map(|entry| entry.ok())
186                        .filter(|entry| {
187                            if let Some(ext) = entry.path().extension() {
188                                matches!(ext.to_str(), Some("csv") | Some("json") | Some("jsonl"))
189                            } else {
190                                false
191                            }
192                        })
193                        .take(3)
194                        .map(|entry| entry.path())
195                        .collect();
196
197                    if !similar_files.is_empty() {
198                        suggestions.push(format!("Similar files found in {}:", parent.display()));
199                        for file in similar_files {
200                            suggestions.push(format!("  • {}", file.display()));
201                        }
202                    }
203                }
204            } else {
205                suggestions.push(format!(
206                    "Parent directory does not exist: {}",
207                    parent.display()
208                ));
209            }
210        }
211
212        // Check current directory
213        if file_path.is_relative() {
214            suggestions
215                .push("Try using an absolute path or check your current directory".to_string());
216        }
217
218        if suggestions.is_empty() {
219            "Check the file path and make sure the file exists".to_string()
220        } else {
221            suggestions.join("\n")
222        }
223    }
224
225    /// Validate file extension
226    fn validate_file_extension(file_path: &Path) -> Result<(), ValidationError> {
227        if let Some(ext) = file_path.extension().and_then(|e| e.to_str()) {
228            match ext.to_lowercase().as_str() {
229                "csv" | "json" | "jsonl" => Ok(()),
230                _ => Err(ValidationError {
231                    message: format!("Unsupported file format: .{}", ext),
232                    suggestion: "Supported formats: .csv, .json, .jsonl".to_string(),
233                    error_code: 22,
234                }),
235            }
236        } else {
237            Err(ValidationError {
238                message: "File has no extension or unrecognizable format".to_string(),
239                suggestion: "Use files with extensions: .csv, .json, or .jsonl".to_string(),
240                error_code: 22,
241            })
242        }
243    }
244
245    /// Validate file permissions
246    fn validate_file_permissions(file_path: &Path) -> Result<(), ValidationError> {
247        match std::fs::metadata(file_path) {
248            Ok(metadata) => {
249                if metadata.permissions().readonly() {
250                    // This is actually OK for reading, but warn if they might want to write
251                    log::debug!("File is read-only: {}", file_path.display());
252                }
253                Ok(())
254            }
255            Err(e) => Err(ValidationError {
256                message: format!("Cannot access file metadata: {}", e),
257                suggestion: "Check file permissions and try again".to_string(),
258                error_code: 13,
259            }),
260        }
261    }
262
263    /// Validate file size and provide warnings
264    fn validate_file_size(file_path: &Path) -> Result<(), ValidationError> {
265        match std::fs::metadata(file_path) {
266            Ok(metadata) => {
267                let size_mb = metadata.len() as f64 / 1_048_576.0;
268
269                if size_mb > 1000.0 {
270                    // Large file warning, not error
271                    log::warn!(
272                        "Large file detected ({:.1} MB). Consider using --streaming for better performance",
273                        size_mb
274                    );
275                }
276
277                if size_mb > 10_000.0 {
278                    return Err(ValidationError {
279                        message: format!("File very large: {:.1} GB", size_mb / 1024.0),
280                        suggestion: "Use --streaming --sample for very large files, or ensure sufficient memory".to_string(),
281                        error_code: 27, // EFBIG
282                    });
283                }
284
285                Ok(())
286            }
287            Err(e) => Err(ValidationError {
288                message: format!("Cannot check file size: {}", e),
289                suggestion: "Ensure file is accessible and try again".to_string(),
290                error_code: 13,
291            }),
292        }
293    }
294
295    /// Validate database connection string format
296    #[cfg(feature = "database")]
297    pub fn validate_database_connection(connection_string: &str) -> Result<(), ValidationError> {
298        if connection_string.is_empty() {
299            return Err(ValidationError {
300                message: "Database connection string is empty".to_string(),
301                suggestion:
302                    "Provide a valid connection string (e.g., postgresql://user:pass@host/db)"
303                        .to_string(),
304                error_code: 22,
305            });
306        }
307
308        // Basic format validation
309        if !connection_string.contains("://") {
310            return Err(ValidationError {
311                message: "Invalid connection string format".to_string(),
312                suggestion: "Use format: protocol://[user:password@]host[:port]/database"
313                    .to_string(),
314                error_code: 22,
315            });
316        }
317
318        // Check for supported protocols
319        let supported_protocols = ["postgresql", "postgres", "mysql", "sqlite"];
320        let protocol = connection_string.split("://").next().unwrap_or("");
321
322        if !supported_protocols.contains(&protocol) {
323            return Err(ValidationError {
324                message: format!("Unsupported database protocol: {}", protocol),
325                suggestion: format!("Supported protocols: {}", supported_protocols.join(", ")),
326                error_code: 22,
327            });
328        }
329
330        Ok(())
331    }
332
333    /// Validate glob pattern
334    pub fn validate_glob_pattern(pattern: &str) -> Result<(), ValidationError> {
335        if pattern.is_empty() {
336            return Err(ValidationError {
337                message: "Glob pattern is empty".to_string(),
338                suggestion: "Provide a valid glob pattern (e.g., \"data/**/*.csv\")".to_string(),
339                error_code: 22,
340            });
341        }
342
343        // Test if pattern compiles
344        match glob::Pattern::new(pattern) {
345            Ok(_) => Ok(()),
346            Err(e) => Err(ValidationError {
347                message: format!("Invalid glob pattern: {}", e),
348                suggestion: "Use valid glob syntax with *, **, ?, [abc], etc.".to_string(),
349                error_code: 22,
350            }),
351        }
352    }
353
354    /// Get appropriate exit code for validation error
355    pub fn get_exit_code(error: &ValidationError) -> i32 {
356        error.error_code
357    }
358}
359
/// Exit codes used by the application.
///
/// The first group reuses errno-style numbers (as found on Linux), which is
/// what the validators in this file put into `ValidationError::error_code`.
/// The second group holds application-specific codes.
pub mod exit_codes {
    /// Successful run.
    pub const SUCCESS: i32 = 0;
    /// Unspecified failure.
    pub const GENERAL_ERROR: i32 = 1;
    /// Path does not exist (ENOENT).
    pub const FILE_NOT_FOUND: i32 = 2;
    /// Access denied (EACCES).
    pub const PERMISSION_DENIED: i32 = 13;
    /// A directory was given where a file was expected (EISDIR).
    pub const IS_A_DIRECTORY: i32 = 21;
    /// Invalid argument (EINVAL).
    pub const INVALID_ARGUMENT: i32 = 22;
    /// File too large (EFBIG).
    pub const FILE_TOO_LARGE: i32 = 27;
    /// No space left on device (ENOSPC).
    pub const NO_SPACE_LEFT: i32 = 28;
    /// Broken pipe (EPIPE).
    pub const BROKEN_PIPE: i32 = 32;

    // Custom application codes
    // NOTE(review): 65-69 are also the values BSD `sysexits.h` assigns to
    // EX_DATAERR..EX_UNAVAILABLE, with different meanings for 66-69 - confirm
    // that overlap is acceptable for callers that interpret exit codes.
    /// Input data is not in the expected format.
    pub const INVALID_DATA_FORMAT: i32 = 65;
    /// Failure while processing data.
    pub const PROCESSING_ERROR: i32 = 66;
    /// Invalid or unusable configuration.
    pub const CONFIG_ERROR: i32 = 67;
    /// Database-related failure.
    pub const DATABASE_ERROR: i32 = 68;
    /// Network-related failure.
    pub const NETWORK_ERROR: i32 = 69;
}
378
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a temporary file whose name ends in `suffix`, write `contents`
    /// to it and flush, returning the handle so the file lives for the test.
    fn temp_file(suffix: &str, contents: &str) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).unwrap();
        write!(file, "{}", contents).unwrap();
        file.flush().unwrap();
        file
    }

    /// Error code of a failed validation, or `None` when validation passed.
    fn code_of(outcome: Result<(), ValidationError>) -> Option<i32> {
        outcome.err().map(|e| e.error_code)
    }

    // -- chunk size validation --

    #[test]
    fn test_chunk_size_zero_rejected() {
        let outcome = InputValidator::validate_chunk_size(0);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_chunk_size_too_small_rejected() {
        let outcome = InputValidator::validate_chunk_size(5);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_chunk_size_too_large_rejected() {
        let outcome = InputValidator::validate_chunk_size(20_000_000);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_chunk_size_valid() {
        // A typical value plus both inclusive boundaries.
        for size in [1000, 10, 10_000_000] {
            assert!(InputValidator::validate_chunk_size(size).is_ok());
        }
    }

    // -- sample size validation --

    #[test]
    fn test_sample_size_zero_rejected() {
        let outcome = InputValidator::validate_sample_size(0);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_sample_size_too_small_rejected() {
        let outcome = InputValidator::validate_sample_size(50);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_sample_size_valid() {
        for size in [100, 10_000] {
            assert!(InputValidator::validate_sample_size(size).is_ok());
        }
    }

    // -- argument combinations --

    #[test]
    fn test_progress_without_streaming_rejected() {
        let (streaming, sample, progress, benchmark) = (false, None, true, false);
        let outcome =
            InputValidator::validate_argument_combinations(streaming, sample, progress, benchmark);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_benchmark_with_streaming_rejected() {
        let (streaming, sample, progress, benchmark) = (true, None, false, true);
        let outcome =
            InputValidator::validate_argument_combinations(streaming, sample, progress, benchmark);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_benchmark_with_sample_rejected() {
        let (streaming, sample, progress, benchmark) = (false, Some(1000), false, true);
        let outcome =
            InputValidator::validate_argument_combinations(streaming, sample, progress, benchmark);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_valid_argument_combinations() {
        // (streaming, sample, progress, benchmark)
        let accepted = [
            (true, None, true, false),    // streaming + progress
            (false, None, false, false),  // no flags
            (false, None, false, true),   // benchmark alone
        ];
        for (streaming, sample, progress, benchmark) in accepted {
            let outcome = InputValidator::validate_argument_combinations(
                streaming, sample, progress, benchmark,
            );
            assert!(outcome.is_ok());
        }
    }

    // -- file validation --

    #[test]
    fn test_validate_file_nonexistent() {
        let outcome = InputValidator::validate_file_input(Path::new("/nonexistent/file.csv"));
        assert_eq!(code_of(outcome), Some(2)); // ENOENT
    }

    #[test]
    fn test_validate_file_directory_rejected() {
        let dir = tempfile::tempdir().unwrap();
        let outcome = InputValidator::validate_file_input(dir.path());
        assert_eq!(code_of(outcome), Some(21)); // EISDIR
    }

    #[test]
    fn test_validate_file_unsupported_extension() {
        let file = temp_file(".xlsx", "data");
        let outcome = InputValidator::validate_file_input(file.path());
        assert_eq!(code_of(outcome), Some(22)); // EINVAL
    }

    #[test]
    fn test_validate_file_valid_csv() {
        let file = temp_file(".csv", "a,b\n1,2\n");
        assert!(InputValidator::validate_file_input(file.path()).is_ok());
    }

    #[test]
    fn test_validate_file_valid_json() {
        let file = temp_file(".json", "[]");
        assert!(InputValidator::validate_file_input(file.path()).is_ok());
    }

    // -- glob pattern validation --

    #[test]
    fn test_glob_pattern_empty_rejected() {
        let outcome = InputValidator::validate_glob_pattern("");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_glob_pattern_valid() {
        for pattern in ["*.csv", "data/**/*.json"] {
            assert!(InputValidator::validate_glob_pattern(pattern).is_ok());
        }
    }

    // -- config file validation --

    #[test]
    fn test_config_file_nonexistent() {
        let outcome = InputValidator::validate_config_file(Path::new("/no/config.toml"));
        assert!(outcome.is_err());
    }

    #[test]
    fn test_config_file_wrong_extension() {
        let file = temp_file(".yaml", "key: value");
        assert!(InputValidator::validate_config_file(file.path()).is_err());
    }

    #[test]
    fn test_config_file_valid_toml() {
        let file = temp_file(".toml", "[settings]");
        assert!(InputValidator::validate_config_file(file.path()).is_ok());
    }
}