1use crate::error::BuildError;
19use once_cell::sync::Lazy;
20use regex::Regex;
21use std::collections::HashSet;
22use std::fs;
23use std::path::{Component, Path, PathBuf};
24
/// Default upper bound on the length of a sanitized path string; used by
/// `PathValidationConfig::default()`.
const MAX_PATH_LENGTH: usize = 260;

/// Default upper bound on the number of path components; used by
/// `PathValidationConfig::default()`.
const MAX_PATH_DEPTH: usize = 32;
30
31static DIRECTORY_TRAVERSAL: Lazy<Regex> =
33 Lazy::new(|| Regex::new(r"(?i)(\.\./|\.\.\x5c|/\.\./|\x5c\.\.\x5c)").unwrap());
34
35static ENCODED_TRAVERSAL: Lazy<Regex> =
37 Lazy::new(|| Regex::new(r"(?i)(%2e%2e%2f|%2e%2e%5c|%252e%252e%252f|%252e%252e%255c)").unwrap());
38
39static ABSOLUTE_PATH: Lazy<Regex> =
41 Lazy::new(|| Regex::new(r"(?i)(^[a-zA-Z]:\x5c|^/|^\x5c\x5c)").unwrap());
42
43static DANGEROUS_CHARS: Lazy<Regex> =
45 Lazy::new(|| Regex::new(r"[\x00-\x1F\x7F-\x9F]|%00").unwrap());
46
47static WINDOWS_RESERVED: Lazy<Regex> =
49 Lazy::new(|| Regex::new(r"(?i)^(con|prn|aux|nul|com[1-9]|lpt[1-9])(\.|$)").unwrap());
50
51static SUSPICIOUS_EXTENSIONS: Lazy<Regex> =
53 Lazy::new(|| Regex::new(r"(?i)\.(exe|bat|cmd|com|scr|pif|vbs|js|jar|dll|sys)$").unwrap());
54
55static WINDOWS_RESERVED_NAMES: Lazy<HashSet<&str>> = Lazy::new(|| {
57 let mut set = HashSet::new();
58 set.insert("CON");
59 set.insert("PRN");
60 set.insert("AUX");
61 set.insert("NUL");
62 set.insert("COM1");
63 set.insert("COM2");
64 set.insert("COM3");
65 set.insert("COM4");
66 set.insert("COM5");
67 set.insert("COM6");
68 set.insert("COM7");
69 set.insert("COM8");
70 set.insert("COM9");
71 set.insert("LPT1");
72 set.insert("LPT2");
73 set.insert("LPT3");
74 set.insert("LPT4");
75 set.insert("LPT5");
76 set.insert("LPT6");
77 set.insert("LPT7");
78 set.insert("LPT8");
79 set.insert("LPT9");
80 set
81});
82
/// Limits and policy switches consulted by `PathValidator`.
#[derive(Debug, Clone)]
pub struct PathValidationConfig {
    /// Maximum accepted length (`str::len`, i.e. bytes) of the sanitized path.
    pub max_path_length: usize,
    /// Maximum accepted number of path components after empty and `.`
    /// segments have been dropped.
    pub max_path_depth: usize,
    /// Directories a validated path must be located under.
    pub allowed_base_dirs: Vec<PathBuf>,
    /// When `true`, every relative path passes the base-directory check.
    pub allow_relative_outside_base: bool,
    /// When `true`, the symlink validation stage is run.
    pub validate_symlinks: bool,
    /// When `true`, validation fails for paths that do not exist.
    pub check_existence: bool,
    /// Extensions (compared lower-cased) that do not produce an
    /// "Unusual file extension" warning.
    pub allowed_extensions: HashSet<String>,
    /// When `false`, components starting with `.` are rejected.
    pub allow_hidden: bool,
}
103
104impl Default for PathValidationConfig {
105 fn default() -> Self {
106 let mut allowed_extensions = HashSet::new();
107 allowed_extensions.insert("xml".to_string());
108 allowed_extensions.insert("json".to_string());
109 allowed_extensions.insert("txt".to_string());
110 allowed_extensions.insert("csv".to_string());
111
112 Self {
113 max_path_length: MAX_PATH_LENGTH,
114 max_path_depth: MAX_PATH_DEPTH,
115 allowed_base_dirs: vec![
116 PathBuf::from("data"),
117 PathBuf::from("input"),
118 PathBuf::from("output"),
119 PathBuf::from("temp"),
120 PathBuf::from("."),
121 ],
122 allow_relative_outside_base: false,
123 validate_symlinks: true,
124 check_existence: false,
125 allowed_extensions,
126 allow_hidden: false,
127 }
128 }
129}
130
/// Outcome of a successful `PathValidator::validate` call.
#[derive(Debug, Clone)]
pub struct ValidatedPath {
    /// The path string exactly as it was passed to `validate`.
    pub original: String,
    /// Relative path rebuilt from the sanitized input with `/` separators and
    /// without empty or `.` segments.
    pub normalized: PathBuf,
    /// Canonical form of `normalized`, when one could be determined.
    pub canonical: Option<PathBuf>,
    /// Whether `normalized` existed on disk when it was validated.
    pub exists: bool,
    /// Non-fatal observations about the path (for example an unusual file
    /// extension or a very long file name).
    pub warnings: Vec<String>,
}
145
146#[derive(Debug, Clone)]
148pub struct PathValidator {
149 config: PathValidationConfig,
150}
151
152impl PathValidator {
153 pub fn new() -> Self {
155 Self {
156 config: PathValidationConfig::default(),
157 }
158 }
159
160 pub fn with_config(config: PathValidationConfig) -> Self {
162 Self { config }
163 }
164
165 pub fn validate(&self, path_str: &str) -> Result<ValidatedPath, BuildError> {
167 let sanitized_input = self.sanitize_input(path_str)?;
169
170 if sanitized_input.len() > self.config.max_path_length {
172 return Err(BuildError::InputSanitization(format!(
173 "Path too long: {} > {}",
174 sanitized_input.len(),
175 self.config.max_path_length
176 )));
177 }
178
179 self.detect_dangerous_patterns(&sanitized_input)?;
181
182 let normalized = self.normalize_path(&sanitized_input)?;
184
185 self.validate_components(&normalized)?;
187
188 self.validate_against_whitelist(&normalized)?;
190
191 let (canonical, exists) = self.safe_canonicalize(&normalized);
193
194 if self.config.validate_symlinks {
196 self.validate_symlinks(&normalized, &canonical)?;
197 }
198
199 if self.config.check_existence && !exists {
201 return Err(BuildError::InputSanitization(
202 "File does not exist".to_string(),
203 ));
204 }
205
206 let warnings = self.collect_warnings(&sanitized_input, &normalized);
207
208 Ok(ValidatedPath {
209 original: path_str.to_string(),
210 normalized,
211 canonical,
212 exists,
213 warnings,
214 })
215 }
216
217 fn sanitize_input(&self, input: &str) -> Result<String, BuildError> {
219 if input.contains('\0') {
221 return Err(BuildError::InputSanitization(
222 "Null byte detected in path".to_string(),
223 ));
224 }
225
226 let decoded = self.safe_url_decode(input)?;
228
229 if decoded
231 .chars()
232 .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
233 {
234 return Err(BuildError::InputSanitization(
235 "Control characters detected in path".to_string(),
236 ));
237 }
238
239 let normalized = self.normalize_unicode(&decoded)?;
241
242 Ok(normalized)
243 }
244
245 fn safe_url_decode(&self, input: &str) -> Result<String, BuildError> {
247 let first_decode = urlencoding::decode(input)
248 .map_err(|e| BuildError::InputSanitization(format!("URL decode error: {}", e)))?;
249
250 let second_decode = urlencoding::decode(&first_decode);
252 if second_decode.is_ok() && second_decode.as_ref().unwrap() != &first_decode {
253 return Err(BuildError::InputSanitization(
254 "Double URL encoding detected (potential attack)".to_string(),
255 ));
256 }
257
258 Ok(first_decode.into_owned())
259 }
260
261 fn normalize_unicode(&self, input: &str) -> Result<String, BuildError> {
263 use unicode_normalization::UnicodeNormalization;
264
265 let nfc = input.nfc().collect::<String>();
266 let nfd = input.nfd().collect::<String>();
267 let nfkc = input.nfkc().collect::<String>();
268 let nfkd = input.nfkd().collect::<String>();
269
270 let forms_identical = nfc == nfd && nfd == nfkc && nfkc == nfkd;
273
274 if !forms_identical {
277 let forms = [&nfc, &nfd, &nfkc, &nfkd];
278 let mut dangerous_forms = Vec::new();
279
280 for (i, form) in forms.iter().enumerate() {
281 if DIRECTORY_TRAVERSAL.is_match(form)
282 || ENCODED_TRAVERSAL.is_match(form)
283 || ABSOLUTE_PATH.is_match(form)
284 || DANGEROUS_CHARS.is_match(form)
285 {
286 dangerous_forms.push(match i {
287 0 => "NFC",
288 1 => "NFD",
289 2 => "NFKC",
290 3 => "NFKD",
291 _ => unreachable!(),
292 });
293 }
294 }
295
296 if !dangerous_forms.is_empty() {
297 return Err(BuildError::InputSanitization(format!(
298 "Unicode normalization attack detected in forms: {:?}",
299 dangerous_forms
300 )));
301 }
302 }
303
304 Ok(nfc)
306 }
307
308 fn detect_dangerous_patterns(&self, path: &str) -> Result<(), BuildError> {
310 if DIRECTORY_TRAVERSAL.is_match(path) {
312 return Err(BuildError::InputSanitization(
313 "Directory traversal pattern detected".to_string(),
314 ));
315 }
316
317 if ENCODED_TRAVERSAL.is_match(path) {
319 return Err(BuildError::InputSanitization(
320 "Encoded path traversal detected".to_string(),
321 ));
322 }
323
324 if ABSOLUTE_PATH.is_match(path) {
326 return Err(BuildError::InputSanitization(
327 "Absolute path not allowed".to_string(),
328 ));
329 }
330
331 if DANGEROUS_CHARS.is_match(path) {
333 return Err(BuildError::InputSanitization(
334 "Dangerous characters detected".to_string(),
335 ));
336 }
337
338 if let Some(filename) = Path::new(path).file_name().and_then(|s| s.to_str()) {
340 if WINDOWS_RESERVED.is_match(filename) {
341 return Err(BuildError::InputSanitization(
342 "Windows reserved filename detected".to_string(),
343 ));
344 }
345
346 let filename_upper = filename.to_uppercase();
348 let base_name = filename_upper.split('.').next().unwrap_or(&filename_upper);
349 if WINDOWS_RESERVED_NAMES.contains(base_name) {
350 return Err(BuildError::InputSanitization(
351 "Windows reserved filename detected".to_string(),
352 ));
353 }
354 }
355
356 Ok(())
357 }
358
359 fn normalize_path(&self, path: &str) -> Result<PathBuf, BuildError> {
361 let normalized_str = path.replace('\\', "/");
363
364 let components: Vec<&str> = normalized_str
366 .split('/')
367 .filter(|c| !c.is_empty() && *c != ".")
368 .collect();
369
370 if components.len() > self.config.max_path_depth {
372 return Err(BuildError::InputSanitization(format!(
373 "Path too deep: {} > {}",
374 components.len(),
375 self.config.max_path_depth
376 )));
377 }
378
379 let mut normalized = PathBuf::new();
381 for component in components {
382 if component == ".." {
384 return Err(BuildError::InputSanitization(
385 "Path traversal (..) detected".to_string(),
386 ));
387 }
388
389 normalized.push(component);
390 }
391
392 Ok(normalized)
393 }
394
395 fn validate_components(&self, path: &Path) -> Result<(), BuildError> {
397 for component in path.components() {
398 match component {
399 Component::Normal(name) => {
400 let name_str = name.to_string_lossy();
401
402 if !self.config.allow_hidden && name_str.starts_with('.') && name_str != "." {
405 return Err(BuildError::InputSanitization(
406 "Hidden files/directories not allowed".to_string(),
407 ));
408 }
409
410 if name_str.len() > 255 {
412 return Err(BuildError::InputSanitization(
413 "Path component too long".to_string(),
414 ));
415 }
416
417 if name_str.chars().any(|c| r#"<>:"|?*"#.contains(c)) {
419 return Err(BuildError::InputSanitization(
420 "Dangerous characters in path component".to_string(),
421 ));
422 }
423 }
424 Component::ParentDir => {
425 return Err(BuildError::InputSanitization(
426 "Parent directory traversal detected".to_string(),
427 ));
428 }
429 Component::RootDir => {
430 return Err(BuildError::InputSanitization(
431 "Root directory access not allowed".to_string(),
432 ));
433 }
434 Component::Prefix(_) => {
435 return Err(BuildError::InputSanitization(
436 "Windows path prefix not allowed".to_string(),
437 ));
438 }
439 Component::CurDir => {
440 }
442 }
443 }
444
445 Ok(())
446 }
447
448 fn validate_against_whitelist(&self, path: &Path) -> Result<(), BuildError> {
450 if self.config.allow_relative_outside_base && path.is_relative() {
451 return Ok(()); }
453
454 for base_dir in &self.config.allowed_base_dirs {
456 if path.starts_with(base_dir) || path == base_dir {
457 return Ok(());
458 }
459
460 if base_dir == Path::new(".")
463 && (path.parent().is_none() || path.parent() == Some(Path::new("")))
464 {
465 return Ok(());
466 }
467
468 if let Ok(canonical_base) = base_dir.canonicalize() {
470 if let Ok(canonical_path) = path.canonicalize() {
471 if canonical_path.starts_with(canonical_base) {
472 return Ok(());
473 }
474 }
475 }
476 }
477
478 Err(BuildError::InputSanitization(
479 "Path not within allowed directories".to_string(),
480 ))
481 }
482
483 fn safe_canonicalize(&self, path: &Path) -> (Option<PathBuf>, bool) {
485 let exists = path.exists();
486
487 if exists {
489 match path.canonicalize() {
490 Ok(canonical) => (Some(canonical), true),
491 Err(_) => (None, exists),
492 }
493 } else {
494 if let Some(parent) = path.parent() {
496 if parent.exists() {
497 match parent.canonicalize() {
498 Ok(canonical_parent) => {
499 if let Some(filename) = path.file_name() {
500 let canonical = canonical_parent.join(filename);
501 (Some(canonical), false)
502 } else {
503 (None, false)
504 }
505 }
506 Err(_) => (None, false),
507 }
508 } else {
509 (None, false)
510 }
511 } else {
512 (None, false)
513 }
514 }
515 }
516
517 fn validate_symlinks(
519 &self,
520 normalized: &Path,
521 canonical: &Option<PathBuf>,
522 ) -> Result<(), BuildError> {
523 if let Some(canonical_path) = canonical {
524 if normalized != canonical_path {
527 self.validate_against_whitelist(canonical_path)?;
529
530 if let Some(target_str) = canonical_path.to_str() {
532 if DIRECTORY_TRAVERSAL.is_match(target_str)
533 || ENCODED_TRAVERSAL.is_match(target_str)
534 || ABSOLUTE_PATH.is_match(target_str)
535 || DANGEROUS_CHARS.is_match(target_str)
536 {
537 return Err(BuildError::InputSanitization(
538 "Symlink target contains dangerous patterns".to_string(),
539 ));
540 }
541 }
542
543 if let Ok(metadata) = fs::symlink_metadata(normalized) {
545 if metadata.file_type().is_symlink() {
546 let mut visited = HashSet::new();
548 let mut current = normalized.to_path_buf();
549
550 while current.is_symlink() && visited.len() < 32 {
551 if visited.contains(¤t) {
552 return Err(BuildError::InputSanitization(
553 "Symlink loop detected".to_string(),
554 ));
555 }
556 visited.insert(current.clone());
557
558 match fs::read_link(¤t) {
559 Ok(target) => {
560 current = if target.is_absolute() {
561 target
562 } else {
563 current
564 .parent()
565 .unwrap_or_else(|| Path::new("."))
566 .join(target)
567 };
568 }
569 Err(_) => break,
570 }
571 }
572
573 if visited.len() >= 32 {
574 return Err(BuildError::InputSanitization(
575 "Symlink chain too long (potential loop)".to_string(),
576 ));
577 }
578 }
579 }
580 }
581 }
582
583 Ok(())
584 }
585
586 fn collect_warnings(&self, input: &str, normalized: &Path) -> Vec<String> {
588 let mut warnings = Vec::new();
589
590 if input.chars().any(|c| !c.is_ascii()) {
592 warnings.push("Path contains non-ASCII characters".to_string());
593 }
594
595 if let Some(filename) = normalized.file_name().and_then(|s| s.to_str()) {
597 if filename.len() > 100 {
598 warnings.push("Very long filename".to_string());
599 }
600 }
601
602 if normalized.components().count() > 8 {
604 warnings.push("Deeply nested path".to_string());
605 }
606
607 if let Some(extension) = normalized.extension().and_then(|s| s.to_str()) {
609 if !self
610 .config
611 .allowed_extensions
612 .contains(&extension.to_lowercase())
613 {
614 warnings.push(format!("Unusual file extension: {}", extension));
615 }
616 }
617
618 if let Some(filename) = normalized.file_name().and_then(|s| s.to_str()) {
620 if SUSPICIOUS_EXTENSIONS.is_match(filename) {
621 warnings.push("Suspicious file extension detected".to_string());
622 }
623 }
624
625 warnings
626 }
627
628 pub fn config(&self) -> &PathValidationConfig {
630 &self.config
631 }
632
633 pub fn update_config(&mut self, config: PathValidationConfig) {
635 self.config = config;
636 }
637}
638
639impl Default for PathValidator {
640 fn default() -> Self {
641 Self::new()
642 }
643}
644
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Relative paths under allowed directories pass; traversal and absolute
    /// paths are refused.
    #[test]
    fn test_basic_path_validation() {
        let validator = PathValidator::new();

        for accepted in ["data/file.xml", "input/subdir/file.json", "./file.txt"].iter() {
            assert!(validator.validate(accepted).is_ok());
        }

        for rejected in ["../etc/passwd", "/etc/passwd", "C:\\Windows\\System32"].iter() {
            assert!(validator.validate(rejected).is_err());
        }
    }

    /// A mix of traversal, absolute, UNC, NUL and encoded inputs is refused.
    #[test]
    fn test_dangerous_patterns() {
        let validator = PathValidator::new();

        let dangerous_paths = [
            "../../../etc/passwd",
            "..\\..\\..\\windows\\system32\\config\\sam",
            "/etc/passwd",
            "/proc/self/environ",
            "C:\\Windows\\System32",
            "\\\\server\\share",
            "file%00.txt",
            "%2e%2e%2fpasswd",
            "%252e%252e%252fpasswd",
        ];

        for path in dangerous_paths.iter() {
            let outcome = validator.validate(path);
            assert!(outcome.is_err(), "Should reject dangerous path: {}", path);
        }
    }

    /// Percent-encoded traversal and NUL sequences are refused.
    #[test]
    fn test_url_encoding_attacks() {
        let validator = PathValidator::new();

        let encoded_attacks = [
            "%2e%2e%2f",
            "%2e%2e%5c",
            "%252e%252e%252f",
            "..%2f",
            "..%00",
        ];

        for attack in encoded_attacks.iter() {
            assert!(
                validator.validate(attack).is_err(),
                "Should block encoded attack: {}",
                attack
            );
        }
    }

    /// Reserved device names are refused with and without an extension.
    #[test]
    fn test_windows_reserved_names() {
        let validator = PathValidator::new();

        let reserved_names = [
            "CON", "PRN", "AUX", "NUL", "COM1", "COM2", "LPT1", "LPT2", "con.txt", "prn.xml",
            "aux.json",
        ];

        for name in reserved_names.iter() {
            assert!(
                validator.validate(name).is_err(),
                "Should block reserved name: {}",
                name
            );
        }
    }

    /// Duplicate separators, backslashes and `.` segments are normalized away.
    #[test]
    fn test_path_normalization() {
        let validator = PathValidator::new();

        let collapsed = validator.validate("data//file.xml").unwrap();
        assert_eq!(collapsed.normalized, Path::new("data/file.xml"));

        let backslashed = validator.validate("data\\subdir\\file.json").unwrap();
        assert_eq!(backslashed.normalized, Path::new("data/subdir/file.json"));

        let dotted = validator.validate("./data/./file.txt").unwrap();
        assert_eq!(dotted.normalized, Path::new("data/file.txt"));
    }

    /// Only paths under the configured base directory are accepted.
    #[test]
    fn test_whitelist_validation() {
        let config = PathValidationConfig {
            allowed_base_dirs: vec![PathBuf::from("allowed")],
            allow_relative_outside_base: false,
            ..PathValidationConfig::default()
        };

        let validator = PathValidator::with_config(config);

        assert!(validator.validate("allowed/file.xml").is_ok());
        assert!(validator.validate("disallowed/file.xml").is_err());
    }

    /// Ordinary accented characters are accepted.
    #[test]
    fn test_unicode_normalization() {
        let validator = PathValidator::new();

        assert!(validator.validate("data/résumé.txt").is_ok());
    }

    /// Paths exceeding the configured length or depth are refused.
    #[test]
    fn test_length_limits() {
        let config = PathValidationConfig {
            max_path_length: 50,
            max_path_depth: 3,
            ..PathValidationConfig::default()
        };

        let validator = PathValidator::with_config(config);

        let long_path = "a/".repeat(30);
        assert!(validator.validate(&long_path).is_err());

        let deep_path = "a/b/c/d/e/f/g.txt";
        assert!(validator.validate(deep_path).is_err());
    }

    /// Expected extensions produce no warnings; others produce one that
    /// mentions the extension.
    #[test]
    fn test_file_extensions() {
        let config = PathValidationConfig {
            allowed_extensions: ["xml", "json"]
                .iter()
                .map(|ext| (*ext).to_string())
                .collect(),
            ..PathValidationConfig::default()
        };

        let validator = PathValidator::with_config(config);

        let expected = validator.validate("data/file.xml").unwrap();
        assert!(expected.warnings.is_empty());

        let unusual = validator.validate("data/file.exe").unwrap();
        assert!(unusual.warnings.iter().any(|w| w.contains("extension")));
    }
}