path_security/
encoding.rs

1//! Encoding detection and normalization for path security
2
3use anyhow::{bail, Result};
4
5use crate::constants::*;
6
7/// Detect URL-encoded path traversal patterns
8/// Checks for: %2e, %2f, %5c and their uppercase variants
9pub fn detect_url_encoding(path: &str) -> Result<()> {
10    for pattern in SUSPICIOUS_ENCODED_PATTERNS.iter() {
11        if path.contains(pattern) {
12            bail!("URL-encoded characters detected in path: {}", pattern);
13        }
14    }
15    
16    // Check for double URL encoding (%25 = %)
17    if path.contains("%25") {
18        bail!("Double URL encoding detected in path");
19    }
20    
21    Ok(())
22}
23
24/// Detect UTF-8 overlong encoding attacks
25/// Overlong encodings like %c0%ae for "." are invalid but sometimes parsed
26pub fn detect_overlong_utf8(path: &str) -> Result<()> {
27    let path_lower = path.to_lowercase();
28    for pattern in OVERLONG_UTF8_PATTERNS.iter() {
29        if path_lower.contains(pattern) {
30            bail!("UTF-8 overlong encoding detected: {}", pattern);
31        }
32    }
33    
34    Ok(())
35}
36
37/// Detect Unicode encoding tricks
38pub fn detect_unicode_encoding(path: &str) -> Result<()> {
39    // Check for %u encoding (non-standard but sometimes accepted)
40    if path.contains("%u") {
41        bail!("Unicode percent encoding (%u) detected in path");
42    }
43    
44    // Check for HTML entity encoding
45    if path.contains("&#") {
46        bail!("HTML entity encoding detected in path");
47    }
48    
49    Ok(())
50}
51
52/// Detect dangerous Unicode characters
53pub fn detect_dangerous_unicode(path: &str) -> Result<()> {
54    for ch in path.chars() {
55        match ch {
56            // Zero-width characters
57            '\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{FEFF}' => {
58                bail!("Zero-width Unicode character detected in path");
59            }
60            // Right-to-left override
61            '\u{202E}' => {
62                bail!("Right-to-left override character detected in path");
63            }
64            // Unicode homoglyphs for dots and slashes
65            '\u{2024}' | '\u{2025}' | '\u{2026}' => {
66                bail!("Unicode dot homoglyph detected in path");
67            }
68            // Unicode slash homoglyphs (forward slash variants)
69            // U+2044 (⁄), U+2215 (∕), U+2571 (╱), U+29F8, U+FF0F (/)
70            '\u{2044}' | '\u{2215}' | '\u{2571}' | '\u{29F8}' | '\u{FF0F}' => {
71                bail!("Unicode slash homoglyph detected in path");
72            }
73            // Unicode backslash homoglyphs
74            // U+2216 (∖), U+FF3C (\)
75            '\u{2216}' | '\u{FF3C}' => {
76                bail!("Unicode backslash homoglyph detected in path");
77            }
78            // Code page specific homoglyphs that map to path separators
79            // U+00A5 (¥) - maps to \ in CP932 (Japanese)
80            // U+20A9 (₩) - maps to \ in CP949 (Korean) and CP1361
81            // U+00B4 (´) - maps to / in CP1253 (Greek)
82            '\u{00A5}' | '\u{20A9}' | '\u{00B4}' => {
83                bail!("Code page specific path separator homoglyph detected in path");
84            }
85            // Full-width characters
86            '\u{FF01}'..='\u{FF5E}' => {
87                bail!("Full-width Unicode character detected in path");
88            }
89            // Wildcard characters that could be misused
90            '?' | '*' => {
91                bail!("Wildcard character detected in path: {}", ch);
92            }
93            _ => {}
94        }
95    }
96    
97    Ok(())
98}
99
/// Detect mixed encoding attacks (UTF-8 + UTF-16).
///
/// Returns `true` when `path` shows any of these signs:
///
/// * it starts with U+FEFF (byte order mark) or U+FFFE (the byte-swapped
///   mark);
/// * it contains the character-reference prefix `&#`, which also covers the
///   hexadecimal form `&#x`;
/// * it is at least four bytes long and NUL bytes recur on alternating
///   offsets: odd offsets (the layout of ASCII text stored as UTF-16LE) or
///   even offsets (the layout of ASCII text stored as UTF-16BE).
///
/// Returns `false` otherwise. Inputs shorter than four bytes are never
/// flagged by the NUL heuristic.
pub fn detect_mixed_encoding(path: &str) -> bool {
    // Byte order marks at the very start of the string.
    if path.starts_with('\u{FEFF}') || path.starts_with('\u{FFFE}') {
        return true;
    }

    // HTML/XML numeric character references mixed into the path.
    if path.contains("&#") {
        return true;
    }

    let bytes = path.as_bytes();
    if bytes.len() < 4 {
        return false;
    }

    // The input holds len / 2 two-byte units; flag it once more than a
    // quarter of them (len / 8) carry a NUL in the same half of the unit.
    let threshold = bytes.len() / 8;
    let nulls_from = |start: usize| {
        bytes
            .iter()
            .skip(start)
            .step_by(2)
            .filter(|&&byte| byte == 0)
            .count()
    };

    // Offset 1 covers the little-endian layout, offset 0 the big-endian one.
    nulls_from(1) > threshold || nulls_from(0) > threshold
}
129
130/// Normalize path to detect hidden traversal attempts
131pub fn normalize_and_check(path: &str) -> Result<String> {
132    let mut normalized = path.to_string();
133    
134    // Trim leading and trailing whitespace
135    normalized = normalized.trim().to_string();
136    
137    // Check if whitespace was present (could be evasion)
138    if normalized != path {
139        bail!("Leading or trailing whitespace detected in path");
140    }
141    
142    // Check for internal excessive whitespace
143    if normalized.contains("  ") {
144        bail!("Multiple consecutive spaces detected in path");
145    }
146    
147    Ok(normalized)
148}