soft_canonicalize/lib.rs
1//! # soft-canonicalize
2//!
3//! A pure Rust library for path canonicalization that works with non-existing paths.
4//!
5//! Unlike `std::fs::canonicalize()`, this library can resolve and normalize paths
6//! even when some or all of the path components don't exist on the filesystem.
7//! This is useful for security validation, path preprocessing, and working with
8//! paths before creating files.
9//!
10//! ## Features
11//!
12//! - **Works with non-existing paths**: Canonicalizes paths even when they don't exist
13//! - **Cross-platform**: Supports Windows, macOS, and Linux
14//! - **Zero dependencies**: No external dependencies beyond std
15//! - **Security focused**: Proper handling of `..` components and symlinks
16//! - **Pure algorithm**: No filesystem modification during canonicalization
17//!
18//! ## Example
19//!
20//! ```rust
21//! use soft_canonicalize::soft_canonicalize;
22//! use std::path::Path;
23//!
24//! # fn example() -> std::io::Result<()> {
25//! // Works with string paths (like std::fs::canonicalize)
26//! let from_str = soft_canonicalize("some/path/file.txt")?;
27//!
28//! // Works with existing paths (same as std::fs::canonicalize)
29//! let existing = soft_canonicalize(&std::env::temp_dir())?;
30//!
31//! // Also works with non-existing paths
32//! let non_existing = soft_canonicalize(
33//! std::env::temp_dir().join("some/deep/non/existing/path.txt")
34//! )?;
35//!
36//! // Resolves .. components logically
37//! let traversal = soft_canonicalize("some/path/../other/file.txt")?;
38//! # Ok(())
39//! # }
40//! ```
41//!
42//! ## Security
43//!
44//! This library is designed with security in mind:
45//!
46//! - Properly handles directory traversal (`..`) components
47//! - Resolves symlinks when they exist
48//! - Normalizes path separators and case (on case-insensitive filesystems)
49//! - Does not create or modify filesystem entries during canonicalization
50//!
51//! ## Algorithm
52//!
53//! The soft canonicalization algorithm works by:
54//!
55//! 1. Converting relative paths to absolute paths
56//! 2. Logically processing `..` components to resolve traversals
57//! 3. Finding the longest existing ancestor directory
58//! 4. Canonicalizing the existing portion using `std::fs::canonicalize`
59//! 5. Appending the non-existing components to the canonicalized base
60//!
61//! This approach provides the security benefits of full canonicalization while
62//! supporting paths that don't exist yet.
63
64use std::collections::HashSet;
65use std::path::{Path, PathBuf};
66use std::{fs, io};
67
/// Upper bound on how many symlinks are followed while resolving one path.
///
/// The value is chosen per target platform to mirror the limits that the
/// operating system (and therefore `std::fs::canonicalize`) typically applies:
/// - Windows: 63
/// - Everything else (Linux `ELOOP` is typically 40, other Unix systems
///   usually 32-40): 40
pub const MAX_SYMLINK_DEPTH: usize = {
    #[cfg(target_os = "windows")]
    const LIMIT: usize = 63;
    #[cfg(not(target_os = "windows"))]
    const LIMIT: usize = 40;
    LIMIT
};
74
75/// Internal helper function that finds the boundary between existing and non-existing path components.
76///
77/// Returns (existing_prefix, non_existing_suffix) where existing_prefix is the longest
78/// existing directory path, and non_existing_suffix contains the remaining components.
79/// This version properly handles symlinks by processing components incrementally.
80fn find_existing_boundary_with_symlinks(
81 path: &Path,
82 visited: &mut HashSet<PathBuf>,
83 symlink_depth: usize,
84) -> io::Result<(PathBuf, Vec<std::ffi::OsString>)> {
85 // Check symlink depth limit to match std::fs::canonicalize behavior
86 if symlink_depth > MAX_SYMLINK_DEPTH {
87 return Err(io::Error::new(
88 io::ErrorKind::InvalidInput,
89 "Too many levels of symbolic links",
90 ));
91 }
92
93 // Convert to absolute path first
94 let absolute_path = if path.is_absolute() {
95 path.to_path_buf()
96 } else {
97 std::env::current_dir()?.join(path)
98 };
99
100 // First, do lexical resolution of .. and . components
101 let mut resolved_components = Vec::new();
102 let mut result = PathBuf::new();
103
104 // Collect root components (Prefix, RootDir)
105 for component in absolute_path.components() {
106 match component {
107 std::path::Component::RootDir | std::path::Component::Prefix(_) => {
108 result.push(component.as_os_str());
109 }
110 std::path::Component::Normal(name) => {
111 resolved_components.push(name.to_os_string());
112 }
113 std::path::Component::ParentDir => {
114 // Handle .. by removing the last component if possible
115 if !resolved_components.is_empty() {
116 resolved_components.pop();
117 }
118 // If at root level, .. is ignored (cannot go above root)
119 }
120 std::path::Component::CurDir => {
121 // Ignore . components
122 }
123 }
124 }
125
126 // Now build path incrementally, handling symlinks as we go
127 let mut current_path = result;
128 let mut remaining_components = resolved_components.clone();
129
130 for (i, component) in resolved_components.iter().enumerate() {
131 let test_path = current_path.join(component);
132
133 if test_path.exists() {
134 // Check if this is a symlink
135 if test_path.is_symlink() {
136 // Check for symlink cycle
137 if visited.contains(&test_path) {
138 return Err(io::Error::new(
139 io::ErrorKind::InvalidInput,
140 "Too many levels of symbolic links",
141 ));
142 }
143
144 match fs::read_link(&test_path) {
145 Ok(target) => {
146 // Add this symlink to visited set
147 visited.insert(test_path.clone());
148
149 // Resolve the target path
150 let resolved_target = if target.is_absolute() {
151 target
152 } else {
153 current_path.join(target)
154 };
155
156 // Append remaining components to the target
157 let mut full_target = resolved_target;
158 for remaining in &resolved_components[i + 1..] {
159 full_target.push(remaining);
160 }
161
162 // Recursively process the target
163 let (symlink_prefix, symlink_suffix) =
164 find_existing_boundary_with_symlinks(
165 &full_target,
166 visited,
167 symlink_depth + 1,
168 )?;
169
170 // Remove from visited set
171 visited.remove(&test_path);
172
173 return Ok((symlink_prefix, symlink_suffix));
174 }
175 Err(_) => {
176 // Broken symlink - we still need to resolve it lexically
177 // Continue processing as if it doesn't exist, but we'll handle the
178 // symlink target resolution in the calling function
179 remaining_components = resolved_components[i..].to_vec();
180 break;
181 }
182 }
183 } else {
184 // Regular file/directory that exists
185 current_path = test_path;
186 remaining_components = resolved_components[i + 1..].to_vec();
187 }
188 } else {
189 // Found the boundary - everything from this component onwards doesn't exist
190 remaining_components = resolved_components[i..].to_vec();
191 break;
192 }
193 }
194
195 Ok((current_path, remaining_components))
196}
197
198/// Internal helper function that performs soft canonicalization.
199///
200/// This optimized version finds the existing/non-existing boundary and uses std::fs::canonicalize
201/// only on the existing portion for maximum efficiency while maintaining security and symlink handling.
202fn soft_canonicalize_internal(
203 path: &Path,
204 visited: &mut HashSet<PathBuf>,
205 symlink_depth: usize,
206) -> io::Result<PathBuf> {
207 // Check symlink depth limit to match std::fs::canonicalize behavior
208 if symlink_depth > MAX_SYMLINK_DEPTH {
209 return Err(io::Error::new(
210 io::ErrorKind::InvalidInput,
211 "Too many levels of symbolic links",
212 ));
213 }
214
215 // Handle empty path like std::fs::canonicalize - should fail
216 if path.as_os_str().is_empty() {
217 return Err(io::Error::new(
218 io::ErrorKind::NotFound,
219 "The system cannot find the path specified.",
220 ));
221 }
222
223 // Special handling for broken symlinks
224 if path.is_symlink() {
225 // Check for symlink cycle first
226 if visited.contains(path) {
227 return Err(io::Error::new(
228 io::ErrorKind::InvalidInput,
229 "Too many levels of symbolic links",
230 ));
231 }
232
233 // Check if we can canonicalize it (i.e., if the target exists)
234 match fs::canonicalize(path) {
235 Ok(canonical) => return Ok(canonical),
236 Err(_) => {
237 // It's a broken symlink - resolve it manually
238 let target = fs::read_link(path)?;
239 let resolved_target = if target.is_absolute() {
240 target
241 } else {
242 path.parent().unwrap_or(Path::new("/")).join(target)
243 };
244
245 // Add this symlink to visited set before recursing
246 visited.insert(path.to_path_buf());
247
248 // Recursively canonicalize the target (which may not exist)
249 let result =
250 soft_canonicalize_internal(&resolved_target, visited, symlink_depth + 1);
251
252 // Remove from visited set after recursion
253 visited.remove(path);
254
255 return result;
256 }
257 }
258 }
259
260 // Find the boundary between existing and non-existing components
261 let (existing_prefix, non_existing_suffix) =
262 find_existing_boundary_with_symlinks(path, visited, symlink_depth)?;
263
264 // Canonicalize the existing prefix (this handles all symlinks in the existing portion)
265 let canonical_prefix = if existing_prefix.as_os_str().is_empty()
266 || existing_prefix == Path::new("/")
267 || existing_prefix.parent().is_none()
268 {
269 // Handle root paths - they're already canonical
270 existing_prefix
271 } else {
272 // Use std::fs::canonicalize for existing paths - this is secure and handles all symlinks
273 fs::canonicalize(&existing_prefix)?
274 };
275
276 // Append the non-existing components lexically (no symlinks possible in non-existing paths)
277 let mut result = canonical_prefix;
278 for component in non_existing_suffix {
279 result.push(component);
280 }
281
282 Ok(result)
283}
284
285/// Performs "soft" canonicalization on a path.
286///
287/// Unlike `std::fs::canonicalize()`, this function works with non-existent paths by:
288/// 1. Finding the deepest existing ancestor directory
289/// 2. Canonicalizing that existing part (resolving symlinks, normalizing case, etc.)
290/// 3. Appending the non-existing path components to the canonicalized base
291///
292/// This provides the security benefits of canonicalization (symlink resolution,
293/// path normalization) without requiring the entire path to exist.
294///
295/// # Algorithm Details
296///
297/// The function performs the following steps:
298///
299/// 1. **Absolute Path Conversion**: Converts relative paths to absolute paths
300/// 2. **Logical Processing**: Processes `..` components mathematically without filesystem access
301/// 3. **Symlink Cycle Detection**: Tracks visited symlinks to prevent infinite recursion
302/// 4. **Existing Prefix Discovery**: Finds the longest existing ancestor
303/// 5. **Canonicalization**: Uses `std::fs::canonicalize` on the existing portion
304/// 6. **Reconstruction**: Appends non-existing components to the canonical base
305///
306/// # Security Considerations
307///
308/// - **Directory Traversal**: `..` components are resolved logically before filesystem access
309/// - **Symlink Resolution**: Existing symlinks are resolved with proper cycle detection
310/// - **No Side Effects**: No temporary files or directories are created during the process
311///
312/// # Cross-Platform Support
313///
314/// This function works correctly on:
315/// - **Windows**: Handles drive letters, UNC paths, and case normalization
316/// - **Unix-like systems**: Handles absolute paths starting with `/`
317/// - **All platforms**: Proper handling of path separators and components
318///
319/// # Examples
320///
321/// ## Basic Usage
322///
323/// ```rust
324/// use soft_canonicalize::soft_canonicalize;
325/// use std::path::{Path, PathBuf};
326///
327/// # fn example() -> std::io::Result<()> {
328/// // Works with &str (like std::fs::canonicalize)
329/// let from_str = soft_canonicalize("some/path/file.txt")?;
330///
331/// // Works with &Path
332/// let from_path = soft_canonicalize(Path::new("some/path/file.txt"))?;
333///
334/// // Works with &PathBuf
335/// let path_buf = PathBuf::from("some/path/file.txt");
336/// let from_pathbuf = soft_canonicalize(&path_buf)?;
337///
338/// // Works with existing paths (same as std::fs::canonicalize)
339/// let existing = soft_canonicalize(&std::env::temp_dir())?;
340/// println!("Existing path: {:?}", existing);
341///
342/// // Also works with non-existing paths
343/// let non_existing = soft_canonicalize(
344/// std::env::temp_dir().join("some/deep/non/existing/path.txt")
345/// )?;
346/// println!("Non-existing path: {:?}", non_existing);
347/// # Ok(())
348/// # }
349/// ```
350///
351/// ## Directory Traversal Handling
352///
353/// ```rust
354/// use soft_canonicalize::soft_canonicalize;
355/// use std::path::Path;
356///
357/// # fn example() -> std::io::Result<()> {
358/// // Resolves .. components logically
359/// let traversal = soft_canonicalize(
360/// Path::new("some/path/../other/file.txt")
361/// )?;
362/// // Result: /current/working/dir/some/other/file.txt
363///
364/// // Works with complex traversal patterns
365/// let complex = soft_canonicalize(
366/// Path::new("deep/nested/path/../../final/file.txt")
367/// )?;
368/// // Result: /current/working/dir/deep/final/file.txt
369/// # Ok(())
370/// # }
371/// ```
372///
373/// # Errors
374///
375/// Returns an `io::Error` in the following cases:
376/// - **Permission Denied**: When the current directory cannot be accessed (for relative paths)
377/// - **Invalid Path**: When the path contains invalid Unicode or system-specific issues
378/// - **Canonicalization Failure**: When the existing portion cannot be canonicalized
379/// - **Symlink Cycles**: When circular symlink references are detected
380///
381/// Note: This function does NOT return an error for non-existent paths, as supporting
382/// such paths is the primary purpose of soft canonicalization.
383///
384/// # Performance
385///
386/// - **Time Complexity**: O(n) where n is the number of path components
387/// - **Space Complexity**: O(n) for component storage during processing
388/// - **Filesystem Access**: Minimal - only to find existing ancestors and canonicalize them
389///
390pub fn soft_canonicalize(path: impl AsRef<Path>) -> io::Result<PathBuf> {
391 let path = path.as_ref();
392 let mut visited = HashSet::new();
393 soft_canonicalize_internal(path, &mut visited, 0)
394}
395
// Unit-test suite, compiled only for test builds.
//
// Each `mod` below is an out-of-line module; rustc resolves it to
// `tests/<name>.rs` (or `tests/<name>/mod.rs`) relative to this file's
// directory. The module contents are not visible here; the names suggest one
// module per test theme.
#[cfg(test)]
mod tests {
    mod api_compatibility;
    mod basic_functionality;
    mod edge_cases;
    mod optimization;
    mod path_traversal;
    mod platform_specific;
    mod python_inspired_tests;
    mod python_lessons;
    mod security;
    mod std_behavior;
    mod symlink_depth;
}