soft_canonicalize/
lib.rs

1//! # soft-canonicalize
2//!
3//! A pure Rust library for path canonicalization that works with non-existing paths.
4//!
5//! Unlike `std::fs::canonicalize()`, this library can resolve and normalize paths
6//! even when some or all of the path components don't exist on the filesystem.
7//! This is useful for security validation, path preprocessing, and working with
8//! paths before creating files.
9//!
10//! ## Features
11//!
12//! - **Works with non-existing paths**: Canonicalizes paths even when they don't exist
13//! - **Cross-platform**: Supports Windows, macOS, and Linux
14//! - **Zero dependencies**: No external dependencies beyond std
15//! - **Security focused**: Proper handling of `..` components and symlinks
16//! - **Pure algorithm**: No filesystem modification during canonicalization
17//!
18//! ## Example
19//!
20//! ```rust
21//! use soft_canonicalize::soft_canonicalize;
22//! use std::path::Path;
23//!
24//! # fn example() -> std::io::Result<()> {
25//! // Works with string paths (like std::fs::canonicalize)
26//! let from_str = soft_canonicalize("some/path/file.txt")?;
27//!
28//! // Works with existing paths (same as std::fs::canonicalize)
29//! let existing = soft_canonicalize(&std::env::temp_dir())?;
30//!
31//! // Also works with non-existing paths
32//! let non_existing = soft_canonicalize(
33//!     std::env::temp_dir().join("some/deep/non/existing/path.txt")
34//! )?;
35//!
36//! // Resolves .. components logically
37//! let traversal = soft_canonicalize("some/path/../other/file.txt")?;
38//! # Ok(())
39//! # }
40//! ```
41//!
42//! ## Security
43//!
44//! This library is designed with security in mind:
45//!
46//! - Properly handles directory traversal (`..`) components
47//! - Resolves symlinks when they exist
48//! - Normalizes path separators and case (on case-insensitive filesystems)
49//! - Does not create or modify filesystem entries during canonicalization
50//!
51//! ## Algorithm
52//!
53//! The soft canonicalization algorithm works by:
54//!
55//! 1. Converting relative paths to absolute paths
56//! 2. Logically processing `..` components to resolve traversals
57//! 3. Finding the longest existing ancestor directory
58//! 4. Canonicalizing the existing portion using `std::fs::canonicalize`
59//! 5. Appending the non-existing components to the canonicalized base
60//!
61//! This approach provides the security benefits of full canonicalization while
62//! supporting paths that don't exist yet.
63
64use std::collections::HashSet;
65use std::path::{Path, PathBuf};
66use std::{fs, io};
67
/// Upper bound on the number of symbolic links followed while resolving one path.
///
/// Resolution gives up with an error once this many links have been chased,
/// mirroring the limits operating systems typically enforce:
/// - Windows: around 63
/// - Linux: `ELOOP` is typically raised after 40
/// - Other Unix systems: usually 32-40
///
/// This is the Windows value.
#[cfg(target_os = "windows")]
pub const MAX_SYMLINK_DEPTH: usize = 63;

/// Upper bound on the number of symbolic links followed while resolving one path.
///
/// Resolution gives up with an error once this many links have been chased,
/// mirroring the limits operating systems typically enforce:
/// - Windows: around 63
/// - Linux: `ELOOP` is typically raised after 40
/// - Other Unix systems: usually 32-40
///
/// This is the value used on every non-Windows target.
#[cfg(not(target_os = "windows"))]
pub const MAX_SYMLINK_DEPTH: usize = 40;
74
75/// Internal helper function that finds the boundary between existing and non-existing path components.
76///
77/// Returns (existing_prefix, non_existing_suffix) where existing_prefix is the longest
78/// existing directory path, and non_existing_suffix contains the remaining components.
79/// This version properly handles symlinks by processing components incrementally.
80fn find_existing_boundary_with_symlinks(
81    path: &Path,
82    visited: &mut HashSet<PathBuf>,
83    symlink_depth: usize,
84) -> io::Result<(PathBuf, Vec<std::ffi::OsString>)> {
85    // Check symlink depth limit to match std::fs::canonicalize behavior
86    if symlink_depth > MAX_SYMLINK_DEPTH {
87        return Err(io::Error::new(
88            io::ErrorKind::InvalidInput,
89            "Too many levels of symbolic links",
90        ));
91    }
92
93    // Convert to absolute path first
94    let absolute_path = if path.is_absolute() {
95        path.to_path_buf()
96    } else {
97        std::env::current_dir()?.join(path)
98    };
99
100    // First, do lexical resolution of .. and . components
101    let mut resolved_components = Vec::new();
102    let mut result = PathBuf::new();
103
104    // Collect root components (Prefix, RootDir)
105    for component in absolute_path.components() {
106        match component {
107            std::path::Component::RootDir | std::path::Component::Prefix(_) => {
108                result.push(component.as_os_str());
109            }
110            std::path::Component::Normal(name) => {
111                resolved_components.push(name.to_os_string());
112            }
113            std::path::Component::ParentDir => {
114                // Handle .. by removing the last component if possible
115                if !resolved_components.is_empty() {
116                    resolved_components.pop();
117                }
118                // If at root level, .. is ignored (cannot go above root)
119            }
120            std::path::Component::CurDir => {
121                // Ignore . components
122            }
123        }
124    }
125
126    // Now build path incrementally, handling symlinks as we go
127    let mut current_path = result;
128    let mut remaining_components = resolved_components.clone();
129
130    for (i, component) in resolved_components.iter().enumerate() {
131        let test_path = current_path.join(component);
132
133        if test_path.exists() {
134            // Check if this is a symlink
135            if test_path.is_symlink() {
136                // Check for symlink cycle
137                if visited.contains(&test_path) {
138                    return Err(io::Error::new(
139                        io::ErrorKind::InvalidInput,
140                        "Too many levels of symbolic links",
141                    ));
142                }
143
144                match fs::read_link(&test_path) {
145                    Ok(target) => {
146                        // Add this symlink to visited set
147                        visited.insert(test_path.clone());
148
149                        // Resolve the target path
150                        let resolved_target = if target.is_absolute() {
151                            target
152                        } else {
153                            current_path.join(target)
154                        };
155
156                        // Append remaining components to the target
157                        let mut full_target = resolved_target;
158                        for remaining in &resolved_components[i + 1..] {
159                            full_target.push(remaining);
160                        }
161
162                        // Recursively process the target
163                        let (symlink_prefix, symlink_suffix) =
164                            find_existing_boundary_with_symlinks(
165                                &full_target,
166                                visited,
167                                symlink_depth + 1,
168                            )?;
169
170                        // Remove from visited set
171                        visited.remove(&test_path);
172
173                        return Ok((symlink_prefix, symlink_suffix));
174                    }
175                    Err(_) => {
176                        // Broken symlink - we still need to resolve it lexically
177                        // Continue processing as if it doesn't exist, but we'll handle the
178                        // symlink target resolution in the calling function
179                        remaining_components = resolved_components[i..].to_vec();
180                        break;
181                    }
182                }
183            } else {
184                // Regular file/directory that exists
185                current_path = test_path;
186                remaining_components = resolved_components[i + 1..].to_vec();
187            }
188        } else {
189            // Found the boundary - everything from this component onwards doesn't exist
190            remaining_components = resolved_components[i..].to_vec();
191            break;
192        }
193    }
194
195    Ok((current_path, remaining_components))
196}
197
198/// Internal helper function that performs soft canonicalization.
199///
200/// This optimized version finds the existing/non-existing boundary and uses std::fs::canonicalize
201/// only on the existing portion for maximum efficiency while maintaining security and symlink handling.
202fn soft_canonicalize_internal(
203    path: &Path,
204    visited: &mut HashSet<PathBuf>,
205    symlink_depth: usize,
206) -> io::Result<PathBuf> {
207    // Check symlink depth limit to match std::fs::canonicalize behavior
208    if symlink_depth > MAX_SYMLINK_DEPTH {
209        return Err(io::Error::new(
210            io::ErrorKind::InvalidInput,
211            "Too many levels of symbolic links",
212        ));
213    }
214
215    // Handle empty path like std::fs::canonicalize - should fail
216    if path.as_os_str().is_empty() {
217        return Err(io::Error::new(
218            io::ErrorKind::NotFound,
219            "The system cannot find the path specified.",
220        ));
221    }
222
223    // Special handling for broken symlinks
224    if path.is_symlink() {
225        // Check for symlink cycle first
226        if visited.contains(path) {
227            return Err(io::Error::new(
228                io::ErrorKind::InvalidInput,
229                "Too many levels of symbolic links",
230            ));
231        }
232
233        // Check if we can canonicalize it (i.e., if the target exists)
234        match fs::canonicalize(path) {
235            Ok(canonical) => return Ok(canonical),
236            Err(_) => {
237                // It's a broken symlink - resolve it manually
238                let target = fs::read_link(path)?;
239                let resolved_target = if target.is_absolute() {
240                    target
241                } else {
242                    path.parent().unwrap_or(Path::new("/")).join(target)
243                };
244
245                // Add this symlink to visited set before recursing
246                visited.insert(path.to_path_buf());
247
248                // Recursively canonicalize the target (which may not exist)
249                let result =
250                    soft_canonicalize_internal(&resolved_target, visited, symlink_depth + 1);
251
252                // Remove from visited set after recursion
253                visited.remove(path);
254
255                return result;
256            }
257        }
258    }
259
260    // Find the boundary between existing and non-existing components
261    let (existing_prefix, non_existing_suffix) =
262        find_existing_boundary_with_symlinks(path, visited, symlink_depth)?;
263
264    // Canonicalize the existing prefix (this handles all symlinks in the existing portion)
265    let canonical_prefix = if existing_prefix.as_os_str().is_empty()
266        || existing_prefix == Path::new("/")
267        || existing_prefix.parent().is_none()
268    {
269        // Handle root paths - they're already canonical
270        existing_prefix
271    } else {
272        // Use std::fs::canonicalize for existing paths - this is secure and handles all symlinks
273        fs::canonicalize(&existing_prefix)?
274    };
275
276    // Append the non-existing components lexically (no symlinks possible in non-existing paths)
277    let mut result = canonical_prefix;
278    for component in non_existing_suffix {
279        result.push(component);
280    }
281
282    Ok(result)
283}
284
285/// Performs "soft" canonicalization on a path.
286///
287/// Unlike `std::fs::canonicalize()`, this function works with non-existent paths by:
288/// 1. Finding the deepest existing ancestor directory
289/// 2. Canonicalizing that existing part (resolving symlinks, normalizing case, etc.)
290/// 3. Appending the non-existing path components to the canonicalized base
291///
292/// This provides the security benefits of canonicalization (symlink resolution,
293/// path normalization) without requiring the entire path to exist.
294///
295/// # Algorithm Details
296///
297/// The function performs the following steps:
298///
299/// 1. **Absolute Path Conversion**: Converts relative paths to absolute paths
300/// 2. **Logical Processing**: Processes `..` components mathematically without filesystem access
301/// 3. **Symlink Cycle Detection**: Tracks visited symlinks to prevent infinite recursion
302/// 4. **Existing Prefix Discovery**: Finds the longest existing ancestor
303/// 5. **Canonicalization**: Uses `std::fs::canonicalize` on the existing portion
304/// 6. **Reconstruction**: Appends non-existing components to the canonical base
305///
306/// # Security Considerations
307///
308/// - **Directory Traversal**: `..` components are resolved logically before filesystem access
309/// - **Symlink Resolution**: Existing symlinks are resolved with proper cycle detection
310/// - **No Side Effects**: No temporary files or directories are created during the process
311///
312/// # Cross-Platform Support
313///
314/// This function works correctly on:
315/// - **Windows**: Handles drive letters, UNC paths, and case normalization
316/// - **Unix-like systems**: Handles absolute paths starting with `/`
317/// - **All platforms**: Proper handling of path separators and components
318///
319/// # Examples
320///
321/// ## Basic Usage
322///
323/// ```rust
324/// use soft_canonicalize::soft_canonicalize;
325/// use std::path::{Path, PathBuf};
326///
327/// # fn example() -> std::io::Result<()> {
328/// // Works with &str (like std::fs::canonicalize)
329/// let from_str = soft_canonicalize("some/path/file.txt")?;
330///
331/// // Works with &Path
332/// let from_path = soft_canonicalize(Path::new("some/path/file.txt"))?;
333///
334/// // Works with &PathBuf
335/// let path_buf = PathBuf::from("some/path/file.txt");
336/// let from_pathbuf = soft_canonicalize(&path_buf)?;
337///
338/// // Works with existing paths (same as std::fs::canonicalize)
339/// let existing = soft_canonicalize(&std::env::temp_dir())?;
340/// println!("Existing path: {:?}", existing);
341///
342/// // Also works with non-existing paths
343/// let non_existing = soft_canonicalize(
344///     std::env::temp_dir().join("some/deep/non/existing/path.txt")
345/// )?;
346/// println!("Non-existing path: {:?}", non_existing);
347/// # Ok(())
348/// # }
349/// ```
350///
351/// ## Directory Traversal Handling
352///
353/// ```rust
354/// use soft_canonicalize::soft_canonicalize;
355/// use std::path::Path;
356///
357/// # fn example() -> std::io::Result<()> {
358/// // Resolves .. components logically
359/// let traversal = soft_canonicalize(
360///     Path::new("some/path/../other/file.txt")
361/// )?;
362/// // Result: /current/working/dir/some/other/file.txt
363///
364/// // Works with complex traversal patterns
365/// let complex = soft_canonicalize(
366///     Path::new("deep/nested/path/../../final/file.txt")
367/// )?;
368/// // Result: /current/working/dir/deep/final/file.txt
369/// # Ok(())
370/// # }
371/// ```
372///
373/// # Errors
374///
375/// Returns an `io::Error` in the following cases:
376/// - **Permission Denied**: When the current directory cannot be accessed (for relative paths)
377/// - **Invalid Path**: When the path contains invalid Unicode or system-specific issues
378/// - **Canonicalization Failure**: When the existing portion cannot be canonicalized
379/// - **Symlink Cycles**: When circular symlink references are detected
380///
381/// Note: This function does NOT return an error for non-existent paths, as supporting
382/// such paths is the primary purpose of soft canonicalization.
383///
384/// # Performance
385///
386/// - **Time Complexity**: O(n) where n is the number of path components
387/// - **Space Complexity**: O(n) for component storage during processing
388/// - **Filesystem Access**: Minimal - only to find existing ancestors and canonicalize them
389///
390pub fn soft_canonicalize(path: impl AsRef<Path>) -> io::Result<PathBuf> {
391    let path = path.as_ref();
392    let mut visited = HashSet::new();
393    soft_canonicalize_internal(path, &mut visited, 0)
394}
395
// Test suite, compiled only for test builds. Each `mod name;` line below
// declares an out-of-line module whose source file is not part of this file.
#[cfg(test)]
mod tests {
    mod api_compatibility;
    mod basic_functionality;
    mod edge_cases;
    mod optimization;
    mod path_traversal;
    mod platform_specific;
    mod python_inspired_tests;
    mod python_lessons;
    mod security;
    mod std_behavior;
    mod symlink_depth;
}