Skip to main content

gdown_core/
extract.rs

1//! Archive extraction with security checks
2
3use crate::error::{GdownError, Result};
4use flate2::read::GzDecoder;
5use std::fs::File;
6use std::path::{Path, PathBuf};
7use tar::Archive as TarArchive;
8use zip::ZipArchive;
9
/// File-name suffixes that identify an archive this module knows about.
///
/// Entries are compared against the final extension of a path, without the
/// leading dot.
const SUPPORTED_EXTENSIONS: &[&str] = &[
    // ZIP container
    "zip",
    // Plain and gzip-compressed tarballs
    "tar",
    "tar.gz",
    "tgz",
    // bzip2-compressed tarballs
    "tar.bz2",
    "tbz",
];
14
/// Check whether an archive member path stays inside `destination`.
///
/// The check is purely lexical: nothing is resolved against the filesystem,
/// so symlinks are not followed.
///
/// * `destination` - directory the archive is being extracted into.
/// * `member_path` - the entry's path, either as stored in the archive or
///   already joined onto `destination`.
///
/// Returns `false` when the path is rooted/prefixed somewhere other than
/// `destination`, or when a `..` component would climb above `destination`.
/// `..` components that stay inside the destination (for example `a/../b`)
/// are accepted.
fn is_safe_path(destination: &Path, member_path: &Path) -> bool {
    use std::path::Component;

    // Callers may hand in the already-joined path; in that case only the part
    // below the destination matters.
    let relative = member_path
        .strip_prefix(destination)
        .unwrap_or(member_path);

    // Number of directory levels we currently are below `destination`.
    let mut depth: usize = 0;
    for component in relative.components() {
        match component {
            // `Path::join` discards the base for rooted or prefixed paths, so
            // such a member would be written outside the destination.
            Component::Prefix(_) | Component::RootDir => return false,
            Component::CurDir => {}
            // Silently dropping `..` would hide a traversal attempt; instead
            // track the depth and reject as soon as it would go negative.
            Component::ParentDir => match depth.checked_sub(1) {
                Some(shallower) => depth = shallower,
                None => return false,
            },
            Component::Normal(_) => depth += 1,
        }
    }

    true
}
25
/// Turn an arbitrary string into a single, traversal-free file name.
///
/// Mirrors the Python implementation in gdown-main:
/// - NUL characters are dropped
/// - `/` and `\` each become `_`
/// - surrounding whitespace is trimmed
/// - an empty result, `.` or `..` is replaced by `_`
fn sanitize_filename(filename: &str) -> String {
    // One pass over the characters: drop NULs, neutralise path separators.
    let replaced: String = filename
        .chars()
        .filter(|&ch| ch != '\0')
        .map(|ch| if matches!(ch, '/' | '\\') { '_' } else { ch })
        .collect();

    match replaced.trim() {
        // Names that are empty or refer to the current/parent directory are
        // never usable as a file name.
        "" | "." | ".." => String::from("_"),
        cleaned => cleaned.to_owned(),
    }
}
41
42/// Extract archive to destination
43pub fn extractall(archive: &Path, destination: &Path, quiet: bool) -> Result<Vec<PathBuf>> {
44    let extension = archive
45        .extension()
46        .and_then(|e| e.to_str())
47        .unwrap_or("");
48
49    let stem = archive.file_stem().and_then(|s| s.to_str()).unwrap_or("");
50
51    let full_ext = if stem.ends_with(".tar") && extension == "gz" {
52        "tar.gz"
53    } else if stem.ends_with(".tar") && extension == "bz2" {
54        "tar.bz2"
55    } else {
56        extension
57    };
58
59    let extracted = match full_ext {
60        "zip" => extract_zip(archive, destination),
61        "tar" => extract_tar(archive, destination),
62        "gz" | "tgz" if stem.ends_with(".tar") => extract_tar_gz(archive, destination),
63        "tar.gz" => extract_tar_gz(archive, destination),
64        "bz2" | "tbz" if stem.ends_with(".tar") => extract_tar_bz2(archive, destination),
65        "tar.bz2" => extract_tar_bz2(archive, destination),
66        _ => Err(GdownError::Extraction(format!("Unsupported archive format: {}", full_ext))),
67    }?;
68
69    if !quiet {
70        println!("Extracted {} files to {:?}", extracted.len(), destination);
71    }
72
73    Ok(extracted)
74}
75
76/// Extract ZIP archive
77fn extract_zip(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
78    let file = File::open(archive).map_err(GdownError::Io)?;
79    let mut zip = ZipArchive::new(file).map_err(|e| GdownError::Extraction(e.to_string()))?;
80    let mut extracted = Vec::new();
81
82    for i in 0..zip.len() {
83        let mut file = zip.by_index(i).map_err(|e| GdownError::Extraction(e.to_string()))?;
84        let outpath = destination.join(file.name());
85
86        if !is_safe_path(destination, &outpath) {
87            continue;
88        }
89
90        let sanitized_name = sanitize_filename(file.name());
91        let final_path = destination.join(&sanitized_name);
92
93        if file.is_dir() {
94            std::fs::create_dir_all(&final_path)?;
95        } else {
96            if let Some(parent) = final_path.parent() {
97                std::fs::create_dir_all(parent)?;
98            }
99
100            let mut outfile = File::create(&final_path)?;
101            std::io::copy(&mut file, &mut outfile)?;
102            extracted.push(final_path);
103        }
104
105        #[cfg(unix)]
106        {
107            use std::os::unix::fs::PermissionsExt;
108            if let Some(mode) = file.unix_mode() {
109                std::fs::set_permissions(&final_path, std::fs::Permissions::from_mode(mode))?;
110            }
111        }
112    }
113
114    Ok(extracted)
115}
116
117/// Extract TAR archive
118fn extract_tar(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
119    let file = File::open(archive).map_err(GdownError::Io)?;
120    let mut tar = TarArchive::new(file);
121    let mut extracted = Vec::new();
122
123    for entry in tar.entries().map_err(|e| GdownError::Extraction(e.to_string()))? {
124        let mut entry = entry.map_err(|e| GdownError::Extraction(e.to_string()))?;
125        let path = entry.path().map_err(|e| GdownError::Extraction(e.to_string()))?.into_owned();
126
127        if !is_safe_path(destination, &path) {
128            continue;
129        }
130
131        let entry_type = entry.header().entry_type();
132        if entry_type.is_symlink() || entry_type.is_hard_link() {
133            continue;
134        }
135
136        let sanitized_name = sanitize_filename(path.to_str().unwrap_or(""));
137        let final_path = destination.join(&sanitized_name);
138
139        if entry_type.is_dir() {
140            std::fs::create_dir_all(&final_path)?;
141        } else {
142            if let Some(parent) = final_path.parent() {
143                std::fs::create_dir_all(parent)?;
144            }
145            entry.unpack(&final_path).map_err(|e| GdownError::Extraction(e.to_string()))?;
146            extracted.push(final_path);
147        }
148    }
149
150    Ok(extracted)
151}
152
153/// Extract TAR.GZ archive
154fn extract_tar_gz(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
155    let file = File::open(archive).map_err(GdownError::Io)?;
156    let decoder = GzDecoder::new(file);
157    let mut tar = TarArchive::new(decoder);
158    let mut extracted = Vec::new();
159
160    for entry in tar.entries().map_err(|e| GdownError::Extraction(e.to_string()))? {
161        let mut entry = entry.map_err(|e| GdownError::Extraction(e.to_string()))?;
162        let path = entry.path().map_err(|e| GdownError::Extraction(e.to_string()))?.into_owned();
163
164        if !is_safe_path(destination, &path) {
165            continue;
166        }
167
168        let entry_type = entry.header().entry_type();
169        if entry_type.is_symlink() || entry_type.is_hard_link() {
170            continue;
171        }
172
173        let sanitized_name = sanitize_filename(path.to_str().unwrap_or(""));
174        let final_path = destination.join(&sanitized_name);
175
176        if entry_type.is_dir() {
177            std::fs::create_dir_all(&final_path)?;
178        } else {
179            if let Some(parent) = final_path.parent() {
180                std::fs::create_dir_all(parent)?;
181            }
182            entry.unpack(&final_path).map_err(|e| GdownError::Extraction(e.to_string()))?;
183            extracted.push(final_path);
184        }
185    }
186
187    Ok(extracted)
188}
189
190/// Extract TAR.BZ2 archive
191fn extract_tar_bz2(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
192    let file = File::open(archive).map_err(GdownError::Io)?;
193    let decoder = bzip2::read::BzDecoder::new(file);
194    let mut tar = TarArchive::new(decoder);
195    let mut extracted = Vec::new();
196
197    for entry in tar.entries().map_err(|e| GdownError::Extraction(e.to_string()))? {
198        let mut entry = entry.map_err(|e| GdownError::Extraction(e.to_string()))?;
199        let path = entry.path().map_err(|e| GdownError::Extraction(e.to_string()))?.into_owned();
200
201        if !is_safe_path(destination, &path) {
202            continue;
203        }
204
205        let entry_type = entry.header().entry_type();
206        if entry_type.is_symlink() || entry_type.is_hard_link() {
207            continue;
208        }
209
210        let sanitized_name = sanitize_filename(path.to_str().unwrap_or(""));
211        let final_path = destination.join(&sanitized_name);
212
213        if entry_type.is_dir() {
214            std::fs::create_dir_all(&final_path)?;
215        } else {
216            if let Some(parent) = final_path.parent() {
217                std::fs::create_dir_all(parent)?;
218            }
219            entry.unpack(&final_path).map_err(|e| GdownError::Extraction(e.to_string()))?;
220            extracted.push(final_path);
221        }
222    }
223
224    Ok(extracted)
225}
226
227/// Check if file is a supported archive
228pub fn is_archive(path: &Path) -> bool {
229    let extension = path
230        .extension()
231        .and_then(|e| e.to_str())
232        .unwrap_or("");
233
234    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
235
236    SUPPORTED_EXTENSIONS.contains(&extension)
237        || (stem.ends_with(".tar") && (extension == "gz" || extension == "bz2"))
238}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Pick the literal that is a proper absolute path on the platform the
    /// tests run on. A Windows-style path is a single relative component on
    /// Unix, which would make the path-safety assertions meaningless there.
    fn native<'a>(windows: &'a str, unix: &'a str) -> &'a Path {
        Path::new(if cfg!(windows) { windows } else { unix })
    }

    /// Extraction directory used by the path-safety tests.
    fn dest() -> &'static Path {
        native("C:\\tmp\\extract", "/tmp/extract")
    }

    #[test]
    fn test_is_safe_path() {
        let safe = native(
            "C:\\tmp\\extract\\nested\\file.txt",
            "/tmp/extract/nested/file.txt",
        );
        assert!(is_safe_path(dest(), safe));
    }

    #[test]
    fn test_sanitize_filename() {
        // Note: Python only replaces ".." when it's the ENTIRE filename (after strip)
        // "../etc/passwd" -> ".._etc_passwd" (not "_etc_passwd")
        // "..根目录" stays "..根目录" (because ".." is not the entire string)
        assert_eq!(sanitize_filename("..根目录"), "..根目录");
        assert_eq!(sanitize_filename(".."), "_");
        assert_eq!(sanitize_filename("."), "_");
        assert_eq!(sanitize_filename(""), "_");
        assert_eq!(sanitize_filename("normal.txt"), "normal.txt");
        assert_eq!(sanitize_filename("Budget/2024.pdf"), "Budget_2024.pdf");
        assert_eq!(sanitize_filename("path\\to\\file.pdf"), "path_to_file.pdf");
        assert_eq!(sanitize_filename("file\x00name.txt"), "filename.txt");

        // Additional cases matching Python gdown-main _sanitize_filename
        assert_eq!(sanitize_filename("name/with/slashes.txt"), "name_with_slashes.txt");
        assert_eq!(sanitize_filename("name\\with\\backslashes.txt"), "name_with_backslashes.txt");
        assert_eq!(sanitize_filename("\x00nullbyte"), "nullbyte");
        assert_eq!(sanitize_filename("  file.txt  "), "file.txt");
        assert_eq!(sanitize_filename("/leading slash"), "_leading slash");
        assert_eq!(sanitize_filename("trailing slash/"), "trailing slash_");
        assert_eq!(sanitize_filename("multiple///slashes"), "multiple___slashes");
    }

    #[test]
    fn test_is_safe_path_unsafe_windows() {
        // An absolute path that walks out of the extraction directory through
        // `..` components must be reported as unsafe.
        let unsafe_path = native(
            "C:\\tmp\\..\\..\\Windows\\System32",
            "/tmp/../../etc/passwd",
        );
        assert!(!is_safe_path(dest(), unsafe_path));
    }

    #[test]
    fn test_is_safe_path_absolute() {
        // An absolute path outside the extraction directory is unsafe.
        let absolute = native("C:\\Windows\\system32\\file.txt", "/etc/passwd");
        assert!(!is_safe_path(dest(), absolute));
    }

    #[test]
    fn test_is_safe_path_same_file() {
        assert!(is_safe_path(dest(), dest()));
    }

    #[test]
    fn test_is_safe_path_parent_in_name() {
        // ".." as part of filename, not as path component
        let parent_in_name = native(
            "C:\\tmp\\extract\\..hidden\\file.txt",
            "/tmp/extract/..hidden/file.txt",
        );
        assert!(is_safe_path(dest(), parent_in_name));
    }

    #[test]
    fn test_is_archive() {
        assert!(is_archive(Path::new("file.zip")));
        assert!(is_archive(Path::new("file.tar")));
        assert!(is_archive(Path::new("file.tar.gz")));
        assert!(is_archive(Path::new("file.tgz")));
        assert!(is_archive(Path::new("file.tar.bz2")));
        assert!(is_archive(Path::new("file.tbz")));
        assert!(!is_archive(Path::new("file.txt")));
        // A bare compression suffix without a `.tar` stem is not an archive.
        assert!(!is_archive(Path::new("file.gz")));
    }
}