gdown-core 0.1.0

Core download logic for Google Drive
Documentation
//! Archive extraction with security checks

use crate::error::{GdownError, Result};
use flate2::read::GzDecoder;
use std::fs::File;
use std::path::{Path, PathBuf};
use tar::Archive as TarArchive;
use zip::ZipArchive;

/// File-name suffixes this module treats as extractable archives.
///
/// `Path::extension` only yields the text after the final dot, so the compound
/// entries (`tar.gz`, `tar.bz2`) can never match a plain extension lookup;
/// `is_archive` recognises those names through its separate `.tar` stem check.
const SUPPORTED_EXTENSIONS: &[&str] = &[
    "zip",     // ZIP container
    "tar",     // uncompressed tarball
    "tar.gz",  // gzip-compressed tarball, long form
    "tgz",     // gzip-compressed tarball, short form
    "tar.bz2", // bzip2-compressed tarball, long form
    "tbz",     // bzip2-compressed tarball, short form
];

/// Check that an archive member stays inside `destination` once extracted.
///
/// `member_path` may be either the raw name stored in the archive or that
/// name already joined onto `destination`; a leading `destination` prefix is
/// removed before the check so both forms give the same answer.
///
/// The check is purely lexical (no filesystem access, symlinks are not
/// resolved). A path is rejected when:
/// - it is absolute (or carries a drive prefix) and does not live under
///   `destination`, or
/// - at any point its `..` components climb above `destination`.
///
/// `..` components are *resolved*, not discarded: silently dropping them would
/// make a traversal name such as `../../etc/passwd` look harmless.
fn is_safe_path(destination: &Path, member_path: &Path) -> bool {
    use std::path::Component;

    // Callers that pre-joined the name hand us `destination/<name>`; reduce
    // that to `<name>` so the walk below always starts at the destination.
    let relative = member_path
        .strip_prefix(destination)
        .unwrap_or(member_path);

    // Number of directory levels we currently sit below `destination`.
    let mut depth: usize = 0;
    for component in relative.components() {
        match component {
            Component::Normal(_) => depth += 1,
            Component::CurDir => {}
            Component::ParentDir => {
                // Stepping up from the destination itself escapes it.
                if depth == 0 {
                    return false;
                }
                depth -= 1;
            }
            // Anything still rooted here points outside `destination`.
            Component::RootDir | Component::Prefix(_) => return false,
        }
    }

    true
}

/// Turn an arbitrary name into a single, flat path component.
///
/// Mirrors the Python behaviour in gdown-main:
/// - NUL characters are removed
/// - `/` and `\` are each replaced with `_`
/// - surrounding whitespace is trimmed
/// - a result of `""`, `"."` or `".."` becomes `"_"`
fn sanitize_filename(filename: &str) -> String {
    // Single pass: drop NULs and neutralise both path separators.
    let flattened: String = filename
        .chars()
        .filter(|&ch| ch != '\0')
        .map(|ch| if ch == '/' || ch == '\\' { '_' } else { ch })
        .collect();

    // Names that are empty or refer to the current/parent directory are
    // replaced wholesale; everything else is returned trimmed.
    match flattened.trim() {
        "" | "." | ".." => String::from("_"),
        cleaned => cleaned.to_owned(),
    }
}

/// Extract `archive` into `destination`, choosing the extractor from the
/// archive's file name.
///
/// Recognised names: `*.zip`, `*.tar`, `*.tar.gz`, `*.tgz`, `*.tar.bz2` and
/// `*.tbz`. Unless `quiet` is set, a one-line summary is printed to stdout
/// after a successful extraction.
///
/// Returns the paths reported by the format-specific extractor.
///
/// # Errors
///
/// Returns `GdownError::Extraction` when the file name does not match a
/// supported format, and propagates any error from the chosen extractor.
pub fn extractall(archive: &Path, destination: &Path, quiet: bool) -> Result<Vec<PathBuf>> {
    let extension = archive
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("");
    let stem = archive.file_stem().and_then(|s| s.to_str()).unwrap_or("");

    // `Path::extension` only sees the last suffix, so `x.tar.gz` arrives here
    // as `gz`; widen it to the compound form when the stem ends in `.tar`.
    let kind = match extension {
        "gz" if stem.ends_with(".tar") => "tar.gz",
        "bz2" if stem.ends_with(".tar") => "tar.bz2",
        other => other,
    };

    // The short forms (`tgz`, `tbz`) have no `.tar` in their stem, so they
    // must be matched unconditionally alongside the compound forms.
    let extracted = match kind {
        "zip" => extract_zip(archive, destination)?,
        "tar" => extract_tar(archive, destination)?,
        "tar.gz" | "tgz" => extract_tar_gz(archive, destination)?,
        "tar.bz2" | "tbz" => extract_tar_bz2(archive, destination)?,
        _ => {
            return Err(GdownError::Extraction(format!(
                "Unsupported archive format: {}",
                kind
            )))
        }
    };

    if !quiet {
        println!("Extracted {} files to {:?}", extracted.len(), destination);
    }

    Ok(extracted)
}

/// Extract every entry of the ZIP file at `archive` into `destination`.
///
/// Entries whose stored name fails `is_safe_path` are skipped silently. Each
/// remaining name is passed through `sanitize_filename`, so the result is
/// flat: an entry stored as `dir/file.txt` is written to
/// `destination/dir_file.txt`.
/// NOTE(review): the flattening also turns directory entries (`dir/`) into
/// directories named `dir_`; confirm that this layout is intended.
///
/// Returns the paths of the regular files written; directory entries are
/// created but not reported.
///
/// # Errors
///
/// Returns `GdownError::Io` when the archive cannot be opened,
/// `GdownError::Extraction` when the ZIP structure cannot be read, and
/// propagates I/O failures raised while creating or writing outputs.
fn extract_zip(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
    let file = File::open(archive).map_err(GdownError::Io)?;
    let mut zip = ZipArchive::new(file).map_err(|e| GdownError::Extraction(e.to_string()))?;
    let mut extracted = Vec::new();

    for i in 0..zip.len() {
        let mut entry = zip
            .by_index(i)
            .map_err(|e| GdownError::Extraction(e.to_string()))?;

        // Check the name exactly as stored in the archive. Joining it onto
        // `destination` first would make the verdict depend on how the
        // destination itself is spelled (e.g. an absolute path containing
        // `..` caused every entry to be rejected).
        if !is_safe_path(destination, Path::new(entry.name())) {
            continue;
        }

        let final_path = destination.join(sanitize_filename(entry.name()));
        let is_dir = entry.is_dir();

        if is_dir {
            std::fs::create_dir_all(&final_path)?;
        } else {
            if let Some(parent) = final_path.parent() {
                std::fs::create_dir_all(parent)?;
            }

            let mut outfile = File::create(&final_path)?;
            std::io::copy(&mut entry, &mut outfile)?;
        }

        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;
            if let Some(mode) = entry.unix_mode() {
                // Keep only the rwx bits: the archive is untrusted input and
                // must not be able to hand out setuid/setgid/sticky files.
                let permissions = std::fs::Permissions::from_mode(mode & 0o777);
                std::fs::set_permissions(&final_path, permissions)?;
            }
        }

        // Record the path last: pushing it earlier would move `final_path`
        // while the permission step above still needs to borrow it.
        if !is_dir {
            extracted.push(final_path);
        }
    }

    Ok(extracted)
}

/// Extract TAR archive
fn extract_tar(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
    let file = File::open(archive).map_err(GdownError::Io)?;
    let mut tar = TarArchive::new(file);
    let mut extracted = Vec::new();

    for entry in tar.entries().map_err(|e| GdownError::Extraction(e.to_string()))? {
        let mut entry = entry.map_err(|e| GdownError::Extraction(e.to_string()))?;
        let path = entry.path().map_err(|e| GdownError::Extraction(e.to_string()))?.into_owned();

        if !is_safe_path(destination, &path) {
            continue;
        }

        let entry_type = entry.header().entry_type();
        if entry_type.is_symlink() || entry_type.is_hard_link() {
            continue;
        }

        let sanitized_name = sanitize_filename(path.to_str().unwrap_or(""));
        let final_path = destination.join(&sanitized_name);

        if entry_type.is_dir() {
            std::fs::create_dir_all(&final_path)?;
        } else {
            if let Some(parent) = final_path.parent() {
                std::fs::create_dir_all(parent)?;
            }
            entry.unpack(&final_path).map_err(|e| GdownError::Extraction(e.to_string()))?;
            extracted.push(final_path);
        }
    }

    Ok(extracted)
}

/// Extract TAR.GZ archive
fn extract_tar_gz(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
    let file = File::open(archive).map_err(GdownError::Io)?;
    let decoder = GzDecoder::new(file);
    let mut tar = TarArchive::new(decoder);
    let mut extracted = Vec::new();

    for entry in tar.entries().map_err(|e| GdownError::Extraction(e.to_string()))? {
        let mut entry = entry.map_err(|e| GdownError::Extraction(e.to_string()))?;
        let path = entry.path().map_err(|e| GdownError::Extraction(e.to_string()))?.into_owned();

        if !is_safe_path(destination, &path) {
            continue;
        }

        let entry_type = entry.header().entry_type();
        if entry_type.is_symlink() || entry_type.is_hard_link() {
            continue;
        }

        let sanitized_name = sanitize_filename(path.to_str().unwrap_or(""));
        let final_path = destination.join(&sanitized_name);

        if entry_type.is_dir() {
            std::fs::create_dir_all(&final_path)?;
        } else {
            if let Some(parent) = final_path.parent() {
                std::fs::create_dir_all(parent)?;
            }
            entry.unpack(&final_path).map_err(|e| GdownError::Extraction(e.to_string()))?;
            extracted.push(final_path);
        }
    }

    Ok(extracted)
}

/// Extract TAR.BZ2 archive
fn extract_tar_bz2(archive: &Path, destination: &Path) -> Result<Vec<PathBuf>> {
    let file = File::open(archive).map_err(GdownError::Io)?;
    let decoder = bzip2::read::BzDecoder::new(file);
    let mut tar = TarArchive::new(decoder);
    let mut extracted = Vec::new();

    for entry in tar.entries().map_err(|e| GdownError::Extraction(e.to_string()))? {
        let mut entry = entry.map_err(|e| GdownError::Extraction(e.to_string()))?;
        let path = entry.path().map_err(|e| GdownError::Extraction(e.to_string()))?.into_owned();

        if !is_safe_path(destination, &path) {
            continue;
        }

        let entry_type = entry.header().entry_type();
        if entry_type.is_symlink() || entry_type.is_hard_link() {
            continue;
        }

        let sanitized_name = sanitize_filename(path.to_str().unwrap_or(""));
        let final_path = destination.join(&sanitized_name);

        if entry_type.is_dir() {
            std::fs::create_dir_all(&final_path)?;
        } else {
            if let Some(parent) = final_path.parent() {
                std::fs::create_dir_all(parent)?;
            }
            entry.unpack(&final_path).map_err(|e| GdownError::Extraction(e.to_string()))?;
            extracted.push(final_path);
        }
    }

    Ok(extracted)
}

/// Report whether `path` is named like an archive this module can extract.
///
/// The decision is made from the file name alone; the file is never opened.
/// A name qualifies when its final extension appears in
/// `SUPPORTED_EXTENSIONS`, or when that extension is `gz` or `bz2` and the
/// remaining stem ends in `.tar`.
pub fn is_archive(path: &Path) -> bool {
    let suffix = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");

    // Direct hit on one of the listed suffixes.
    if SUPPORTED_EXTENSIONS.iter().any(|known| *known == suffix) {
        return true;
    }

    // Compressed tarballs: the compression suffix alone is not enough, the
    // part in front of it has to end in `.tar` as well.
    let base = path.file_stem().and_then(|name| name.to_str()).unwrap_or("");
    matches!(suffix, "gz" | "bz2") && base.ends_with(".tar")
}

#[cfg(test)]
mod tests {
    use self::fixtures::*;
    use super::*;

    // Path syntax is platform specific: on Unix a literal such as
    // `C:\tmp\extract` is a single relative component rather than an absolute
    // path, which made the absolute-path tests fail off Windows. The fixtures
    // are therefore selected per target.
    #[cfg(windows)]
    mod fixtures {
        pub const DEST: &str = "C:\\tmp\\extract";
        pub const NESTED: &str = "C:\\tmp\\extract\\nested\\file.txt";
        pub const TRAVERSAL: &str = "C:\\tmp\\..\\..\\Windows\\System32";
        pub const OUTSIDE: &str = "C:\\Windows\\system32\\file.txt";
        pub const DOTDOT_NAME: &str = "C:\\tmp\\extract\\..hidden\\file.txt";
    }

    #[cfg(not(windows))]
    mod fixtures {
        pub const DEST: &str = "/tmp/extract";
        pub const NESTED: &str = "/tmp/extract/nested/file.txt";
        pub const TRAVERSAL: &str = "/tmp/../../etc/passwd";
        pub const OUTSIDE: &str = "/etc/passwd";
        pub const DOTDOT_NAME: &str = "/tmp/extract/..hidden/file.txt";
    }

    #[test]
    fn test_is_safe_path() {
        assert!(is_safe_path(Path::new(DEST), Path::new(NESTED)));
    }

    #[test]
    fn test_is_safe_path_relative_member() {
        // Archive members are normally stored as relative names.
        assert!(is_safe_path(Path::new(DEST), Path::new("nested/file.txt")));
    }

    #[test]
    fn test_sanitize_filename() {
        // Note: Python only replaces ".." when it's the ENTIRE filename (after strip)
        // "../etc/passwd" -> ".._etc_passwd" (not "_etc_passwd")
        // "..根目录" stays "..根目录" (because ".." is not the entire string)
        assert_eq!(sanitize_filename("..根目录"), "..根目录");
        assert_eq!(sanitize_filename("../etc/passwd"), ".._etc_passwd");
        assert_eq!(sanitize_filename(".."), "_");
        assert_eq!(sanitize_filename("."), "_");
        assert_eq!(sanitize_filename(""), "_");
        assert_eq!(sanitize_filename("normal.txt"), "normal.txt");
        assert_eq!(sanitize_filename("Budget/2024.pdf"), "Budget_2024.pdf");
        assert_eq!(sanitize_filename("path\\to\\file.pdf"), "path_to_file.pdf");
        assert_eq!(sanitize_filename("file\x00name.txt"), "filename.txt");

        // Additional cases matching Python gdown-main _sanitize_filename
        assert_eq!(sanitize_filename("name/with/slashes.txt"), "name_with_slashes.txt");
        assert_eq!(sanitize_filename("name\\with\\backslashes.txt"), "name_with_backslashes.txt");
        assert_eq!(sanitize_filename("\x00nullbyte"), "nullbyte");
        assert_eq!(sanitize_filename("  file.txt  "), "file.txt");
        assert_eq!(sanitize_filename("/leading slash"), "_leading slash");
        assert_eq!(sanitize_filename("trailing slash/"), "trailing slash_");
        assert_eq!(sanitize_filename("multiple///slashes"), "multiple___slashes");
    }

    #[test]
    fn test_is_safe_path_traversal() {
        // An absolute path that climbs through `..` does not end up under
        // DEST, so it must be reported as unsafe.
        assert!(!is_safe_path(Path::new(DEST), Path::new(TRAVERSAL)));
    }

    #[test]
    fn test_is_safe_path_absolute() {
        assert!(!is_safe_path(Path::new(DEST), Path::new(OUTSIDE)));
    }

    #[test]
    fn test_is_safe_path_same_file() {
        assert!(is_safe_path(Path::new(DEST), Path::new(DEST)));
    }

    #[test]
    fn test_is_safe_path_parent_in_name() {
        // ".." as part of a file name, not as a path component
        assert!(is_safe_path(Path::new(DEST), Path::new(DOTDOT_NAME)));
    }

    #[test]
    fn test_is_archive() {
        assert!(is_archive(Path::new("file.zip")));
        assert!(is_archive(Path::new("file.tar")));
        assert!(is_archive(Path::new("file.tar.gz")));
        assert!(is_archive(Path::new("file.tgz")));
        assert!(is_archive(Path::new("file.tar.bz2")));
        assert!(is_archive(Path::new("file.tbz")));
        assert!(!is_archive(Path::new("file.txt")));
        // A bare compression suffix without a `.tar` stem is not an archive.
        assert!(!is_archive(Path::new("file.gz")));
        assert!(!is_archive(Path::new("file.bz2")));
    }
}