exarch-core 0.4.0

Memory-safe archive extraction library with security validation
Documentation
//! Archive format detection.

use std::path::Path;

use crate::ExtractionError;
use crate::Result;

/// Extensions of formats that are a ZIP container plus extra structure.
///
/// These formats layer signing, manifests, and ordering rules over the
/// ZIP bytes. They are extracted as plain ZIP; creating them is refused
/// separately in `api::reject_zip_family_creation`. Both call sites read
/// this one list so they cannot drift apart.
///
/// Every entry is lower-case ASCII, and plain `zip` is intentionally
/// absent from the list.
pub const ZIP_FAMILY_ALIASES: &[&str] = &[
    // JVM artifacts
    "jar", "war", "ear", "nar", "nbm",
    // App bundles
    "apk", "aab", "ipa", "appx", "msix",
    // Python wheels
    "whl",
    // IDE / browser extensions
    "vsix", "xpi",
    // E-books
    "epub",
];

/// Reports whether `ext` is one of the [`ZIP_FAMILY_ALIASES`], ignoring
/// ASCII case.
///
/// Bare `zip` is intentionally not an alias, so callers that must tell
/// a plain ZIP apart from a ZIP shipped under another extension can
/// check for `zip` on their own.
pub(crate) fn is_zip_family_alias(ext: &str) -> bool {
    // Every alias is stored lower-case, so an ASCII-case-insensitive
    // comparison gives the same answer as lower-casing `ext` first.
    ZIP_FAMILY_ALIASES
        .iter()
        .any(|alias| alias.eq_ignore_ascii_case(ext))
}

/// The archive formats this module can identify.
///
/// Produced by [`detect_format`]; the extensions listed on each variant
/// are the ones that function maps to it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ArchiveType {
    /// Plain, uncompressed tar (`.tar`).
    Tar,
    /// Tar compressed with gzip (`.tar.gz`, `.tgz`).
    TarGz,
    /// Tar compressed with bzip2 (`.bz2`, `.tbz`, `.tbz2`).
    TarBz2,
    /// Tar compressed with XZ (`.xz`, `.txz`).
    TarXz,
    /// Tar compressed with Zstandard (`.zst`, `.tzst`).
    TarZst,
    /// ZIP container (`.zip` and every [`ZIP_FAMILY_ALIASES`] entry).
    Zip,
    /// 7z container (`.7z`).
    SevenZ,
}

/// Detects the archive type from the extension of `path`.
///
/// Only the path's name is inspected. Matching is ASCII
/// case-insensitive, both for the extension itself and for the `.tar`
/// stem check below, so `ARCHIVE.TAR.GZ` and `archive.tar.gz` are
/// treated alike.
///
/// `.gz` files are only accepted when the stem ends with `.tar`
/// (i.e. `archive.tar.gz`). A bare `archive.gz` returns
/// [`ExtractionError::UnsupportedFormat`]. The other compression
/// suffixes (`.bz2`, `.xz`, `.zst`) carry no such stem requirement and
/// are always reported as their tar variant.
///
/// # Errors
///
/// Returns [`ExtractionError::UnsupportedFormat`] if the path has no
/// extension, the extension is not valid Unicode or is unrecognised,
/// or a `.gz` file has no `.tar` stem.
pub fn detect_format(path: &Path) -> Result<ArchiveType> {
    let extension = path
        .extension()
        .and_then(|e| e.to_str())
        .ok_or(ExtractionError::UnsupportedFormat)?;

    let ext_lower = extension.to_ascii_lowercase();
    match ext_lower.as_str() {
        "tar" => Ok(ArchiveType::Tar),
        "tgz" => Ok(ArchiveType::TarGz),
        "gz" => {
            // The extension was matched case-insensitively, so the stem
            // must be too; otherwise `ARCHIVE.TAR.GZ` would be rejected
            // while `ARCHIVE.TGZ` is accepted.
            let has_tar_stem = path.file_stem().is_some_and(|stem| {
                stem.to_string_lossy()
                    .to_ascii_lowercase()
                    .ends_with(".tar")
            });
            if has_tar_stem {
                Ok(ArchiveType::TarGz)
            } else {
                Err(ExtractionError::UnsupportedFormat)
            }
        }
        "bz2" | "tbz" | "tbz2" => Ok(ArchiveType::TarBz2),
        "xz" | "txz" => Ok(ArchiveType::TarXz),
        "zst" | "tzst" => Ok(ArchiveType::TarZst),
        "zip" => Ok(ArchiveType::Zip),
        "7z" => Ok(ArchiveType::SevenZ),
        // JVM artifacts, app bundles, Python wheels, IDE/browser
        // extensions, EPUBs - all ZIP under the hood, so they extract
        // through the same path. See `ZIP_FAMILY_ALIASES` for the list.
        ext if is_zip_family_alias(ext) => Ok(ArchiveType::Zip),
        _ => Err(ExtractionError::UnsupportedFormat),
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs detection on `name` and asserts it resolves to `expected`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test
    /// rather than at this helper.
    #[track_caller]
    fn assert_detects(name: &str, expected: ArchiveType) {
        let candidate = PathBuf::from(name);
        assert_eq!(detect_format(&candidate).unwrap(), expected);
    }

    /// Asserts that detection rejects `name` as an unsupported format.
    #[track_caller]
    fn assert_unsupported(name: &str) {
        let candidate = PathBuf::from(name);
        assert!(matches!(
            detect_format(&candidate),
            Err(ExtractionError::UnsupportedFormat)
        ));
    }

    #[test]
    fn test_detect_tar() {
        assert_detects("archive.tar", ArchiveType::Tar);
    }

    #[test]
    fn test_detect_tar_gz_still_works() {
        for name in ["archive.tar.gz", "archive.tgz"] {
            assert_detects(name, ArchiveType::TarGz);
        }
    }

    #[test]
    fn test_detect_bare_gz_returns_unsupported() {
        assert_unsupported("archive.gz");
    }

    #[test]
    fn test_detect_tar_bz2() {
        for name in ["archive.tar.bz2", "archive.tbz", "archive.tbz2"] {
            assert_detects(name, ArchiveType::TarBz2);
        }
    }

    #[test]
    fn test_detect_tar_xz() {
        for name in ["archive.tar.xz", "archive.txz"] {
            assert_detects(name, ArchiveType::TarXz);
        }
    }

    #[test]
    fn test_detect_tar_zst() {
        for name in ["archive.tar.zst", "archive.tzst"] {
            assert_detects(name, ArchiveType::TarZst);
        }
    }

    #[test]
    fn test_detect_zip() {
        assert_detects("archive.zip", ArchiveType::Zip);
    }

    #[test]
    fn test_detect_zip_family_extensions() {
        // Every alias is ZIP underneath and must resolve to
        // ArchiveType::Zip so the ZIP extractor handles it. The
        // upper-case pass covers Windows-authored filenames. Iterating
        // ZIP_FAMILY_ALIASES keeps this test in step with the list.
        for ext in ZIP_FAMILY_ALIASES {
            let lower_name = PathBuf::from(format!("archive.{ext}"));
            let detected_lower = detect_format(&lower_name).unwrap();
            assert_eq!(
                detected_lower,
                ArchiveType::Zip,
                "{ext} should detect as ZIP",
            );

            let upper_ext = ext.to_ascii_uppercase();
            let upper_name = PathBuf::from(format!("archive.{upper_ext}"));
            let detected_upper = detect_format(&upper_name).unwrap();
            assert_eq!(
                detected_upper,
                ArchiveType::Zip,
                "{ext} uppercase should detect as ZIP",
            );
        }
    }

    #[test]
    fn test_detect_7z() {
        assert_detects("archive.7z", ArchiveType::SevenZ);
    }

    #[test]
    fn test_detect_7z_case_insensitive() {
        for name in ["ARCHIVE.7Z", "Archive.7Z"] {
            assert_detects(name, ArchiveType::SevenZ);
        }
    }

    #[test]
    fn test_detect_unsupported() {
        assert_unsupported("archive.rar");
    }

    #[test]
    fn test_archive_type_sevenz_equality() {
        let seven_z = ArchiveType::SevenZ;
        assert_eq!(seven_z, ArchiveType::SevenZ);
        assert_ne!(seven_z, ArchiveType::Zip);
    }

    #[test]
    fn test_archive_type_sevenz_debug() {
        let rendered = format!("{:?}", ArchiveType::SevenZ);
        assert_eq!(rendered, "SevenZ");
    }
}