use-warc 0.1.0

WARC and ARC web archive labels, extensions, and record metadata for RustUse
Documentation
#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]

//! WARC and ARC web archive labels for `RustUse`.

use core::fmt;

/// Extension label for uncompressed WARC files (no leading dot).
pub const WARC_EXTENSION: &str = "warc";
/// Extension label for gzip-compressed WARC files (no leading dot).
pub const WARC_GZIP_EXTENSION: &str = "warc.gz";
/// Extension label for uncompressed ARC files (no leading dot).
pub const ARC_EXTENSION: &str = "arc";
/// Extension label for gzip-compressed ARC files (no leading dot).
pub const ARC_GZIP_EXTENSION: &str = "arc.gz";
/// All four extension labels declared above, WARC entries first.
pub const WARC_EXTENSIONS: &[&str] = &[
    WARC_EXTENSION,
    WARC_GZIP_EXTENSION,
    ARC_EXTENSION,
    ARC_GZIP_EXTENSION,
];

/// Label identifying which web archive format a value refers to.
///
/// [`WarcFormat::Unknown`] is the [`Default`] value. The derived ordering
/// follows declaration order: `Warc`, then `Arc`, then `Unknown`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum WarcFormat {
    /// The WARC format.
    Warc,
    /// The ARC format.
    Arc,
    /// A format that is unknown or was deliberately left unspecified.
    #[default]
    Unknown,
}

impl WarcFormat {
    /// Returns the fixed lowercase label for this format: `"warc"`,
    /// `"arc"`, or `"unknown"`.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Unknown => "unknown",
            Self::Arc => "arc",
            Self::Warc => "warc",
        }
    }
}

/// Writes the same label that [`WarcFormat::as_str`] returns.
impl fmt::Display for WarcFormat {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `write_str` emits the label verbatim; width and padding flags on
        // the formatter are not applied.
        let label = self.as_str();
        f.write_str(label)
    }
}

/// Label identifying the kind of a WARC record.
///
/// [`WarcRecordKind::Unknown`] is the [`Default`] value. The derived ordering
/// follows declaration order, from `WarcInfo` through `Unknown`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum WarcRecordKind {
    /// A `warcinfo` record.
    WarcInfo,
    /// A `response` record.
    Response,
    /// A `request` record.
    Request,
    /// A `metadata` record.
    Metadata,
    /// A `revisit` record.
    Revisit,
    /// A `conversion` record.
    Conversion,
    /// A `continuation` record.
    Continuation,
    /// A `resource` record.
    Resource,
    /// A record kind that is unknown or not supported.
    #[default]
    Unknown,
}

impl WarcRecordKind {
    /// Returns the fixed lowercase label for this record kind, for example
    /// `"warcinfo"` or `"unknown"`.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Unknown => "unknown",
            Self::Resource => "resource",
            Self::Continuation => "continuation",
            Self::Conversion => "conversion",
            Self::Revisit => "revisit",
            Self::Metadata => "metadata",
            Self::Request => "request",
            Self::Response => "response",
            Self::WarcInfo => "warcinfo",
        }
    }
}

/// Writes the same label that [`WarcRecordKind::as_str`] returns.
impl fmt::Display for WarcRecordKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `write_str` emits the label verbatim; width and padding flags on
        // the formatter are not applied.
        let label = self.as_str();
        f.write_str(label)
    }
}

/// Reports whether `extension` names one of the known WARC/ARC extensions.
///
/// The input is normalized first: surrounding whitespace and any leading
/// dots are removed and ASCII letters are lowercased, so `".WARC"` and
/// `"warc"` are treated alike. The accepted labels are `warc`, `warc.gz`,
/// `arc`, and `arc.gz`.
#[must_use]
pub fn is_warc_extension(extension: &str) -> bool {
    let normalized = normalize_extension(extension);
    ["warc", "warc.gz", "arc", "arc.gz"].contains(&normalized.as_str())
}

/// Reports whether `name` ends in a known WARC/ARC extension.
///
/// Only the final path component is inspected, case-insensitively (ASCII).
/// The name matches when its last dot-separated part is `warc` or `arc`, or
/// when its last part is `gz` and the part before it is `warc` or `arc`.
/// A bare name such as `warc`, with no stem in front of it, also matches.
#[must_use]
pub fn is_warc_filename(name: &str) -> bool {
    let parts = filename_parts(name);
    let is_archive_label = |part: &str| part == "warc" || part == "arc";

    // Walk the parts from the end: first the last one, then the one before.
    let mut from_end = parts.iter().rev();
    match (from_end.next(), from_end.next()) {
        (Some(last), _) if is_archive_label(last.as_str()) => true,
        (Some(last), Some(previous)) => last == "gz" && is_archive_label(previous.as_str()),
        _ => false,
    }
}

/// Normalizes an extension label for comparison.
///
/// Strips surrounding whitespace, then every leading `.`, and finally
/// lowercases ASCII letters. Interior characters are left untouched.
fn normalize_extension(extension: &str) -> String {
    let unpadded = extension.trim();
    let undotted = unpadded.trim_start_matches('.');
    undotted.to_ascii_lowercase()
}

/// Splits the final path component of `name` into lowercase dot-separated
/// parts.
///
/// The input is trimmed and ASCII-lowercased, everything up to the last `/`
/// or `\` is discarded, and the remainder is split on `.` with empty parts
/// dropped. A name that ends in a path separator yields no parts.
fn filename_parts(name: &str) -> Vec<String> {
    let lowered = name.trim().to_ascii_lowercase();
    // The first item yielded from the right is the text after the last separator.
    let base = lowered.rsplit(['/', '\\']).next().unwrap_or_default();

    let mut parts = Vec::new();
    for segment in base.trim_start_matches('.').split('.') {
        if !segment.is_empty() {
            parts.push(segment.to_owned());
        }
    }
    parts
}

#[cfg(test)]
mod tests {
    use super::{WARC_EXTENSIONS, WarcFormat, WarcRecordKind, is_warc_extension, is_warc_filename};

    /// Extension labels are recognized with or without a leading dot.
    #[test]
    fn detects_warc_extensions() {
        for label in [".warc", "warc.gz", "arc.gz"] {
            assert!(is_warc_extension(label));
        }
        assert_eq!(WARC_EXTENSIONS.first().copied(), Some("warc"));
    }

    /// Filenames are matched case-insensitively; unrelated extensions are rejected.
    #[test]
    fn detects_warc_filenames() {
        for name in ["crawl.warc", "crawl.ARC.GZ"] {
            assert!(is_warc_filename(name));
        }
        assert!(!is_warc_filename("bundle.zip"));
    }

    /// Both enums default to `Unknown` and expose lowercase labels.
    #[test]
    fn exposes_default_and_unknown_labels() {
        let default_format = WarcFormat::default();
        let default_kind = WarcRecordKind::default();

        assert_eq!(default_format, WarcFormat::Unknown);
        assert_eq!(WarcFormat::Arc.as_str(), "arc");
        assert_eq!(default_kind, WarcRecordKind::Unknown);
        assert_eq!(WarcRecordKind::Continuation.as_str(), "continuation");
    }
}