#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]
use core::fmt;
/// Extension of an uncompressed WARC file, written without a leading dot.
pub const WARC_EXTENSION: &str = "warc";

/// Extension of a gzip-compressed WARC file, written without a leading dot.
pub const WARC_GZIP_EXTENSION: &str = "warc.gz";

/// Extension of an uncompressed ARC file, written without a leading dot.
pub const ARC_EXTENSION: &str = "arc";

/// Extension of a gzip-compressed ARC file, written without a leading dot.
pub const ARC_GZIP_EXTENSION: &str = "arc.gz";

/// All four extensions above, in the order they are declared.
pub const WARC_EXTENSIONS: &[&str] = &[
    WARC_EXTENSION,
    WARC_GZIP_EXTENSION,
    ARC_EXTENSION,
    ARC_GZIP_EXTENSION,
];
/// Archive format tag.
///
/// Each variant has a fixed lowercase label available through
/// [`WarcFormat::as_str`] and the `Display` implementation. The default
/// value is [`WarcFormat::Unknown`].
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum WarcFormat {
    /// The WARC format; labelled `"warc"`.
    Warc,
    /// The ARC format; labelled `"arc"`.
    Arc,
    /// No known format; labelled `"unknown"`. This is the default.
    #[default]
    Unknown,
}

impl WarcFormat {
    /// Returns the static lowercase label for this format.
    ///
    /// The result is one of `"warc"`, `"arc"`, or `"unknown"`.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Warc => "warc",
            Self::Arc => "arc",
            Self::Unknown => "unknown",
        }
    }
}

impl fmt::Display for WarcFormat {
    /// Writes the label returned by [`WarcFormat::as_str`].
    ///
    /// The label is emitted through [`fmt::Formatter::pad`] rather than
    /// `write_str` so that width, fill, alignment, and precision given in the
    /// format spec (for example `{:>8}`) are applied instead of being
    /// silently dropped. Output for a plain `{}` is unchanged.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.pad(self.as_str())
    }
}
/// Record type tag.
///
/// Each variant has a fixed lowercase label available through
/// [`WarcRecordKind::as_str`] and the `Display` implementation. The default
/// value is [`WarcRecordKind::Unknown`].
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum WarcRecordKind {
    /// Labelled `"warcinfo"`.
    WarcInfo,
    /// Labelled `"response"`.
    Response,
    /// Labelled `"request"`.
    Request,
    /// Labelled `"metadata"`.
    Metadata,
    /// Labelled `"revisit"`.
    Revisit,
    /// Labelled `"conversion"`.
    Conversion,
    /// Labelled `"continuation"`.
    Continuation,
    /// Labelled `"resource"`.
    Resource,
    /// No known record type; labelled `"unknown"`. This is the default.
    #[default]
    Unknown,
}

impl WarcRecordKind {
    /// Returns the static lowercase label for this record kind.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::WarcInfo => "warcinfo",
            Self::Response => "response",
            Self::Request => "request",
            Self::Metadata => "metadata",
            Self::Revisit => "revisit",
            Self::Conversion => "conversion",
            Self::Continuation => "continuation",
            Self::Resource => "resource",
            Self::Unknown => "unknown",
        }
    }
}

impl fmt::Display for WarcRecordKind {
    /// Writes the label returned by [`WarcRecordKind::as_str`].
    ///
    /// The label is emitted through [`fmt::Formatter::pad`] rather than
    /// `write_str` so that width, fill, alignment, and precision given in the
    /// format spec (for example `{:<12}`) are applied instead of being
    /// silently dropped. Output for a plain `{}` is unchanged.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.pad(self.as_str())
    }
}
/// Reports whether `extension` names one of the recognised archive
/// extensions: `warc`, `warc.gz`, `arc`, or `arc.gz`.
///
/// The input is normalised with `normalize_extension` before the comparison,
/// so surrounding whitespace, leading dots, and ASCII case do not matter.
#[must_use]
pub fn is_warc_extension(extension: &str) -> bool {
    const RECOGNISED: [&str; 4] = ["warc", "warc.gz", "arc", "arc.gz"];

    let normalized = normalize_extension(extension);
    RECOGNISED.contains(&normalized.as_str())
}
/// Reports whether `name` looks like a WARC or ARC archive file name.
///
/// `name` may be a bare file name or a path; `filename_parts` reduces it to
/// the lowercased, non-empty, dot-separated components of the final path
/// segment. The name matches when those components end in `warc` or `arc`,
/// optionally followed by `gz`, **and** at least one component precedes the
/// extension.
///
/// The stem requirement keeps extension-less names that merely equal an
/// extension (for example an executable called `arc`, or a file called
/// `warc.gz`) from being reported as archives.
#[must_use]
pub fn is_warc_filename(name: &str) -> bool {
    let parts = filename_parts(name);
    let is_archive = |part: &str| matches!(part, "warc" | "arc");

    match parts.as_slice() {
        // `<stem>.warc` / `<stem>.arc`: the leading element guarantees a stem.
        [_stem, .., last] if is_archive(last.as_str()) => true,
        // `<stem>.warc.gz` / `<stem>.arc.gz`.
        [_stem, .., previous, last] => is_archive(previous.as_str()) && last == "gz",
        _ => false,
    }
}
/// Canonicalises an extension string for comparison.
///
/// Surrounding whitespace is trimmed, every leading `.` is removed, and the
/// remainder is ASCII-lowercased into a new `String`.
fn normalize_extension(extension: &str) -> String {
    let trimmed = extension.trim();
    let without_leading_dots = trimmed.trim_start_matches('.');
    without_leading_dots.to_ascii_lowercase()
}
/// Splits the final path segment of `name` into its dot-separated parts.
///
/// Surrounding whitespace is trimmed, everything up to and including the last
/// `/` or `\` is discarded, and the remaining segment is split on `.`. Empty
/// parts (from leading, trailing, or doubled dots) are skipped and each
/// surviving part is ASCII-lowercased.
fn filename_parts(name: &str) -> Vec<String> {
    let trimmed = name.trim();
    // `rsplit` always yields at least one item, so the fallback is only a formality.
    let basename = trimmed.rsplit(['/', '\\']).next().unwrap_or_default();

    basename
        .split('.')
        .filter(|segment| !segment.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
#[cfg(test)]
mod tests {
    use super::{WARC_EXTENSIONS, WarcFormat, WarcRecordKind, is_warc_extension, is_warc_filename};

    /// Extension strings are accepted with or without a leading dot, and the
    /// public extension list starts with `warc`.
    #[test]
    fn detects_warc_extensions() {
        for extension in [".warc", "warc.gz", "arc.gz"] {
            assert!(
                is_warc_extension(extension),
                "{extension:?} should be a recognised extension"
            );
        }
        assert_eq!(WARC_EXTENSIONS.first().copied(), Some("warc"));
    }

    /// File names are matched case-insensitively; unrelated extensions are rejected.
    #[test]
    fn detects_warc_filenames() {
        let cases = [
            ("crawl.warc", true),
            ("crawl.ARC.GZ", true),
            ("bundle.zip", false),
        ];
        for (name, expected) in cases {
            assert_eq!(is_warc_filename(name), expected, "file name {name:?}");
        }
    }

    /// Both enums default to `Unknown` and expose their lowercase labels.
    #[test]
    fn exposes_default_and_unknown_labels() {
        assert_eq!(WarcFormat::Unknown, WarcFormat::default());
        assert_eq!("arc", WarcFormat::Arc.as_str());
        assert_eq!(WarcRecordKind::Unknown, WarcRecordKind::default());
        assert_eq!("continuation", WarcRecordKind::Continuation.as_str());
    }
}