Skip to main content

use_warc/
lib.rs

1#![forbid(unsafe_code)]
2#![doc = include_str!("../README.md")]
3
4//! WARC and ARC web archive labels for `RustUse`.
5
6use core::fmt;
7
/// Extension label commonly used for WARC files (no leading dot).
pub const WARC_EXTENSION: &str = "warc";
/// Extension label commonly used for gzip-compressed WARC files (no leading dot).
pub const WARC_GZIP_EXTENSION: &str = "warc.gz";
/// Extension label commonly used for ARC files (no leading dot).
pub const ARC_EXTENSION: &str = "arc";
/// Extension label commonly used for gzip-compressed ARC files (no leading dot).
pub const ARC_GZIP_EXTENSION: &str = "arc.gz";
/// All common WARC/ARC-related extension labels, in the order:
/// WARC, gzip-compressed WARC, ARC, gzip-compressed ARC.
pub const WARC_EXTENSIONS: &[&str] = &[
    WARC_EXTENSION,
    WARC_GZIP_EXTENSION,
    ARC_EXTENSION,
    ARC_GZIP_EXTENSION,
];
18
/// Label identifying a web archive format.
///
/// The default value is [`WarcFormat::Unknown`]. Variant declaration order
/// determines the derived ordering (`Warc < Arc < Unknown`).
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum WarcFormat {
    /// The WARC format.
    Warc,
    /// The ARC format.
    Arc,
    /// The format is unknown or was intentionally left unspecified.
    #[default]
    Unknown,
}

impl WarcFormat {
    /// Returns the stable lowercase label for this format:
    /// `"warc"`, `"arc"`, or `"unknown"`.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            WarcFormat::Unknown => "unknown",
            WarcFormat::Arc => "arc",
            WarcFormat::Warc => "warc",
        }
    }
}

impl fmt::Display for WarcFormat {
    /// Writes the label returned by [`WarcFormat::as_str`] verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = WarcFormat::as_str(*self);
        // `write_str` emits the label as-is; width/fill flags are not applied.
        f.write_str(label)
    }
}
48
/// Label identifying the kind of a WARC record.
///
/// The default value is [`WarcRecordKind::Unknown`]. Variant declaration
/// order determines the derived ordering.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum WarcRecordKind {
    /// A `warcinfo` record.
    WarcInfo,
    /// A `response` record.
    Response,
    /// A `request` record.
    Request,
    /// A `metadata` record.
    Metadata,
    /// A `revisit` record.
    Revisit,
    /// A `conversion` record.
    Conversion,
    /// A `continuation` record.
    Continuation,
    /// A `resource` record.
    Resource,
    /// The record kind is unknown or unsupported.
    #[default]
    Unknown,
}

impl WarcRecordKind {
    /// Returns the stable lowercase label for this record kind
    /// (for example `"warcinfo"` or `"unknown"`).
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            WarcRecordKind::Unknown => "unknown",
            WarcRecordKind::Resource => "resource",
            WarcRecordKind::Continuation => "continuation",
            WarcRecordKind::Conversion => "conversion",
            WarcRecordKind::Revisit => "revisit",
            WarcRecordKind::Metadata => "metadata",
            WarcRecordKind::Request => "request",
            WarcRecordKind::Response => "response",
            WarcRecordKind::WarcInfo => "warcinfo",
        }
    }
}

impl fmt::Display for WarcRecordKind {
    /// Writes the label returned by [`WarcRecordKind::as_str`] verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = WarcRecordKind::as_str(*self);
        // `write_str` emits the label as-is; width/fill flags are not applied.
        f.write_str(label)
    }
}
96
97/// Returns whether `extension` is a known WARC/ARC extension label.
98#[must_use]
99pub fn is_warc_extension(extension: &str) -> bool {
100    matches!(
101        normalize_extension(extension).as_str(),
102        "warc" | "warc.gz" | "arc" | "arc.gz"
103    )
104}
105
106/// Returns whether `name` has a known WARC/ARC filename encoding.
107#[must_use]
108pub fn is_warc_filename(name: &str) -> bool {
109    let parts = filename_parts(name);
110
111    match parts.as_slice() {
112        [.., last] if matches!(last.as_str(), "warc" | "arc") => true,
113        [.., previous, last] if matches!(previous.as_str(), "warc" | "arc") && last == "gz" => true,
114        _ => false,
115    }
116}
117
/// Normalizes an extension label for comparison.
///
/// Trims surrounding whitespace, strips every leading `.`, and converts
/// ASCII letters to lowercase. Non-ASCII characters are left unchanged.
fn normalize_extension(extension: &str) -> String {
    let without_dots = extension.trim().trim_start_matches('.');
    let mut normalized = String::from(without_dots);
    normalized.make_ascii_lowercase();
    normalized
}
124
/// Splits the final path component of `name` into lowercase dot-separated parts.
///
/// Surrounding whitespace is trimmed, everything up to the last `/` or `\`
/// is discarded, and the remainder is split on `.`. Empty segments (from
/// leading, trailing, or doubled dots) are dropped, and each kept segment is
/// ASCII-lowercased.
fn filename_parts(name: &str) -> Vec<String> {
    let trimmed = name.trim();
    // `rsplit` always yields at least one item, so the default is only a formality.
    let base_name = trimmed.rsplit(['/', '\\']).next().unwrap_or_default();

    let mut parts = Vec::new();
    for segment in base_name.split('.') {
        if segment.is_empty() {
            continue;
        }
        parts.push(segment.to_ascii_lowercase());
    }
    parts
}
137
#[cfg(test)]
mod tests {
    use super::{WARC_EXTENSIONS, WarcFormat, WarcRecordKind, is_warc_extension, is_warc_filename};

    /// Extension labels are recognized with or without a leading dot.
    #[test]
    fn detects_warc_extensions() {
        for extension in [".warc", "warc.gz", "arc.gz"] {
            assert!(is_warc_extension(extension));
        }
        assert_eq!(WARC_EXTENSIONS[0], "warc");
    }

    /// Filenames are matched case-insensitively; unrelated extensions are rejected.
    #[test]
    fn detects_warc_filenames() {
        for name in ["crawl.warc", "crawl.ARC.GZ"] {
            assert!(is_warc_filename(name));
        }
        assert!(!is_warc_filename("bundle.zip"));
    }

    /// Both enums default to `Unknown` and expose stable lowercase labels.
    #[test]
    fn exposes_default_and_unknown_labels() {
        assert_eq!(WarcFormat::default(), WarcFormat::Unknown);
        assert_eq!(WarcRecordKind::default(), WarcRecordKind::Unknown);

        assert_eq!(WarcFormat::Arc.as_str(), "arc");
        assert_eq!(WarcRecordKind::Continuation.as_str(), "continuation");
    }
}
164}