1#![forbid(unsafe_code)]
2#![doc = include_str!("../README.md")]
3
4use core::fmt;
7
/// Extension, without a leading dot, of a plain WARC file.
pub const WARC_EXTENSION: &str = "warc";
/// Extension, without a leading dot, of a gzip-compressed WARC file.
pub const WARC_GZIP_EXTENSION: &str = "warc.gz";
/// Extension, without a leading dot, of a plain ARC file.
pub const ARC_EXTENSION: &str = "arc";
/// Extension, without a leading dot, of a gzip-compressed ARC file.
pub const ARC_GZIP_EXTENSION: &str = "arc.gz";
/// All recognised extensions: WARC first, then ARC, each followed by its
/// gzip-compressed form.
pub const WARC_EXTENSIONS: &[&str] = &[
    WARC_EXTENSION,
    WARC_GZIP_EXTENSION,
    ARC_EXTENSION,
    ARC_GZIP_EXTENSION,
];
18
/// Container format of a web archive file.
///
/// The derived ordering follows declaration order:
/// `Warc < Arc < Unknown`.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum WarcFormat {
    /// WARC container; labelled `"warc"`.
    Warc,
    /// ARC container; labelled `"arc"`.
    Arc,
    /// Format not determined; this is the [`Default`] value and is
    /// labelled `"unknown"`.
    #[default]
    Unknown,
}

impl WarcFormat {
    /// Returns the fixed lowercase label of this format.
    ///
    /// The same text is produced by the [`fmt::Display`] implementation.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Warc => "warc",
            Self::Arc => "arc",
            Self::Unknown => "unknown",
        }
    }
}

impl fmt::Display for WarcFormat {
    /// Writes the label returned by [`WarcFormat::as_str`], honouring any
    /// width, fill, alignment, and precision given in the format spec.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `pad` applies the formatter's flags; `write_str` would ignore
        // them, so `{:>8}` and friends would have no effect.
        formatter.pad(self.as_str())
    }
}
48
/// Kind of a record inside a web archive.
///
/// Each variant has a fixed lowercase label, available through
/// [`WarcRecordKind::as_str`]. NOTE(review): the labels look like the
/// `WARC-Type` header values of the WARC specification; confirm against the
/// parser that produces these values.
///
/// The derived ordering follows declaration order.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum WarcRecordKind {
    /// Labelled `"warcinfo"`.
    WarcInfo,
    /// Labelled `"response"`.
    Response,
    /// Labelled `"request"`.
    Request,
    /// Labelled `"metadata"`.
    Metadata,
    /// Labelled `"revisit"`.
    Revisit,
    /// Labelled `"conversion"`.
    Conversion,
    /// Labelled `"continuation"`.
    Continuation,
    /// Labelled `"resource"`.
    Resource,
    /// Kind not determined; this is the [`Default`] value and is labelled
    /// `"unknown"`.
    #[default]
    Unknown,
}

impl WarcRecordKind {
    /// Returns the fixed lowercase label of this record kind.
    ///
    /// The same text is produced by the [`fmt::Display`] implementation.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::WarcInfo => "warcinfo",
            Self::Response => "response",
            Self::Request => "request",
            Self::Metadata => "metadata",
            Self::Revisit => "revisit",
            Self::Conversion => "conversion",
            Self::Continuation => "continuation",
            Self::Resource => "resource",
            Self::Unknown => "unknown",
        }
    }
}

impl fmt::Display for WarcRecordKind {
    /// Writes the label returned by [`WarcRecordKind::as_str`], honouring
    /// any width, fill, alignment, and precision given in the format spec.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `pad` applies the formatter's flags; `write_str` would ignore
        // them, so column-aligned output would come out ragged.
        formatter.pad(self.as_str())
    }
}
96
97#[must_use]
99pub fn is_warc_extension(extension: &str) -> bool {
100 matches!(
101 normalize_extension(extension).as_str(),
102 "warc" | "warc.gz" | "arc" | "arc.gz"
103 )
104}
105
106#[must_use]
108pub fn is_warc_filename(name: &str) -> bool {
109 let parts = filename_parts(name);
110
111 match parts.as_slice() {
112 [.., last] if matches!(last.as_str(), "warc" | "arc") => true,
113 [.., previous, last] if matches!(previous.as_str(), "warc" | "arc") && last == "gz" => true,
114 _ => false,
115 }
116}
117
/// Canonicalises an extension for comparison.
///
/// Surrounding whitespace is trimmed first, then every leading `.` is
/// removed, and the remainder is ASCII-lowercased. Whitespace that follows
/// the leading dots is kept.
fn normalize_extension(extension: &str) -> String {
    let without_dots = extension.trim().trim_start_matches('.');
    without_dots.to_ascii_lowercase()
}
124
/// Splits the final path component of `name` into lowercase, dot-separated
/// segments.
///
/// Surrounding whitespace is trimmed, everything up to and including the
/// last `/` or `\` is discarded, and the remainder is split on `.`. Empty
/// segments are dropped and the rest are ASCII-lowercased.
fn filename_parts(name: &str) -> Vec<String> {
    let basename = name.trim().rsplit(['/', '\\']).next().unwrap_or_default();

    let mut parts = Vec::new();
    for segment in basename.split('.') {
        // Leading, trailing, or doubled dots only yield empty segments, so
        // skipping them also covers hidden-file style names like `.warc`.
        if segment.is_empty() {
            continue;
        }
        parts.push(segment.to_ascii_lowercase());
    }
    parts
}
137
#[cfg(test)]
mod tests {
    use super::{WARC_EXTENSIONS, WarcFormat, WarcRecordKind, is_warc_extension, is_warc_filename};

    // Extensions are accepted with or without a leading dot, in any ASCII
    // case, and with surrounding whitespace; anything else is rejected.
    #[test]
    fn detects_warc_extensions() {
        assert!(is_warc_extension(".warc"));
        assert!(is_warc_extension("warc.gz"));
        assert!(is_warc_extension("arc.gz"));
        assert!(is_warc_extension("  .WARC.GZ "));
        assert_eq!(WARC_EXTENSIONS[0], "warc");

        // Every advertised extension must be recognised by the predicate.
        for extension in WARC_EXTENSIONS {
            assert!(is_warc_extension(extension));
        }

        assert!(!is_warc_extension("zip"));
        assert!(!is_warc_extension("gz"));
        assert!(!is_warc_extension(""));
    }

    // File names are matched on their trailing components only, after any
    // directory prefix (with `/` or `\` separators) is removed.
    #[test]
    fn detects_warc_filenames() {
        assert!(is_warc_filename("crawl.warc"));
        assert!(is_warc_filename("crawl.ARC.GZ"));
        assert!(is_warc_filename("/data/crawls/site.warc.gz"));
        assert!(is_warc_filename("C:\\archive\\SITE.ARC"));

        assert!(!is_warc_filename("bundle.zip"));
        assert!(!is_warc_filename("notes.gz"));
        assert!(!is_warc_filename("crawl.warc.tmp"));
        assert!(!is_warc_filename(""));
    }

    #[test]
    fn exposes_default_and_unknown_labels() {
        assert_eq!(WarcFormat::default(), WarcFormat::Unknown);
        assert_eq!(WarcFormat::Arc.as_str(), "arc");
        assert_eq!(WarcRecordKind::default(), WarcRecordKind::Unknown);
        assert_eq!(WarcRecordKind::Continuation.as_str(), "continuation");
    }

    // `Display` must print exactly the label returned by `as_str`.
    #[test]
    fn display_matches_labels() {
        assert_eq!(WarcFormat::Warc.to_string(), "warc");
        assert_eq!(WarcFormat::Unknown.to_string(), "unknown");
        assert_eq!(WarcRecordKind::WarcInfo.to_string(), "warcinfo");
        assert_eq!(WarcRecordKind::Unknown.to_string(), "unknown");
    }
}