use std::path::PathBuf;
use clap::{Parser, Subcommand, ValueEnum};
use regex::Regex;
// Top-level command-line interface, parsed with clap's derive API.
//
// NOTE: plain `//` comments are used here on purpose. clap's derive macros
// turn `///` doc comments into help/about text, so adding them would change
// what the program prints for `--help`.
#[derive(Parser)]
// `version` and `about` are given without values, which asks clap to take them
// from the Cargo package metadata. `long_about = None` leaves the long
// description unset.
#[command(version, about, long_about = None)]
pub struct Cli {
// The selected subcommand. Because the field is an `Option`, a subcommand is
// not required: parsing an invocation without one yields `None`.
#[command(subcommand)]
pub command: Option<Commands>,
}
// Subcommands accepted by the CLI.
//
// clap derives the subcommand names from the variant names (kebab-case by
// default, i.e. `download-paths` and `download`). Fields whose `#[arg]` has
// neither `short` nor `long` are positional and are read in declaration order,
// so reordering fields changes the command-line syntax.
//
// NOTE: plain `//` comments are used on purpose; `///` doc comments on these
// items would be picked up by clap as help text and change `--help` output.
#[derive(Subcommand)]
pub enum Commands {
// Takes a crawl name, a data subset and a destination path.
// NOTE(review): presumably fetches the path listing for that crawl/subset into
// DESTINATION — the handler is not in this file, confirm there.
DownloadPaths {
// Positional CRAWL. The raw value is run through `crawl_name_format`, so the
// stored string is the upper-cased name and malformed names are rejected
// during argument parsing.
#[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
snapshot: String,
// Positional SUBSET, parsed into a `DataType` (which derives `ValueEnum`).
#[arg(value_name = "SUBSET")]
data_type: DataType,
// Positional DESTINATION path.
#[arg(value_name = "DESTINATION")]
dst: PathBuf,
},
// Takes a paths file and a destination, plus tuning flags.
// NOTE(review): presumably downloads every entry listed in PATHS into
// DESTINATION — confirm against the handler.
Download {
// Positional PATHS: a path to a file (its expected contents are not visible
// from here).
#[arg(value_name = "PATHS")]
path_file: PathBuf,
// Positional DESTINATION path.
#[arg(value_name = "DESTINATION")]
dst: PathBuf,
// Boolean flag `-f` / `--files-only`; false unless given.
// NOTE(review): exact effect is defined by the handler — confirm.
#[arg(short, long)]
files_only: bool,
// Boolean flag `-n` / `--numbered`; false unless given.
// NOTE(review): exact effect is defined by the handler — confirm.
#[arg(short, long)]
numbered: bool,
// Option `-t` / `--threads`, defaulting to 10 when omitted.
#[arg(short, long, default_value = "10", value_name = "NUMBER OF THREADS")]
threads: usize,
// Option `-r` / `--retries`, defaulting to 1000 when omitted. The value name
// indicates the limit applies per file.
#[arg(
short,
long,
default_value = "1000",
value_name = "MAX RETRIES PER FILE"
)]
retries: usize,
// Boolean flag `-p` / `--progress`. The bare `action` key requests clap's
// default action for the field type, so this behaves like the other flags.
#[arg(short, long, action)]
progress: bool,
// Boolean flag `-s` / `--strict`; false unless given.
// NOTE(review): exact effect is defined by the handler — confirm.
#[arg(short, long)]
strict: bool,
},
}
// The data subsets that can be requested; this is the type of the SUBSET
// argument of `Commands::DownloadPaths`.
//
// `ValueEnum` lets clap parse a variant directly from the command line; the
// accepted spellings are derived by clap from the variant names. `as_str`
// provides the string form used in code.
//
// The derived `PartialOrd`/`Ord` compare variants by declaration order, so
// reordering the variants changes comparison results.
//
// NOTE: plain `//` comments are used on purpose; `///` doc comments on a
// `ValueEnum` would be picked up by clap as help text for the values.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum DataType {
Segment,
Warc,
Wat,
Wet,
Robotstxt,
Non200responses,
CcIndex,
CcIndexTable,
}
impl DataType {
pub fn as_str(&self) -> &str {
match self {
DataType::Segment => "segment",
DataType::Warc => "warc",
DataType::Wat => "wat",
DataType::Wet => "wet",
DataType::Robotstxt => "robotstxt",
DataType::Non200responses => "non200responses",
DataType::CcIndex => "cc-index",
DataType::CcIndexTable => "cc-index-table",
}
}
}
pub fn crawl_name_format(crawl: &str) -> Result<String, String> {
let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
let crawl_ref = crawl.to_uppercase();
if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
} else {
Ok(crawl_ref)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` is accepted and normalized to `expected`.
    #[track_caller]
    fn assert_accepted(input: &str, expected: &str) {
        assert_eq!(crawl_name_format(input), Ok(expected.to_string()));
    }

    /// Asserts that `input` is rejected by the crawl-name validator.
    #[track_caller]
    fn assert_rejected(input: &str) {
        assert!(crawl_name_format(input).is_err());
    }

    /// Asserts that `kind` is rendered as `expected` by `DataType::as_str`.
    #[track_caller]
    fn assert_label(kind: DataType, expected: &str) {
        assert_eq!(kind.as_str(), expected);
    }

    // --- crawl_name_format: accepted inputs ---

    #[test]
    fn valid_cc_main_format() {
        assert_accepted("CC-MAIN-2025-08", "CC-MAIN-2025-08");
    }

    #[test]
    fn valid_cc_news_format() {
        assert_accepted("CC-NEWS-2025-01", "CC-NEWS-2025-01");
    }

    #[test]
    fn case_insensitive_crawl_name() {
        assert_accepted("cc-main-2021-04", "CC-MAIN-2021-04");
    }

    // --- crawl_name_format: rejected inputs ---

    #[test]
    fn invalid_crawl_name_missing_week() {
        assert_rejected("CC-MAIN-2025");
    }

    #[test]
    fn invalid_crawl_name_wrong_prefix() {
        assert_rejected("CC-OTHER-2025-08");
    }

    #[test]
    fn invalid_crawl_name_extra_digits() {
        assert_rejected("CC-MAIN-2025-123");
    }

    #[test]
    fn invalid_crawl_name_empty() {
        assert_rejected("");
    }

    // --- DataType::as_str: one test per variant ---

    #[test]
    fn data_type_segment() {
        assert_label(DataType::Segment, "segment");
    }

    #[test]
    fn data_type_warc() {
        assert_label(DataType::Warc, "warc");
    }

    #[test]
    fn data_type_wat() {
        assert_label(DataType::Wat, "wat");
    }

    #[test]
    fn data_type_wet() {
        assert_label(DataType::Wet, "wet");
    }

    #[test]
    fn data_type_robotstxt() {
        assert_label(DataType::Robotstxt, "robotstxt");
    }

    #[test]
    fn data_type_non200responses() {
        assert_label(DataType::Non200responses, "non200responses");
    }

    #[test]
    fn data_type_cc_index() {
        assert_label(DataType::CcIndex, "cc-index");
    }

    #[test]
    fn data_type_cc_index_table() {
        assert_label(DataType::CcIndexTable, "cc-index-table");
    }
}