use std::path::PathBuf;
use clap::{Parser, Subcommand, ValueEnum};
use regex::Regex;
// Top-level command-line interface, parsed by clap's `Parser` derive.
// Plain `//` comments are used on purpose: clap's derive macros turn `///`
// doc comments into help text, which would change what users see.
#[derive(Parser)]
// `version` and `about` are given without values, so clap supplies them itself
// (per clap's derive documentation, from the crate's Cargo metadata).
// `long_about = None` explicitly leaves the long description unset.
#[command(version, about, long_about = None)]
pub struct Cli {
// The selected subcommand. It is wrapped in `Option`, so the caller has to
// handle the case where no subcommand was given (`None`).
#[command(subcommand)]
pub command: Option<Commands>,
}
// Subcommands of the CLI; clap's `Subcommand` derive turns each variant into
// one subcommand and each field into one argument.
// Fields that carry neither `short` nor `long` are positional arguments, so
// their declaration order is part of the command-line interface.
// Plain `//` comments are used on purpose: `///` would become help text.
#[derive(Subcommand)]
pub enum Commands {
// Takes three positional arguments: a crawl name, a data subset and a
// destination path. What is done with them is decided by the code that
// matches on this variant, which is not visible here.
DownloadPaths {
// Crawl name, shown as CRAWL in usage. `crawl_name_format` validates it and
// returns the upper-cased form, which is what ends up stored here.
#[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
snapshot: String,
// Which `DataType` to work on, shown as SUBSET in usage.
#[arg(value_name = "SUBSET")]
data_type: DataType,
// Destination path, shown as DESTINATION in usage.
#[arg(value_name = "DESTINATION")]
dst: PathBuf,
},
// Takes two positional paths plus several optional flags and options.
Download {
// Path shown as PATHS in usage.
// NOTE(review): presumably a file listing the paths to download — confirm against the consumer.
#[arg(value_name = "PATHS")]
path_file: PathBuf,
// Destination path, shown as DESTINATION in usage.
#[arg(value_name = "DESTINATION")]
dst: PathBuf,
// Boolean switch with clap-derived short and long names.
// NOTE(review): its effect is implemented by the consumer of this enum — not visible here.
#[arg(short, long)]
files_only: bool,
// Boolean switch with clap-derived short and long names.
// NOTE(review): its effect is implemented by the consumer of this enum — not visible here.
#[arg(short, long)]
numbered: bool,
// Thread count; defaults to 10 when the option is not given.
// No lower bound is enforced here, so 0 is accepted by the parser.
#[arg(short, long, default_value = "10", value_name = "NUMBER OF THREADS")]
threads: usize,
// Maximum retries per file (per the value name); defaults to 1000.
#[arg(
short,
long,
default_value = "1000",
value_name = "MAX RETRIES PER FILE"
)]
retries: usize,
// Boolean switch; `action` is given without a value, leaving the choice of
// action to clap.
// NOTE(review): presumably enables progress output — confirm against the consumer.
#[arg(short, long, action)]
progress: bool,
},
}
// The data subsets selectable on the command line (used as the SUBSET argument
// of `Commands::DownloadPaths`). Deriving `ValueEnum` lets clap parse a variant
// directly from an argument string.
// The derived `PartialOrd`/`Ord` follow declaration order, so reordering the
// variants changes comparison results.
// Plain `//` comments are used on purpose: clap's derive macros turn `///`
// doc comments into help text.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum DataType {
Segment,
Warc,
Wat,
Wet,
Robotstxt,
Non200responses,
CcIndex,
CcIndexTable,
}
impl DataType {
pub fn as_str(&self) -> &str {
match self {
DataType::Segment => "segment",
DataType::Warc => "warc",
DataType::Wat => "wat",
DataType::Wet => "wet",
DataType::Robotstxt => "robotstxt",
DataType::Non200responses => "non200responses",
DataType::CcIndex => "cc-index",
DataType::CcIndexTable => "cc-index-table",
}
}
}
/// Value parser for the crawl-name argument (wired in through `value_parser`).
///
/// The input is upper-cased first, so matching is effectively
/// case-insensitive. The upper-cased text must then be `CC-MAIN-` or
/// `CC-NEWS-` followed by four digits, a dash and two digits, with nothing
/// before or after.
///
/// # Errors
///
/// Returns a message naming the two accepted formats when the upper-cased
/// input matches neither pattern.
///
/// # Panics
///
/// Only if one of the hard-coded patterns failed to compile, which would be a
/// programming error rather than a user error.
fn crawl_name_format(crawl: &str) -> Result<String, String> {
    let normalized = crawl.to_uppercase();

    let is_main = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$")
        .unwrap()
        .is_match(&normalized);
    let is_news = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$")
        .unwrap()
        .is_match(&normalized);

    if is_main || is_news {
        // Hand back the normalized (upper-case) spelling, not the raw input.
        Ok(normalized)
    } else {
        Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
    }
}