// biors 0.37.2
//
// Command-line tools for bio-rs biological AI model input workflows.
use super::package_args::PackageCommand;
use biors_core::{
    model_input::PaddingPolicy,
    sequence::{SequenceKind, SequenceKindSelection},
    tokenizer::ProteinTokenizerProfile,
};
use clap::{Parser, Subcommand};
use clap_complete::Shell;
use std::path::PathBuf;

// Root argument parser for the `biors` binary: one global flag plus a subcommand.
//
// Plain `//` comments are used on purpose. clap's derive macros turn `///` doc
// comments into help text, so doc comments here would alter the `--help` output.
#[derive(Parser, Debug)]
#[command(name = "biors", about = "Rust/WASM tools for biological AI models.", version)]
pub struct Cli {
    // Declared `global`, so clap also accepts `--json` after the subcommand name.
    #[arg(global = true, long, help = "Emit machine-readable JSON errors")]
    pub json: bool,

    // The selected subcommand; the available choices are the variants of `Command`.
    #[command(subcommand)]
    pub command: Command,
}

// Top-level subcommands of the `biors` CLI, parsed through clap's `Subcommand` derive.
//
// Maintenance notes:
// - Plain `//` comments are used deliberately: clap's derive turns `///` doc
//   comments into help text, so adding them would change the `--help` output.
// - Fields without an `#[arg(long)]` attribute are positional arguments, and
//   their declaration order is the order expected on the command line
//   (e.g. `Diff` takes `expected` before `observed`).
#[derive(Debug, Subcommand)]
pub enum Command {
    // Nested subcommand group; its actions are the variants of `BatchCommand`.
    Batch {
        #[command(subcommand)]
        command: BatchCommand,
    },
    // Nested subcommand group; its actions are the variants of `CacheCommand`.
    Cache {
        #[command(subcommand)]
        command: CacheCommand,
    },
    // Takes one positional `clap_complete::Shell`, parsed as a value enum.
    // Presumably selects the shell a completion script is generated for —
    // confirm in the command handler.
    Completions {
        #[arg(value_enum)]
        shell: Shell,
    },
    // Nested subcommand group; its actions are the variants of `DatasetCommand`.
    Dataset {
        #[command(subcommand)]
        command: DatasetCommand,
    },
    // Requires `--max-length`; `path` is positional.
    Debug {
        #[arg(long)]
        max_length: usize,
        path: PathBuf,
    },
    // Two positional paths: `expected` first, then `observed`.
    Diff {
        expected: PathBuf,
        observed: PathBuf,
    },
    // Takes no arguments.
    Doctor,
    // Nested subcommand group; its actions are the variants of `FastaCommand`.
    Fasta {
        #[command(subcommand)]
        command: FastaCommand,
    },
    // Single positional `path`.
    Inspect {
        path: PathBuf,
    },
    // Requires `--max-length`; `--pad-token-id` defaults to 0 and `--padding`
    // defaults to `PaddingArg::FixedLength`; `path` is positional.
    // NOTE(review): `u8` restricts `--pad-token-id` to 0..=255 — confirm this
    // matches the token id type used by `biors_core`.
    ModelInput {
        #[arg(long)]
        max_length: usize,
        #[arg(long, default_value_t = 0)]
        pad_token_id: u8,
        #[arg(long, default_value_t = PaddingArg::FixedLength, value_enum)]
        padding: PaddingArg,
        path: PathBuf,
    },
    // Nested subcommand group; `PackageCommand` is imported from
    // `super::package_args`.
    Package {
        #[command(subcommand)]
        command: PackageCommand,
    },
    // Every argument is optional at the parser level, including `--max-length`
    // and the positional `path` (both mandatory for `ModelInput`/`Workflow`).
    // NOTE(review): presumably missing values are supplied by `--config` or
    // `--package` and validated by the handler — confirm.
    Pipeline {
        #[arg(long)]
        config: Option<PathBuf>,
        #[arg(long)]
        dry_run: bool,
        #[arg(long)]
        explain_plan: bool,
        #[arg(long)]
        package: Option<PathBuf>,
        #[arg(long)]
        write_lock: Option<PathBuf>,
        #[arg(long)]
        max_length: Option<usize>,
        #[arg(long, default_value_t = 0)]
        pad_token_id: u8,
        #[arg(long, default_value_t = PaddingArg::FixedLength, value_enum)]
        padding: PaddingArg,
        path: Option<PathBuf>,
    },
    // Nested subcommand group; its actions are the variants of `SeqCommand`.
    Seq {
        #[command(subcommand)]
        command: SeqCommand,
    },
    // `--profile` defaults to `TokenizerProfileArg::Protein20`; `--config` is
    // optional; `path` is positional.
    // NOTE(review): how `--profile` and `--config` interact is decided by the
    // handler, not by this definition — confirm there.
    Tokenize {
        #[arg(long, value_enum, default_value_t = TokenizerProfileArg::Protein20)]
        profile: TokenizerProfileArg,
        #[arg(long)]
        config: Option<PathBuf>,
        path: PathBuf,
    },
    // Nested subcommand group; its actions are the variants of `TokenizerCommand`.
    Tokenizer {
        #[command(subcommand)]
        command: TokenizerCommand,
    },
    // Same argument shape as `ModelInput`: required `--max-length`,
    // `--pad-token-id` (default 0), `--padding` (default `FixedLength`) and a
    // positional `path`.
    Workflow {
        #[arg(long)]
        max_length: usize,
        #[arg(long, default_value_t = 0)]
        pad_token_id: u8,
        #[arg(long, default_value_t = PaddingArg::FixedLength, value_enum)]
        padding: PaddingArg,
        path: PathBuf,
    },
}

// Actions nested under `Command::Tokenizer`.
// (`//` rather than `///`: clap's derive would surface doc comments as help text.)
#[derive(Subcommand, Debug)]
pub enum TokenizerCommand {
    // Positional input `path` plus an optional `--output` path.
    ConvertHf {
        path: PathBuf,
        #[arg(long)]
        output: Option<PathBuf>,
    },
    // `--profile` falls back to `TokenizerProfileArg::Protein20`; `--config` may be omitted.
    Inspect {
        #[arg(long, default_value_t = TokenizerProfileArg::Protein20, value_enum)]
        profile: TokenizerProfileArg,
        #[arg(long)]
        config: Option<PathBuf>,
    },
}

// Actions nested under `Command::Batch`.
// (`//` rather than `///`: clap's derive would surface doc comments as help text.)
#[derive(Subcommand, Debug)]
pub enum BatchCommand {
    // `--kind` falls back to `KindArg::Auto`; at least one positional input
    // path must be supplied (`required = true`).
    Validate {
        #[arg(long, value_enum, default_value_t = KindArg::Auto)]
        kind: KindArg,
        #[arg(required = true)]
        inputs: Vec<PathBuf>,
    },
}

// Actions nested under `Command::Cache`.
// (`//` rather than `///`: clap's derive would surface doc comments as help text.)
// The long flag names are spelled out; they equal the names clap would derive
// from the field names.
#[derive(Subcommand, Debug)]
pub enum CacheCommand {
    // Optional `--root` path plus the `--dry-run` and `--yes` switches.
    Clean {
        #[arg(long = "root")]
        root: Option<PathBuf>,
        #[arg(long = "dry-run")]
        dry_run: bool,
        #[arg(long = "yes")]
        yes: bool,
    },
    // Optional `--root` path only.
    Inspect {
        #[arg(long = "root")]
        root: Option<PathBuf>,
    },
}

// Actions nested under `Command::Fasta`.
// (`//` rather than `///`: clap's derive would surface doc comments as help text.)
#[derive(Subcommand, Debug)]
pub enum FastaCommand {
    // `--kind` falls back to `KindArg::Protein` here (the batch and seq
    // validators in this file fall back to `KindArg::Auto`); `path` is positional.
    Validate {
        #[arg(long, value_enum, default_value_t = KindArg::Protein)]
        kind: KindArg,
        path: PathBuf,
    },
}

// Actions nested under `Command::Dataset`.
// (`//` rather than `///`: clap's derive would surface doc comments as help text.)
#[derive(Subcommand, Debug)]
pub enum DatasetCommand {
    // `--source`, `--version` and `--split` are free-form strings with
    // placeholder defaults; `--metadata` values are collected into a `Vec`;
    // at least one positional input path must be supplied (`required = true`).
    Inspect {
        #[arg(default_value = "local", long)]
        source: String,
        #[arg(default_value = "unversioned", long)]
        version: String,
        #[arg(default_value = "unspecified", long)]
        split: String,
        // Bare `long` derives the flag name `--metadata` from the field name.
        #[arg(long)]
        metadata: Vec<String>,
        #[arg(required = true)]
        inputs: Vec<PathBuf>,
    },
}

// Actions nested under `Command::Seq`.
// (`//` rather than `///`: clap's derive would surface doc comments as help text.)
#[derive(Subcommand, Debug)]
pub enum SeqCommand {
    // `--kind` falls back to `KindArg::Auto`; `path` is positional.
    Validate {
        #[arg(long, value_enum, default_value_t = KindArg::Auto)]
        kind: KindArg,
        path: PathBuf,
    },
}

// Padding choice accepted by the `--padding` flag; converted into the core
// `PaddingPolicy` by the `From` impl in this file.
// Only `//` comments are used: clap's `ValueEnum` derive would expose `///`
// doc comments as help text for the possible values.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, clap::ValueEnum)]
pub enum PaddingArg {
    // Returned by `Default::default()`; also the `default_value_t` the commands use.
    #[default]
    FixedLength,
    NoPadding,
}

impl From<PaddingArg> for PaddingPolicy {
    fn from(value: PaddingArg) -> Self {
        match value {
            PaddingArg::FixedLength => Self::FixedLength,
            PaddingArg::NoPadding => Self::NoPadding,
        }
    }
}

// Built-in tokenizer profiles accepted by the `--profile` flag; converted into
// the core `ProteinTokenizerProfile` by the `From` impl in this file.
//
// `PartialEq`/`Eq` are derived to match `PaddingArg`, so parsed values can be
// compared directly (for example in tests).
//
// Only `//` comments are used: clap's `ValueEnum` derive would expose `///`
// doc comments as help text for the possible values.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum TokenizerProfileArg {
    // Returned by `Default::default()`; spelled `protein-20` on the command line.
    #[default]
    #[value(name = "protein-20")]
    Protein20,
    // Spelled `protein-20-special` on the command line.
    #[value(name = "protein-20-special")]
    Protein20Special,
}

impl From<TokenizerProfileArg> for ProteinTokenizerProfile {
    fn from(value: TokenizerProfileArg) -> Self {
        match value {
            TokenizerProfileArg::Protein20 => Self::Protein20,
            TokenizerProfileArg::Protein20Special => Self::Protein20Special,
        }
    }
}

// Sequence kind accepted by the `--kind` flag; converted into the core
// `SequenceKindSelection` by the `From` impl in this file.
//
// `PartialEq`/`Eq` are derived to match `PaddingArg`, so parsed values can be
// compared directly (for example in tests).
//
// Only `//` comments are used: clap's `ValueEnum` derive would expose `///`
// doc comments as help text for the possible values.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum KindArg {
    // Becomes `SequenceKindSelection::Auto`; the other variants become an
    // explicit `SequenceKind`.
    Auto,
    // Returned by `Default::default()`. Each command still sets its own
    // `default_value_t` (`Auto` for batch/seq validate, `Protein` for fasta validate).
    #[default]
    Protein,
    Dna,
    Rna,
}

impl From<KindArg> for SequenceKindSelection {
    fn from(value: KindArg) -> Self {
        match value {
            KindArg::Auto => Self::Auto,
            KindArg::Protein => Self::Explicit(SequenceKind::Protein),
            KindArg::Dna => Self::Explicit(SequenceKind::Dna),
            KindArg::Rna => Self::Explicit(SequenceKind::Rna),
        }
    }
}