// biors 0.47.12
//
// Command-line tools for bio-rs biological AI model input workflows.
use crate::cli::package_skeleton_files::{
    copy_asset, file_sha256, fixture_name, planned_write_paths, validate_required_list, write_docs,
    write_pipeline_config, write_tokenizer_config,
};
use crate::errors::CliError;
use crate::output::print_success;
use biors_core::package::{
    ModelArtifact, ModelFormat, PackageDirectoryLayout, PackageFixture, PackageManifest,
    PipelineConfigArtifact, PipelineStep, RuntimeBackend, RuntimeTarget, RuntimeTargetPlatform,
    SchemaVersion,
};
use serde::Serialize;
use std::collections::BTreeSet;
use std::path::PathBuf;

/// Serializable summary of a package-skeleton run.
///
/// Built at the end of `create_package_skeleton` and handed to
/// `print_success` once the manifest has been written.
#[derive(Debug, Serialize)]
struct PackageSkeletonOutput {
    /// Package name, taken from the request.
    package: String,
    /// Output directory, rendered with `Path::display`.
    output_dir: String,
    /// Location of the written `manifest.json`, rendered with `Path::display`.
    manifest_path: String,
    /// Result of `file_sha256` over the manifest file after it was written.
    manifest_sha256: String,
    /// Files recorded by the copy/write helpers, followed by the manifest path.
    created_files: Vec<String>,
    /// Suggested follow-up `biors package validate` / `verify` invocations.
    next_commands: Vec<String>,
    /// Notes returned by `write_tokenizer_config`.
    notes: Vec<String>,
}

/// Inputs for `create_package_skeleton`.
///
/// Several fields are only forwarded to helpers in `package_skeleton_files`
/// (the whole request is passed to `planned_write_paths`,
/// `write_tokenizer_config` and `write_docs`); their exact use is not visible
/// in this file.
pub(crate) struct PackageSkeletonRequest {
    /// Directory that receives the package subdirectories and `manifest.json`.
    pub(crate) output_dir: PathBuf,
    /// Package name written into the manifest and the printed summary.
    pub(crate) name: String,
    /// Source model file; its extension must be `.onnx` or `.safetensors`
    /// (case-insensitive). Copied under `models`.
    pub(crate) model: PathBuf,
    /// Optional tokenizer configuration path.
    // NOTE(review): presumably consumed by `write_tokenizer_config` — confirm.
    pub(crate) tokenizer_config: Option<PathBuf>,
    /// Fixture input file; copied under `fixtures` and used to name the fixture.
    pub(crate) fixture_input: PathBuf,
    /// Expected fixture output file; copied under `fixtures`.
    pub(crate) fixture_output: PathBuf,
    // NOTE(review): `license`, `citation`, `doi` and `model_card_summary` are
    // not read in this file; presumably they feed `write_docs` — confirm.
    pub(crate) license: String,
    pub(crate) citation: String,
    pub(crate) doi: Option<String>,
    pub(crate) model_card_summary: String,
    /// Intended-use entries; checked by `validate_required_list` under the
    /// label `--intended-use`.
    pub(crate) intended_use: Vec<String>,
    /// Limitation entries; checked by `validate_required_list` under the
    /// label `--limitation`.
    pub(crate) limitations: Vec<String>,
    /// When `true`, the check for already-existing target files is skipped.
    pub(crate) force: bool,
}

/// Creates a package skeleton in `request.output_dir` and prints a summary.
///
/// Steps, in order:
/// 1. Validate the intended-use and limitation lists and derive the model
///    format/runtime defaults from the model file extension.
/// 2. Compute the planned write paths, reject duplicates, and (unless
///    `request.force` is set) reject paths that already exist.
/// 3. Create the package subdirectories, copy the model and fixtures, and
///    write the tokenizer config, pipeline config and docs.
/// 4. Build the `PackageManifest`, write it as pretty-printed JSON to
///    `manifest.json`, and report the result through `print_success`.
///
/// # Errors
///
/// Returns `CliError::Write` when a directory or the manifest cannot be
/// written, `CliError::Serialization` when the manifest cannot be serialized,
/// and propagates any error from the validation, copy, write and hashing
/// helpers. Nothing here rolls back files that were already created before a
/// later step failed.
pub(crate) fn create_package_skeleton(request: PackageSkeletonRequest) -> Result<(), CliError> {
    // All validation runs before anything is written to disk.
    validate_required_list("--intended-use", &request.intended_use)?;
    validate_required_list("--limitation", &request.limitations)?;
    let model_runtime = model_runtime_for_path(&request.model)?;

    let write_paths = planned_write_paths(&request)?;
    reject_duplicate_write_paths(&write_paths)?;
    // `--force` only bypasses the existing-file check; duplicate targets are
    // always rejected.
    if !request.force {
        reject_existing_write_paths(&write_paths)?;
    }

    let manifest_path = request.output_dir.join("manifest.json");

    // These names match the `PackageDirectoryLayout` recorded in the manifest below.
    for dir in [
        "models",
        "tokenizers",
        "pipelines",
        "fixtures",
        "observed",
        "docs",
    ] {
        std::fs::create_dir_all(request.output_dir.join(dir)).map_err(CliError::Write)?;
    }

    // Each helper appends to `created_files`, so the call order below is the
    // order in which files are reported. The returned `*_rel` values are
    // joined onto `output_dir` for hashing and stored as manifest paths.
    let mut created_files = Vec::new();
    let model_rel = copy_asset(
        &request.model,
        &request.output_dir,
        "models",
        &mut created_files,
    )?;
    let fixture_input_rel = copy_asset(
        &request.fixture_input,
        &request.output_dir,
        "fixtures",
        &mut created_files,
    )?;
    let fixture_output_rel = copy_asset(
        &request.fixture_output,
        &request.output_dir,
        "fixtures",
        &mut created_files,
    )?;
    let (tokenizer_asset, tokenizer_profile, notes) =
        write_tokenizer_config(&request, &mut created_files)?;
    let pipeline_rel =
        write_pipeline_config(&request.output_dir, &fixture_input_rel, &mut created_files)?;
    let metadata = write_docs(&request, &mut created_files)?;

    // Checksums are computed from the files inside the output directory, not
    // from the original source paths.
    let manifest = PackageManifest {
        schema_version: SchemaVersion::BiorsPackageV1,
        name: request.name.clone(),
        package_layout: Some(PackageDirectoryLayout {
            manifest: "manifest.json".to_string(),
            models: "models".to_string(),
            tokenizers: Some("tokenizers".to_string()),
            vocabs: None,
            pipelines: Some("pipelines".to_string()),
            fixtures: "fixtures".to_string(),
            observed: Some("observed".to_string()),
            docs: "docs".to_string(),
        }),
        metadata: Some(metadata),
        model: ModelArtifact {
            format: model_runtime.model_format,
            path: model_rel.clone(),
            checksum: Some(file_sha256(&request.output_dir.join(&model_rel))?),
            metadata: None,
        },
        tokenizer: Some(tokenizer_asset),
        vocab: None,
        // A single preprocessing step whose contract name and version are
        // derived from the tokenizer profile.
        preprocessing: vec![PipelineStep {
            name: "protein_fasta_tokenize".to_string(),
            implementation: "biors-core".to_string(),
            contract: tokenizer_profile.as_str().to_string(),
            contract_version: Some(format!("{}.v0", tokenizer_profile.as_str())),
            config: Some(PipelineConfigArtifact {
                path: pipeline_rel.clone(),
                schema_version: biors_core::package::PipelineConfigVersion::BiorsPipelineV0,
                checksum: Some(file_sha256(&request.output_dir.join(&pipeline_rel))?),
            }),
        }],
        postprocessing: Vec::new(),
        runtime: RuntimeTarget {
            backend: model_runtime.runtime_backend,
            target: model_runtime.runtime_target,
            // The runtime version string is the backend's formatted name plus ".v0".
            version: Some(format!("{}.v0", model_runtime.runtime_backend)),
        },
        expected_input: None,
        expected_output: None,
        fixtures: vec![PackageFixture {
            name: fixture_name(&request.fixture_input),
            input: fixture_input_rel.clone(),
            expected_output: fixture_output_rel.clone(),
            input_hash: Some(file_sha256(&request.output_dir.join(&fixture_input_rel))?),
            expected_output_hash: Some(file_sha256(&request.output_dir.join(&fixture_output_rel))?),
        }],
    };

    // The manifest is written with a trailing newline; the reported hash is
    // taken from the file on disk so it covers that newline.
    let manifest_json = serde_json::to_string_pretty(&manifest).map_err(CliError::Serialization)?;
    std::fs::write(&manifest_path, format!("{manifest_json}\n")).map_err(CliError::Write)?;
    created_files.push(manifest_path.display().to_string());
    let manifest_sha256 = file_sha256(&manifest_path)?;

    let output = PackageSkeletonOutput {
        package: request.name,
        output_dir: request.output_dir.display().to_string(),
        manifest_path: manifest_path.display().to_string(),
        manifest_sha256,
        created_files,
        next_commands: vec![
            format!("biors package validate {}", manifest_path.display()),
            format!(
                "biors package verify {} <observations.json>",
                manifest_path.display()
            ),
        ],
        notes,
    };
    print_success(None, output)
}

/// Manifest defaults selected from a model file's extension by
/// `model_runtime_for_path`.
struct ModelRuntimeDefaults {
    /// Format recorded in the manifest's `model.format`.
    model_format: ModelFormat,
    /// Backend recorded in `runtime.backend`; also formatted into `runtime.version`.
    runtime_backend: RuntimeBackend,
    /// Platform recorded in `runtime.target`.
    runtime_target: RuntimeTargetPlatform,
}

/// Picks the model format and runtime defaults from the extension of `path`.
///
/// The extension is compared ASCII case-insensitively:
/// * `onnx` selects ONNX with the WebGPU backend on the browser WASM/WebGPU target.
/// * `safetensors` selects Safetensors with the Candle backend on the local CPU target.
///
/// # Errors
///
/// Returns `CliError::Validation` with code
/// `package.init_unsupported_model_format` when the extension is missing, is
/// not valid UTF-8, or is anything else. The error location is the displayed
/// path.
fn model_runtime_for_path(path: &std::path::Path) -> Result<ModelRuntimeDefaults, CliError> {
    // A missing or non-UTF-8 extension becomes "", which matches neither format.
    let suffix = path
        .extension()
        .and_then(|raw| raw.to_str())
        .unwrap_or_default();

    if suffix.eq_ignore_ascii_case("onnx") {
        return Ok(ModelRuntimeDefaults {
            model_format: ModelFormat::Onnx,
            runtime_backend: RuntimeBackend::OnnxWebgpu,
            runtime_target: RuntimeTargetPlatform::BrowserWasmWebgpu,
        });
    }

    if suffix.eq_ignore_ascii_case("safetensors") {
        return Ok(ModelRuntimeDefaults {
            model_format: ModelFormat::Safetensors,
            runtime_backend: RuntimeBackend::Candle,
            runtime_target: RuntimeTargetPlatform::LocalCpu,
        });
    }

    Err(CliError::Validation {
        code: "package.init_unsupported_model_format",
        message: "package init supports .onnx and .safetensors model files".to_string(),
        location: Some(path.display().to_string()),
    })
}

/// Fails when any planned write target already exists on disk.
///
/// # Errors
///
/// Returns `CliError::Validation` with code `package.init_exists`; the error
/// location lists every colliding path, in input order, joined with `", "`.
fn reject_existing_write_paths(write_paths: &[PathBuf]) -> Result<(), CliError> {
    // Render colliding paths straight to their display form; only the text is reported.
    let occupied: Vec<String> = write_paths
        .iter()
        .filter(|candidate| candidate.exists())
        .map(|candidate| candidate.display().to_string())
        .collect();

    if occupied.is_empty() {
        return Ok(());
    }

    Err(CliError::Validation {
        code: "package.init_exists",
        message: "package initialization would overwrite existing files; pass --force to overwrite"
            .to_string(),
        location: Some(occupied.join(", ")),
    })
}

/// Fails when the planned write set names the same target more than once.
///
/// Paths are compared as `PathBuf` values, not through their `display()`
/// text. `display()` is lossy for non-UTF-8 paths, so comparing rendered
/// strings could report two distinct paths as duplicates. Path comparison is
/// component-based, so spellings that differ only by a redundant separator
/// (`a//b` vs `a/b`) are treated as the same target.
///
/// # Errors
///
/// Returns `CliError::Validation` with code `package.init_invalid_path`; the
/// error location lists each duplicated path once, sorted by its displayed
/// text and joined with `", "`.
fn reject_duplicate_write_paths(write_paths: &[PathBuf]) -> Result<(), CliError> {
    let mut seen = BTreeSet::new();
    // Only the duplicates are rendered to text; a sorted set keeps the message deterministic.
    let mut duplicates = BTreeSet::new();
    for path in write_paths {
        if !seen.insert(path) {
            duplicates.insert(path.display().to_string());
        }
    }

    if duplicates.is_empty() {
        return Ok(());
    }

    Err(CliError::Validation {
        code: "package.init_invalid_path",
        message: "package initialization has duplicate target paths".to_string(),
        location: Some(duplicates.into_iter().collect::<Vec<_>>().join(", ")),
    })
}