// biors 0.37.2
//
// Command-line tools for bio-rs biological AI model input workflows.
use super::pipeline_config::ResolvedPipelineConfig;
use crate::errors::CliError;
use biors_core::{hash::sha256_digest, package::PackageManifest, workflow::SequenceWorkflowOutput};
use serde::Serialize;
use std::path::{Path, PathBuf};

/// Package input for lock generation: a manifest plus the path recorded for it.
///
/// Passed (optionally) to `write_pipeline_lock`; `package_lock_section` turns
/// it into the `package` section of the lockfile.
pub(crate) struct PipelineLockPackage {
    /// Path written verbatim (via `Path::display`) into the lock's `manifest_path`.
    pub(crate) manifest_path: PathBuf,
    /// Manifest whose name, schema version, model checksum and runtime
    /// fields are copied into the lock.
    pub(crate) manifest: PackageManifest,
}

/// Top-level document serialized into the pipeline lockfile.
///
/// Built by `build_pipeline_lock` and written as pretty-printed JSON by
/// `write_pipeline_lock`. The derived `Serialize` impl emits keys in field
/// declaration order, so reordering fields changes the file on disk.
#[derive(Debug, Serialize)]
struct PipelineLockfile {
    /// Lock format identifier; the builder sets it to `"biors.pipeline.lock.v0"`.
    schema_version: &'static str,
    /// Versions of the tooling that produced the lock.
    generated_by: PipelineLockGenerator,
    /// Identity and content hash of the pipeline config file.
    pipeline_config: PipelineLockConfig,
    /// Package details; `None` when no package was supplied to the builder.
    package: Option<PipelineLockPackageSection>,
    /// Invocation and input details copied from the workflow output.
    execution: PipelineLockExecution,
    /// Hashes copied from the workflow provenance.
    hashes: PipelineLockHashes,
    /// Constant description of the Python baseline entry.
    python_baseline: PipelineLockPythonBaseline,
}

/// `generated_by` section of the lockfile: which tool versions produced it.
#[derive(Debug, Serialize)]
struct PipelineLockGenerator {
    /// Version of this crate, taken at compile time from `CARGO_PKG_VERSION`.
    biors_version: &'static str,
    /// Copied from the workflow provenance's `biors_core_version`.
    biors_core_version: String,
}

/// `pipeline_config` section of the lockfile: identifies the config file used.
#[derive(Debug, Serialize)]
struct PipelineLockConfig {
    /// String form of the resolved config's `schema_version`.
    schema_version: String,
    /// The resolved config's `name`.
    name: String,
    /// Config path as passed to the builder, rendered with `Path::display`.
    path: String,
    /// Digest of the config file's bytes as returned by `file_sha256`.
    sha256: String,
}

/// `package` section of the lockfile, produced by `package_lock_section`.
#[derive(Debug, Serialize)]
struct PipelineLockPackageSection {
    /// The manifest's `name`.
    name: String,
    /// String form of the manifest's `schema_version`.
    schema_version: String,
    /// Manifest path rendered with `Path::display`.
    manifest_path: String,
    /// The manifest's `model.checksum`; lock generation fails when it is absent.
    model_sha256: String,
    /// String form of the manifest's `runtime.backend`.
    runtime_backend: String,
    /// String form of the manifest's `runtime.target`.
    runtime_target: String,
    /// The manifest's `runtime.version`, or `"<backend>.v0"` when it is unset.
    backend_version: String,
}

/// `execution` section of the lockfile: how the workflow was invoked.
#[derive(Debug, Serialize)]
struct PipelineLockExecution {
    /// Copied from the workflow provenance's `invocation.command`.
    command: String,
    /// Copied from the workflow provenance's `invocation.arguments`.
    arguments: Vec<String>,
    /// The resolved config's `input_path`, rendered with `Path::display`.
    input_path: String,
    /// Copied from the workflow provenance's `input_hash`.
    input_hash: String,
    /// Copied from the workflow output's `model_ready` flag.
    ready: bool,
}

/// `hashes` section of the lockfile; both values are copied unchanged from
/// the workflow provenance's `hashes`.
#[derive(Debug, Serialize)]
struct PipelineLockHashes {
    /// Copied from `provenance.hashes.vocabulary_sha256`.
    vocabulary_sha256: String,
    /// Copied from `provenance.hashes.output_data_sha256`.
    output_data_sha256: String,
}

/// `python_baseline` section of the lockfile.
///
/// All three values are string constants filled in by `build_pipeline_lock`;
/// nothing in this file computes or verifies them.
#[derive(Debug, Serialize)]
struct PipelineLockPythonBaseline {
    /// Constant label naming what is compared.
    comparison: &'static str,
    /// Constant path string naming the reference file.
    reference: &'static str,
    /// Constant status label.
    status: &'static str,
}

/// Builds the pipeline lock and writes it to `lock_path` as pretty-printed
/// JSON followed by a single trailing newline.
///
/// The lock is fully built and serialized before the filesystem is touched.
/// Missing parent directories of `lock_path` are created first.
///
/// # Errors
///
/// * Any error from `build_pipeline_lock` is returned unchanged.
/// * `CliError::Serialization` if the lock cannot be encoded as JSON.
/// * `CliError::Write` if creating the parent directory or writing the file fails.
pub(crate) fn write_pipeline_lock(
    lock_path: &Path,
    config_path: &Path,
    resolved: &ResolvedPipelineConfig,
    workflow: &SequenceWorkflowOutput,
    package: Option<&PipelineLockPackage>,
) -> Result<(), CliError> {
    let lockfile = build_pipeline_lock(config_path, resolved, workflow, package)?;

    let mut contents =
        serde_json::to_string_pretty(&lockfile).map_err(CliError::Serialization)?;
    contents.push('\n');

    // A bare file name has an empty parent; there is nothing to create then.
    if let Some(dir) = lock_path.parent() {
        if !dir.as_os_str().is_empty() {
            std::fs::create_dir_all(dir).map_err(CliError::Write)?;
        }
    }

    std::fs::write(lock_path, contents).map_err(CliError::Write)
}

fn build_pipeline_lock(
    config_path: &Path,
    resolved: &ResolvedPipelineConfig,
    workflow: &SequenceWorkflowOutput,
    package: Option<&PipelineLockPackage>,
) -> Result<PipelineLockfile, CliError> {
    Ok(PipelineLockfile {
        schema_version: "biors.pipeline.lock.v0",
        generated_by: PipelineLockGenerator {
            biors_version: env!("CARGO_PKG_VERSION"),
            biors_core_version: workflow.provenance.biors_core_version.clone(),
        },
        pipeline_config: PipelineLockConfig {
            schema_version: resolved.config.schema_version.to_string(),
            name: resolved.config.name.clone(),
            path: config_path.display().to_string(),
            sha256: file_sha256(config_path)?,
        },
        package: package.map(package_lock_section).transpose()?,
        execution: PipelineLockExecution {
            command: workflow.provenance.invocation.command.clone(),
            arguments: workflow.provenance.invocation.arguments.clone(),
            input_path: resolved.input_path.display().to_string(),
            input_hash: workflow.provenance.input_hash.clone(),
            ready: workflow.model_ready,
        },
        hashes: PipelineLockHashes {
            vocabulary_sha256: workflow.provenance.hashes.vocabulary_sha256.clone(),
            output_data_sha256: workflow.provenance.hashes.output_data_sha256.clone(),
        },
        python_baseline: PipelineLockPythonBaseline {
            comparison: "normalized_records_and_protein20_tokens",
            reference: "examples/model-input-contract/reference-python-parity.json",
            status: "strategy_recorded",
        },
    })
}

/// Converts a package into the `package` section of the lockfile.
///
/// The backend version is the manifest's `runtime.version` when present,
/// otherwise the backend name with `.v0` appended.
///
/// # Errors
///
/// Returns `CliError::Validation` with code
/// `pipeline.lock_requires_model_checksum` when the manifest has no
/// `model.checksum`.
fn package_lock_section(
    package: &PipelineLockPackage,
) -> Result<PipelineLockPackageSection, CliError> {
    let manifest = &package.manifest;

    let model_sha256 = match manifest.model.checksum.clone() {
        Some(checksum) => checksum,
        None => {
            return Err(CliError::Validation {
                code: "pipeline.lock_requires_model_checksum",
                message: "package model.checksum is required for pipeline.lock".to_string(),
                location: Some("package.model.checksum".to_string()),
            });
        }
    };

    let backend_version = match manifest.runtime.version.clone() {
        Some(version) => version,
        None => format!("{}.v0", manifest.runtime.backend),
    };

    Ok(PipelineLockPackageSection {
        name: manifest.name.clone(),
        schema_version: manifest.schema_version.to_string(),
        manifest_path: package.manifest_path.display().to_string(),
        model_sha256,
        runtime_backend: manifest.runtime.backend.to_string(),
        runtime_target: manifest.runtime.target.to_string(),
        backend_version,
    })
}

/// Reads the whole file at `path` into memory and returns the result of
/// `sha256_digest` over its bytes.
///
/// # Errors
///
/// Returns `CliError::Read` carrying `path` and the underlying I/O error when
/// the file cannot be read.
fn file_sha256(path: &Path) -> Result<String, CliError> {
    match std::fs::read(path) {
        Ok(contents) => Ok(sha256_digest(&contents)),
        Err(source) => Err(CliError::Read {
            path: path.to_path_buf(),
            source,
        }),
    }
}