//! biors 0.37.0
//!
//! Command-line tools for bio-rs biological AI model input workflows.
use serde_json::Value;
use std::process::{Command, Stdio};

mod common;
use common::ChildInputExt;

/// Happy path: a single lowercase FASTA-style record piped to
/// `biors workflow --max-length 6 -` must exit successfully with an empty
/// stderr and print a JSON envelope whose `data` section reports the record as
/// validated, tokenized, padded to six ids and model ready, with provenance
/// metadata attached.
#[test]
fn workflow_outputs_validated_tokenized_model_ready_json_with_provenance() {
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["workflow", "--max-length", "6", "-"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    // `tap_stdin` comes from the shared `common` test module; it is handed the
    // input text and yields the finished process output inspected below.
    let output = command
        .spawn()
        .expect("spawn biors workflow")
        .tap_stdin(">seq1\nacde\n");

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    assert!(output.status.success(), "stderr: {}", stderr_text);
    assert!(output.stderr.is_empty());

    let envelope: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    let data = &envelope["data"];
    let provenance = &data["provenance"];
    let tokenizer = &provenance["tokenizer"];

    // Envelope and workflow identity.
    assert_eq!(envelope["ok"], true);
    assert_eq!(data["workflow"], "protein_model_input.v0");
    assert_eq!(data["model_ready"], true);

    // Provenance must echo the envelope-level input hash and describe how the
    // sequence was normalized and tokenized.
    assert_eq!(provenance["input_hash"], envelope["input_hash"]);
    assert_eq!(
        provenance["normalization"],
        "strip_ascii_whitespace_uppercase"
    );
    assert_eq!(tokenizer["name"], "protein-20");
    assert_eq!(
        tokenizer["unknown_token_policy"],
        "warn_or_error_with_unknown_token"
    );

    // Validation reports the lowercase input as the uppercase sequence.
    let validation = &data["validation"];
    assert_eq!(validation["records"], 1);
    assert_eq!(validation["sequences"][0]["sequence"], "ACDE");

    // Four residues give four tokens; the model input is padded out to the
    // requested length of six.
    let tokenization = &data["tokenization"];
    assert_eq!(tokenization["summary"]["records"], 1);
    assert_eq!(
        tokenization["records"][0]["tokens"],
        serde_json::json!([0, 1, 2, 3])
    );
    assert_eq!(
        data["model_input"]["records"][0]["input_ids"],
        serde_json::json!([0, 1, 2, 3, 0, 0])
    );
    assert_eq!(data["readiness_issues"], Value::Array(Vec::new()));
}

/// A record containing symbols that validation flags (`AX*`) must still make
/// the command exit successfully with an empty stderr. The JSON then marks the
/// run as not model ready, leaves `model_input` null, keeps the validation
/// warning/error counts, and names the offending record in `readiness_issues`.
#[test]
fn workflow_reports_non_model_ready_sequences_without_losing_validation_context() {
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["workflow", "--max-length", "6", "-"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    // `tap_stdin` comes from the shared `common` test module; it is handed the
    // input text and yields the finished process output inspected below.
    let output = command
        .spawn()
        .expect("spawn biors workflow")
        .tap_stdin(">seq1\nAX*\n");

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    assert!(output.status.success(), "stderr: {}", stderr_text);
    assert!(output.stderr.is_empty());

    let envelope: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    let data = &envelope["data"];

    // No model input is produced for a sequence that is not model ready.
    assert_eq!(data["model_ready"], false);
    assert_eq!(data["model_input"], Value::Null);

    // The validation context survives alongside the readiness verdict.
    let validation = &data["validation"];
    assert_eq!(validation["warning_count"], 1);
    assert_eq!(validation["error_count"], 1);

    let first_issue = &data["readiness_issues"][0];
    assert_eq!(first_issue["code"], "sequence.not_model_ready");
    assert_eq!(first_issue["id"], "seq1");
}

/// Runs the same workflow twice on identical input and checks that the
/// provenance section records the invocation (command name plus the argument
/// list the tool reports) and that the reported hashes are `sha256:`-prefixed
/// strings which are equal across both runs.
#[test]
fn workflow_records_invocation_and_reproducibility_hashes() {
    let first = run_workflow_json(">seq1\nacde\n");
    let second = run_workflow_json(">seq1\nacde\n");

    let provenance = &first["data"]["provenance"];
    let invocation = &provenance["invocation"];
    let hashes = &provenance["hashes"];

    // The recorded argument list is longer than what the helper passes on the
    // command line: it also carries `--pad-token-id` and `--padding` entries.
    let expected_arguments = serde_json::json!([
        "--max-length",
        "6",
        "--pad-token-id",
        "0",
        "--padding",
        "fixed_length",
        "-"
    ]);
    assert_eq!(invocation["command"], "biors workflow");
    assert_eq!(invocation["arguments"], expected_arguments);

    for key in ["vocabulary_sha256", "output_data_sha256"] {
        let digest = hashes[key].as_str().expect("workflow hash");
        assert!(
            digest.starts_with("sha256:"),
            "{key} should be a sha256 digest"
        );
    }

    // Identical input must reproduce the identical set of hashes.
    assert_eq!(hashes, &second["data"]["provenance"]["hashes"]);
}

/// Spawns `biors workflow --max-length 6 -`, passes `input` to the child via
/// the `tap_stdin` helper from the `common` test module, and returns the
/// child's stdout parsed as JSON.
///
/// # Panics
///
/// Panics if the process cannot be spawned, exits unsuccessfully, writes
/// anything to stderr, or prints output that is not valid JSON.
fn run_workflow_json(input: &str) -> Value {
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["workflow", "--max-length", "6", "-"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    let output = command
        .spawn()
        .expect("spawn biors workflow")
        .tap_stdin(input);

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    assert!(output.status.success(), "stderr: {}", stderr_text);
    assert!(output.stderr.is_empty());

    serde_json::from_slice::<Value>(&output.stdout).expect("valid JSON output")
}