biors 0.47.9

Command-line tools for bio-rs biological AI model input workflows.
use serde_json::Value;
use std::process::{Command, Stdio};

mod common;
use common::ChildInputExt;

/// Happy-path check of `biors workflow --max-length 6 -` for a single
/// lowercase record (`>seq1` / `acde`).
///
/// Pins the workflow identifier, the readiness flag, the provenance block
/// (normalization, tokenizer name and unknown-token policy), the validated
/// sequence text, the token ids, and the `input_ids` padded to six entries.
#[test]
fn workflow_outputs_validated_tokenized_model_ready_json_with_provenance() {
    // The shared helper spawns the binary with the same arguments, asserts a
    // successful exit with empty stderr, and parses stdout as JSON.
    let value = run_workflow_json(">seq1\nacde\n");
    let data = &value["data"];
    let provenance = &data["provenance"];

    assert_eq!(value["ok"], true);
    assert_eq!(data["workflow"], "protein_model_input.v0");
    assert_eq!(data["model_ready"], true);

    // NOTE(review): both sides are JSON lookups, so this comparison would also
    // hold if both keys were absent (null == null) — confirm that is acceptable.
    assert_eq!(provenance["input_hash"], value["input_hash"]);
    assert_eq!(
        provenance["normalization"],
        "strip_ascii_whitespace_uppercase"
    );
    assert_eq!(provenance["tokenizer"]["name"], "protein-20");
    assert_eq!(
        provenance["tokenizer"]["unknown_token_policy"],
        "warn_or_error_with_unknown_token"
    );

    // The lowercase input is expected back uppercased in the validation section.
    assert_eq!(data["validation"]["records"], 1);
    assert_eq!(data["validation"]["sequences"][0]["sequence"], "ACDE");

    assert_eq!(data["tokenization"]["summary"]["records"], 1);
    assert_eq!(
        data["tokenization"]["records"][0]["tokens"],
        serde_json::json!([0, 1, 2, 3])
    );
    // Four tokens followed by two trailing zeros to reach the requested length of 6.
    assert_eq!(
        data["model_input"]["records"][0]["input_ids"],
        serde_json::json!([0, 1, 2, 3, 0, 0])
    );
    assert_eq!(data["readiness_issues"], serde_json::json!([]));
}

/// Checks that an input the workflow cannot make model-ready (`>seq1` / `AX*`)
/// still exits successfully with empty stderr, and that the JSON carries the
/// validation counts alongside the readiness issue.
///
/// Expected output: `model_ready` is false, `model_input` is null, validation
/// reports one warning and one error, and the first readiness issue names
/// `seq1` with code `sequence.not_model_ready`.
#[test]
fn workflow_reports_non_model_ready_sequences_without_losing_validation_context() {
    // The shared helper spawns the binary with the same arguments, asserts a
    // successful exit with empty stderr, and parses stdout as JSON.
    let value = run_workflow_json(">seq1\nAX*\n");
    let data = &value["data"];

    assert_eq!(data["model_ready"], false);
    assert_eq!(data["model_input"], Value::Null);

    assert_eq!(data["validation"]["warning_count"], 1);
    assert_eq!(data["validation"]["error_count"], 1);

    let first_issue = &data["readiness_issues"][0];
    assert_eq!(first_issue["code"], "sequence.not_model_ready");
    assert_eq!(first_issue["id"], "seq1");
}

/// Verifies that the provenance block records the invocation and that the
/// recorded hashes are reproducible.
///
/// The test runs the workflow twice on the same input, checks the recorded
/// command and argument list, replays the recorded arguments through the CLI,
/// and finally compares the `hashes` objects of the two original runs.
#[test]
fn workflow_records_invocation_and_reproducibility_hashes() {
    const INPUT: &str = ">seq1\nacde\n";

    let baseline = run_workflow_json(INPUT);
    let repeat = run_workflow_json(INPUT);

    let invocation = &baseline["data"]["provenance"]["invocation"];
    assert_eq!(invocation["command"], "biors workflow");

    // The helper only passes `--max-length 6 -`; the remaining entries are
    // what the binary itself reports in the recorded argument list.
    let expected_arguments = serde_json::json!([
        "--max-length",
        "6",
        "--profile",
        "protein-20",
        "--pad-token-id",
        "0",
        "--padding",
        "fixed-length",
        "-"
    ]);
    assert_eq!(invocation["arguments"], expected_arguments);

    // Feeding the recorded arguments back to the CLI must succeed as well.
    let recorded_arguments = invocation["arguments"]
        .as_array()
        .expect("invocation arguments");
    let replayed = run_workflow_json_with_args(recorded_arguments, INPUT);
    assert_eq!(replayed["ok"], true);
    assert_eq!(replayed["data"]["model_ready"], true);

    let hashes = &baseline["data"]["provenance"]["hashes"];
    for key in ["vocabulary_sha256", "output_data_sha256"] {
        let digest = hashes[key].as_str().expect("workflow hash");
        assert!(
            digest.starts_with("sha256:"),
            "{key} should be a sha256 digest"
        );
    }

    // Same input, same arguments: the whole `hashes` object must match.
    assert_eq!(hashes, &repeat["data"]["provenance"]["hashes"]);
}

/// Runs the workflow with `--profile dna-iupac` on a four-base record and
/// checks that the profile name appears in the provenance, tokenizer and
/// validation sections, and that token ids and padded `input_ids` are emitted.
#[test]
fn workflow_accepts_nucleotide_profiles() {
    // Bind the JSON array so its elements can be borrowed directly; cloning
    // the vector just to take a reference to the copy is unnecessary.
    let arguments = serde_json::json!(["--max-length", "6", "--profile", "dna-iupac", "-"]);
    let value = run_workflow_json_with_args(
        arguments.as_array().expect("arguments"),
        ">dna\nACGT\n",
    );
    let data = &value["data"];

    assert_eq!(value["ok"], true);
    assert_eq!(data["workflow"], "sequence_model_input.v0");
    assert_eq!(data["model_ready"], true);
    assert_eq!(data["provenance"]["validation_alphabet"], "dna-iupac");
    assert_eq!(data["provenance"]["tokenizer"]["name"], "dna-iupac");
    assert_eq!(data["validation"]["sequences"][0]["alphabet"], "dna-iupac");
    assert_eq!(
        data["tokenization"]["records"][0]["tokens"],
        serde_json::json!([0, 1, 2, 3])
    );
    // Four tokens followed by two trailing zeros to reach the requested length of 6.
    assert_eq!(
        data["model_input"]["records"][0]["input_ids"],
        serde_json::json!([0, 1, 2, 3, 0, 0])
    );
}

/// Runs `biors workflow --max-length 6 -` with all three standard streams
/// piped, hands `input` to `tap_stdin`, and returns stdout parsed as JSON.
///
/// `tap_stdin` comes from `common::ChildInputExt`; presumably it writes
/// `input` to the child's stdin and collects the finished process output —
/// confirm against the `common` module.
///
/// # Panics
///
/// Panics if the binary cannot be spawned, exits unsuccessfully, writes
/// anything to stderr, or prints output that is not valid JSON.
fn run_workflow_json(input: &str) -> Value {
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["workflow", "--max-length", "6", "-"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    let output = command
        .spawn()
        .expect("spawn biors workflow")
        .tap_stdin(input);

    // Include captured stderr in the failure message to aid debugging.
    let succeeded = output.status.success();
    assert!(
        succeeded,
        "stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    // A successful run is also expected to stay silent on stderr.
    assert!(output.stderr.is_empty());

    let parsed: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    parsed
}

/// Runs `biors workflow` followed by the caller-supplied `arguments`, hands
/// `input` to `tap_stdin`, and returns stdout parsed as JSON.
///
/// Each entry of `arguments` must be a JSON string; this lets a test pass the
/// argument array recorded in a previous run's output straight back to the CLI.
///
/// `tap_stdin` comes from `common::ChildInputExt`; presumably it writes
/// `input` to the child's stdin and collects the finished process output —
/// confirm against the `common` module.
///
/// # Panics
///
/// Panics if an argument is not a JSON string, if the binary cannot be
/// spawned, exits unsuccessfully, writes anything to stderr, or prints output
/// that is not valid JSON.
fn run_workflow_json_with_args(arguments: &[Value], input: &str) -> Value {
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command.arg("workflow").args(
        arguments
            .iter()
            .map(|argument| argument.as_str().expect("string invocation argument")),
    );
    command
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    let output = command
        .spawn()
        .expect("spawn biors workflow replay")
        .tap_stdin(input);

    // Include captured stderr in the failure message to aid debugging.
    let succeeded = output.status.success();
    assert!(
        succeeded,
        "stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    // A successful run is also expected to stay silent on stderr.
    assert!(output.stderr.is_empty());

    let parsed: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    parsed
}