// biors 0.46.0
//
// Command-line tools for bio-rs biological AI model input workflows.
use serde_json::Value;
use std::fs;
use std::process::Command;

mod common;
use common::TempDir;

/// Runs `biors dataset inspect` with a directory and a `*.fasta` glob as inputs and
/// checks the JSON report printed on stdout.
///
/// The fixture contains `root.fasta`, `nested/protein.faa` and a `notes.txt` file.
/// The report is expected to count two provided inputs, two files and two samples,
/// to carry the default descriptor (`local` / `unversioned` / `unspecified`), and to
/// list both sequence files among the resolved paths.
#[test]
fn dataset_inspect_resolves_files_directories_and_globs() {
    // Fixture: one sequence file at the top level, one in a sub-directory, one text file.
    let temp = TempDir::new("biors-dataset-inspect");
    let root = temp.write("root.fasta", ">root\nACGN\n");
    let nested_dir = temp.path().join("nested");
    fs::create_dir_all(nested_dir).expect("create nested dir");
    let nested = temp.path().join("nested/protein.faa");
    fs::write(&nested, ">protein\nMEEPQSDPSV\n").expect("write nested FASTA");
    let notes = temp.path().join("notes.txt");
    fs::write(notes, "not fasta\n").expect("write notes");

    // Two inputs are handed to the CLI: the directory itself and a glob inside it.
    let glob_pattern = temp.path().join("*.fasta").display().to_string();
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["dataset", "inspect"])
        .arg(temp.path())
        .arg(glob_pattern);
    let output = command.output().expect("run biors dataset inspect");

    assert!(
        output.status.success(),
        "stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    assert!(output.stderr.is_empty());

    let report: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    let data = &report["data"];

    // Top-level counters.
    assert_eq!(data["provided_inputs"], 2);
    assert_eq!(data["files"], 2);

    // No descriptor flags were passed, so these are the values the CLI fills in itself.
    let descriptor = &data["descriptor"];
    for (field, expected) in [
        ("source", "local"),
        ("version", "unversioned"),
        ("split", "unspecified"),
    ] {
        assert_eq!(descriptor[field], expected);
    }

    let dataset_hash = data["dataset_hash"].as_str().expect("dataset hash");
    assert!(dataset_hash.starts_with("sha256:"));

    // The reported byte total must equal the on-disk size of the two sequence files.
    let reported_bytes = data["total_bytes"].as_u64().expect("total bytes");
    let expected_bytes = fs::metadata(&root).expect("root metadata").len()
        + fs::metadata(&nested).expect("nested metadata").len();
    assert_eq!(reported_bytes, expected_bytes);

    assert_eq!(data["sample_count"], 2);

    // Both sequence files must appear in the resolved file list.
    let resolved_paths: Vec<&str> = data["resolved_files"]
        .as_array()
        .expect("resolved files")
        .iter()
        .map(|entry| entry["path"].as_str().expect("path"))
        .collect();
    let root_text = root.display().to_string();
    assert!(resolved_paths.contains(&root_text.as_str()));
    let nested_text = nested.display().to_string();
    assert!(resolved_paths.contains(&nested_text.as_str()));

    // Per-sample records: one for each FASTA header written above.
    let samples = data["samples"].as_array().expect("samples");
    let root_sample_present = samples.iter().any(|entry| {
        entry["sample_id"] == "root"
            && entry["record_index"] == 0
            && entry["dataset"]["source"] == "local"
    });
    assert!(root_sample_present);

    // The file hash is only inspected on entries that already matched id and length.
    let protein_sample_present = samples
        .iter()
        .filter(|entry| entry["sample_id"] == "protein")
        .filter(|entry| entry["sequence_length"] == 10)
        .any(|entry| {
            entry["file_sha256"]
                .as_str()
                .expect("file hash")
                .starts_with("sha256:")
        });
    assert!(protein_sample_present);
}

/// Runs `biors --json dataset inspect` on a directory whose only file is `notes.txt`
/// and checks that the CLI reports the empty input set as a structured error.
///
/// Expected outcome: exit code 2, nothing on stderr, and a JSON document on stdout
/// whose `error.code` is `dataset.no_inputs`.
#[test]
fn dataset_inspect_reports_empty_input_sets() {
    let temp = TempDir::new("biors-dataset-empty");
    fs::write(temp.path().join("notes.txt"), "not fasta\n").expect("write notes");

    let output = Command::new(env!("CARGO_BIN_EXE_biors"))
        .arg("--json")
        .arg("dataset")
        .arg("inspect")
        .arg(temp.path())
        .output()
        .expect("run empty dataset inspect");

    // Include the captured streams in the failure message so an unexpected exit
    // status can be diagnosed from the test log, as the sibling tests already do.
    assert_eq!(
        output.status.code(),
        Some(2),
        "stdout: {}\nstderr: {}",
        String::from_utf8_lossy(&output.stdout),
        String::from_utf8_lossy(&output.stderr)
    );
    assert!(
        output.stderr.is_empty(),
        "stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let value: Value = serde_json::from_slice(&output.stdout).expect("valid JSON error");
    assert_eq!(value["error"]["code"], "dataset.no_inputs");
}

/// Runs `biors dataset inspect` with `--source`, `--version`, `--split` and
/// `--metadata` and checks that the supplied values come back in the JSON report:
/// in the dataset descriptor, in the metadata map, and on the first sample.
#[test]
fn dataset_inspect_accepts_descriptor_and_metadata() {
    let temp = TempDir::new("biors-dataset-descriptor");
    let input = temp.write("train.fasta", ">P53_HUMAN\nMEEPQSDPSV\n");

    // Flags are grouped as flag/value pairs; the input path comes last.
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["dataset", "inspect"])
        .args(["--source", "uniprot"])
        .args(["--version", "2026_02"])
        .args(["--split", "train"])
        .args(["--metadata", "organism=human"])
        .arg(&input);
    let output = command
        .output()
        .expect("run biors dataset inspect with descriptor");

    assert!(
        output.status.success(),
        "stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let report: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    let data = &report["data"];

    // Every descriptor flag must be reflected verbatim.
    let descriptor = &data["descriptor"];
    for (field, expected) in [
        ("source", "uniprot"),
        ("version", "2026_02"),
        ("split", "train"),
    ] {
        assert_eq!(descriptor[field], expected);
    }

    // The `key=value` metadata argument is exposed as a key in the metadata object.
    assert_eq!(data["metadata"]["organism"], "human");
    // The descriptor version is also attached to the individual sample record.
    assert_eq!(data["samples"][0]["dataset"]["version"], "2026_02");
}