use serde_json::Value;
use std::fs;
use std::process::Command;
mod common;
use common::TempDir;
/// Runs `biors dataset inspect` against a directory plus a `*.fasta` glob and
/// verifies the JSON report written to stdout.
///
/// The fixture contains `root.fasta`, `nested/protein.faa`, and `notes.txt`;
/// the report is expected to list two provided inputs, two files, and two samples.
#[test]
fn dataset_inspect_resolves_files_directories_and_globs() {
    // Fixture: one FASTA at the top level, one under `nested/`, and a
    // plain-text file next to them.
    let temp = TempDir::new("biors-dataset-inspect");
    let root = temp.write("root.fasta", ">root\nACGN\n");
    let nested_dir = temp.path().join("nested");
    fs::create_dir_all(&nested_dir).expect("create nested dir");
    let nested = temp.path().join("nested/protein.faa");
    fs::write(&nested, ">protein\nMEEPQSDPSV\n").expect("write nested FASTA");
    let notes = temp.path().join("notes.txt");
    fs::write(&notes, "not fasta\n").expect("write notes");
    let fasta_glob = temp.path().join("*.fasta").display().to_string();

    // Both the directory and the glob are passed as inputs.
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["dataset", "inspect"])
        .arg(temp.path())
        .arg(fasta_glob);
    let output = command.output().expect("run biors dataset inspect");
    assert!(
        output.status.success(),
        "stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    assert!(output.stderr.is_empty());

    let value: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    let data = &value["data"];

    // Top-level counters and the default descriptor values.
    assert_eq!(data["provided_inputs"], 2);
    assert_eq!(data["files"], 2);
    let descriptor = &data["descriptor"];
    assert_eq!(descriptor["source"], "local");
    assert_eq!(descriptor["version"], "unversioned");
    assert_eq!(descriptor["split"], "unspecified");
    let dataset_hash = data["dataset_hash"].as_str().expect("dataset hash");
    assert!(dataset_hash.starts_with("sha256:"));

    // The reported byte total must equal the on-disk size of the two FASTA files.
    let reported_bytes = data["total_bytes"].as_u64().expect("total bytes");
    let root_bytes = fs::metadata(&root).expect("root metadata").len();
    let nested_bytes = fs::metadata(&nested).expect("nested metadata").len();
    assert_eq!(reported_bytes, root_bytes + nested_bytes);
    assert_eq!(data["sample_count"], 2);

    // Both FASTA paths must appear among the resolved files.
    let resolved_paths: Vec<&str> = data["resolved_files"]
        .as_array()
        .expect("resolved files")
        .iter()
        .map(|file| file["path"].as_str().expect("path"))
        .collect();
    let root_path = root.display().to_string();
    let nested_path = nested.display().to_string();
    assert!(resolved_paths.contains(&root_path.as_str()));
    assert!(resolved_paths.contains(&nested_path.as_str()));

    // One sample per FASTA record, each carrying the expected fields.
    let samples = data["samples"].as_array().expect("samples");
    let is_root_sample = |sample: &Value| {
        sample["sample_id"] == "root"
            && sample["record_index"] == 0
            && sample["dataset"]["source"] == "local"
    };
    let is_protein_sample = |sample: &Value| {
        sample["sample_id"] == "protein"
            && sample["sequence_length"] == 10
            && sample["file_sha256"]
                .as_str()
                .expect("file hash")
                .starts_with("sha256:")
    };
    assert!(samples.iter().any(is_root_sample));
    assert!(samples.iter().any(is_protein_sample));
}
/// Verifies that `biors --json dataset inspect` on a directory holding only
/// `notes.txt` exits with code 2, writes nothing to stderr, and reports the
/// `dataset.no_inputs` error code as JSON on stdout.
#[test]
fn dataset_inspect_reports_empty_input_sets() {
    let temp = TempDir::new("biors-dataset-empty");
    fs::write(temp.path().join("notes.txt"), "not fasta\n").expect("write notes");
    let output = Command::new(env!("CARGO_BIN_EXE_biors"))
        .arg("--json")
        .arg("dataset")
        .arg("inspect")
        .arg(temp.path())
        .output()
        .expect("run empty dataset inspect");

    // Decode both streams up front so a failed assertion shows what the CLI
    // actually printed instead of only the mismatching exit code.
    let stdout = String::from_utf8_lossy(&output.stdout);
    let stderr = String::from_utf8_lossy(&output.stderr);
    assert_eq!(
        output.status.code(),
        Some(2),
        "stdout: {}\nstderr: {}",
        stdout,
        stderr
    );
    assert!(output.stderr.is_empty(), "stderr: {}", stderr);

    let value: Value = serde_json::from_slice(&output.stdout).expect("valid JSON error");
    assert_eq!(
        value["error"]["code"],
        "dataset.no_inputs",
        "stdout: {}",
        stdout
    );
}
/// Passes `--source`, `--version`, `--split`, and two `--metadata` pairs to
/// `biors dataset inspect` and checks that each value is echoed in the JSON
/// report, including the dataset version attached to the first sample.
#[test]
fn dataset_inspect_accepts_descriptor_and_metadata() {
    let temp = TempDir::new("biors-dataset-descriptor");
    let input = temp.write("train.fasta", ">P53_HUMAN\nMEEPQSDPSV\n");

    // Flags are grouped as flag/value pairs; the input path comes last.
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["dataset", "inspect"])
        .args(["--source", "uniprot"])
        .args(["--version", "2026_02"])
        .args(["--split", "train"])
        .args(["--metadata", "organism=human"])
        .args(["--metadata", "reviewed=true"])
        .arg(&input);
    let output = command
        .output()
        .expect("run biors dataset inspect with descriptor");
    assert!(
        output.status.success(),
        "stderr: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let value: Value = serde_json::from_slice(&output.stdout).expect("valid JSON output");
    let data = &value["data"];

    let descriptor = &data["descriptor"];
    assert_eq!(descriptor["source"], "uniprot");
    assert_eq!(descriptor["version"], "2026_02");
    assert_eq!(descriptor["split"], "train");

    let metadata = &data["metadata"];
    assert_eq!(metadata["organism"], "human");
    assert_eq!(metadata["reviewed"], "true");

    // The descriptor version is also carried on each sample's `dataset` entry.
    assert_eq!(data["samples"][0]["dataset"]["version"], "2026_02");
}
/// Supplies the `organism` metadata key twice and expects the
/// `dataset.duplicate_metadata_key` error located at `organism`.
#[test]
fn dataset_inspect_rejects_duplicate_metadata_keys() {
    let temp = TempDir::new("biors-dataset-duplicate-metadata");
    let input = temp.write("train.fasta", ">P53_HUMAN\nMEEPQSDPSV\n");
    let input_arg = input.to_str().expect("input path");

    let cli_args = [
        "--metadata",
        "organism=human",
        "--metadata",
        "organism=mouse",
        input_arg,
    ];
    let report = dataset_inspect_metadata_error(&cli_args);

    let error = &report["error"];
    assert_eq!(error["code"], "dataset.duplicate_metadata_key");
    assert_eq!(error["location"], "organism");
}
/// Supplies `organism=human` followed by ` organism = mouse ` (padded with
/// spaces) and expects the same `dataset.duplicate_metadata_key` error,
/// located at `organism`, as for an exact duplicate.
#[test]
fn dataset_inspect_rejects_whitespace_normalized_duplicate_metadata_keys() {
    let temp = TempDir::new("biors-dataset-trimmed-duplicate-metadata");
    let input = temp.write("train.fasta", ">P53_HUMAN\nMEEPQSDPSV\n");
    let input_arg = input.to_str().expect("input path");

    let cli_args = [
        "--metadata",
        "organism=human",
        "--metadata",
        " organism = mouse ",
        input_arg,
    ];
    let report = dataset_inspect_metadata_error(&cli_args);

    let error = &report["error"];
    assert_eq!(error["code"], "dataset.duplicate_metadata_key");
    assert_eq!(error["location"], "organism");
}
/// Inspects two byte-identical FASTA files stored under different directories
/// and asserts that the per-file `sha256` and the `dataset_hash` agree while
/// the `dataset_mapping_hash` differs.
#[test]
fn dataset_inspect_content_hash_is_stable_when_file_moves() {
    let temp = TempDir::new("biors-dataset-hash-relocation");
    for (dir, failure) in [("a", "create first dir"), ("b", "create second dir")] {
        fs::create_dir_all(temp.path().join(dir)).expect(failure);
    }

    // Same content, two locations.
    let fasta = ">seq1\nACDEFG\n";
    let first = temp.write("a/input.fasta", fasta);
    let second = temp.write("b/input.fasta", fasta);

    let first_report = dataset_inspect_json(&first);
    let second_report = dataset_inspect_json(&second);
    let first_data = &first_report["data"];
    let second_data = &second_report["data"];

    // Content-derived hashes must match across locations.
    assert_eq!(
        first_data["resolved_files"][0]["sha256"],
        second_data["resolved_files"][0]["sha256"]
    );
    assert_eq!(first_data["dataset_hash"], second_data["dataset_hash"]);

    // The mapping hash is expected to differ between the two locations.
    assert_ne!(
        first_data["dataset_mapping_hash"],
        second_data["dataset_mapping_hash"]
    );
}
/// Runs `biors dataset inspect <input>`, asserts a successful exit status
/// (printing stderr on failure), and returns stdout parsed as JSON.
///
/// # Panics
///
/// Panics if the process cannot be spawned, exits unsuccessfully, or prints
/// output that is not valid JSON.
fn dataset_inspect_json(input: &std::path::Path) -> Value {
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command.args(["dataset", "inspect"]).arg(input);

    let output = command.output().expect("run biors dataset inspect");
    let stderr = String::from_utf8_lossy(&output.stderr);
    assert!(output.status.success(), "stderr: {}", stderr);

    serde_json::from_slice(&output.stdout).expect("valid JSON output")
}
/// Runs `biors --json dataset inspect` with `args` appended, asserts that the
/// process exits with code 2 and writes nothing to stderr, and returns stdout
/// parsed as JSON.
///
/// Assertion messages include the captured stdout/stderr so that a failure
/// shows what the CLI printed, in line with the stderr message used by
/// `dataset_inspect_json` on the success path.
///
/// # Panics
///
/// Panics if the process cannot be spawned, the exit code is not 2, stderr is
/// non-empty, or stdout is not valid JSON.
fn dataset_inspect_metadata_error(args: &[&str]) -> Value {
    let output = Command::new(env!("CARGO_BIN_EXE_biors"))
        .arg("--json")
        .arg("dataset")
        .arg("inspect")
        .args(args)
        .output()
        .expect("run biors dataset inspect");

    let stdout = String::from_utf8_lossy(&output.stdout);
    let stderr = String::from_utf8_lossy(&output.stderr);
    assert_eq!(
        output.status.code(),
        Some(2),
        "stdout: {}\nstderr: {}",
        stdout,
        stderr
    );
    assert!(output.stderr.is_empty(), "stderr: {}", stderr);

    serde_json::from_slice(&output.stdout).expect("valid JSON error")
}