biors 0.43.0

Command-line tools for bio-rs biological AI model input workflows.
use serde_json::Value;
use std::fs;
use std::path::Path;
use std::process::Command;

mod common;
use common::TempDir;

/// Runs `biors tokenizer inspect --profile protein-20-special` and verifies
/// that the JSON on stdout reports the expected profile name and the token ids
/// of the `pad`, `cls`, `sep` and `mask` special tokens.
#[test]
fn tokenizer_inspect_outputs_special_token_policy() {
    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command.args(["tokenizer", "inspect", "--profile", "protein-20-special"]);
    let run = command.output().expect("run biors tokenizer inspect");

    // Show the captured stderr when the exit status is a failure; a successful
    // run must additionally leave stderr completely empty.
    assert!(
        run.status.success(),
        "stderr: {}",
        String::from_utf8_lossy(&run.stderr)
    );
    assert!(run.stderr.is_empty());

    let envelope: Value = serde_json::from_slice(&run.stdout).expect("valid JSON output");
    let data = &envelope["data"];
    assert_eq!(data["profile"], "protein-20-special");

    // Checked in the same order as the ids ascend: pad, cls, sep, mask.
    let special_tokens = &data["special_tokens"];
    for (name, expected_id) in [("pad", 21), ("cls", 22), ("sep", 23), ("mask", 24)] {
        assert_eq!(special_tokens[name]["token_id"], expected_id);
    }
}

/// Runs `biors tokenize --config <config> <fasta>` against the example files
/// under `examples/model-input-contract` and verifies the alphabet and token
/// ids reported for the first entry of the JSON `data` array.
#[test]
fn tokenize_accepts_tokenizer_config_file() {
    // Example inputs are located relative to this crate's manifest directory.
    let workspace_root = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../..");
    let config_path =
        workspace_root.join("examples/model-input-contract/protein-20-special.config.json");
    let fasta_path = workspace_root.join("examples/model-input-contract/protein.fasta");

    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["tokenize", "--config"])
        .arg(&config_path)
        .arg(&fasta_path);
    let run = command.output().expect("run biors tokenize with config");

    // Include the captured stderr in the failure message for easier diagnosis.
    assert!(
        run.status.success(),
        "stderr: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let envelope: Value = serde_json::from_slice(&run.stdout).expect("valid JSON output");
    let first_record = &envelope["data"][0];
    let expected_tokens = serde_json::json!([22, 0, 1, 2, 3, 23]);

    assert_eq!(first_record["alphabet"], "protein-20-special");
    assert_eq!(first_record["tokens"], expected_tokens);
}

/// Runs `biors tokenizer convert-hf <tokenizer_config.json> --output <path>`
/// on a small BERT-style tokenizer config written to a temporary directory.
///
/// Verifies both the JSON summary printed on stdout (source format, resulting
/// profile, special-token flag, output path) and the contents of the file the
/// command writes to `--output`.
#[test]
fn tokenizer_convert_hf_writes_biors_tokenizer_config() {
    let hf_source = r#"{
  "tokenizer_class": "BertTokenizer",
  "model_max_length": 1024,
  "do_lower_case": false,
  "unk_token": "[UNK]",
  "pad_token": "[PAD]",
  "cls_token": "[CLS]",
  "sep_token": "[SEP]",
  "mask_token": "[MASK]"
}"#;

    let temp = TempDir::new("hf-tokenizer-convert");
    let hf_config = temp.write("tokenizer_config.json", hf_source);
    let output_path = temp.path().join("biors-tokenizer.json");

    let mut command = Command::new(env!("CARGO_BIN_EXE_biors"));
    command
        .args(["tokenizer", "convert-hf"])
        .arg(&hf_config)
        .arg("--output")
        .arg(&output_path);
    let run = command.output().expect("run biors tokenizer convert-hf");

    // Show the captured stderr when the exit status is a failure; a successful
    // run must additionally leave stderr completely empty.
    assert!(
        run.status.success(),
        "stderr: {}",
        String::from_utf8_lossy(&run.stderr)
    );
    assert!(run.stderr.is_empty());

    // Part 1: the summary printed on stdout.
    let envelope: Value = serde_json::from_slice(&run.stdout).expect("valid JSON output");
    let data = &envelope["data"];
    let expected_output_path = output_path.display().to_string();

    assert_eq!(data["source_format"], "huggingface.tokenizer_config");
    assert_eq!(data["config"]["profile"], "protein-20-special");
    assert_eq!(data["config"]["add_special_tokens"], true);
    assert_eq!(data["output_path"], expected_output_path);

    // Part 2: the file written to the `--output` path.
    let written_text = fs::read_to_string(&output_path).expect("read converted tokenizer config");
    let written: Value = serde_json::from_str(&written_text).expect("written tokenizer JSON");

    assert_eq!(written["profile"], "protein-20-special");
    assert_eq!(written["add_special_tokens"], true);
}