onelib 0.1.0

Rust implementation of the ONEcode file format
Documentation
//! Cross-validation tests against the C reference implementation.
//!
//! These tests write ONEcode files with our Rust library, then validate
//! them using the C `ONEstat` and `ONEview` tools.  They also read
//! C-produced files and verify the data matches.
//!
//! The C tools must be on `$PATH`.  If a tool is not found, the test
//! panics (via `expect`) and is reported as failed rather than skipped.

use std::io::{Cursor, Seek};
use std::path::{Path, PathBuf};
use std::process::Command;

use onelib::reader::OneReader;
use onelib::schema::Schema;
use onelib::writer::OneWriter;

// --- tool discovery ---

/// Searches every directory listed in the `PATH` environment variable for a
/// regular file called `name`.
///
/// Returns the first `<dir>/<name>` candidate for which `is_file()` is true,
/// or `None` when `PATH` is unset or no directory contains such a file.
///
/// `PATH` is read with `var_os` and split with `std::env::split_paths`, so
/// the platform's own separator is honoured and a non-Unicode `PATH` does
/// not abort the lookup.  No executable-permission check is made and no
/// executable suffix is appended to `name`.
fn find_on_path(name: &str) -> Option<PathBuf> {
    let path_var = std::env::var_os("PATH")?;
    std::env::split_paths(&path_var)
        .map(|dir| dir.join(name))
        .find(|candidate| candidate.is_file())
}

/// Returns the path of the `ONEstat` tool found via `find_on_path`.
///
/// Panics when the tool is not found, so the calling test fails (it is
/// not skipped, despite the wording of the message).
fn require_onestat() -> PathBuf {
    match find_on_path("ONEstat") {
        Some(tool) => tool,
        None => panic!("ONEstat not on $PATH — skipping C cross-validation"),
    }
}

/// Returns the path of the `ONEview` tool found via `find_on_path`.
///
/// Panics when the tool is not found, so the calling test fails (it is
/// not skipped, despite the wording of the message).
fn require_oneview() -> PathBuf {
    match find_on_path("ONEview") {
        Some(tool) => tool,
        None => panic!("ONEview not on $PATH — skipping C cross-validation"),
    }
}

// --- helpers ---

/// Parses the inline `seq` schema text and returns its first entry.
///
/// Panics if the schema text does not parse; `remove(0)` also panics when
/// there is no first entry (assuming `entries` is a `Vec` — confirm).
fn seq_schema() -> onelib::schema::SchemaEntry {
    let parsed = Schema::from_text("P 3 seq\nO S 1 3 DNA\nD I 1 6 STRING\n");
    let mut schema = parsed.unwrap();
    schema.entries.remove(0)
}

/// Writes the three sample records shared by the cross-validation tests.
///
/// Each record is an `S` line emitted with `write_dna_line` followed by an
/// `I` line emitted with `write_string_line`.  The first write error is
/// returned to the caller via `?`.
fn write_test_seq<W: std::io::Write + Seek>(
    w: &mut OneWriter<W>,
) -> onelib::error::Result<()> {
    // (DNA payload, identifier) pairs, in the order they are written.
    const RECORDS: [(&str, &str); 3] = [
        ("acgtacgtaacc", "seq1"),
        ("tgcatgca", "seq2"),
        ("aaaa", "seq3"),
    ];

    for &(bases, label) in RECORDS.iter() {
        w.write_dna_line(b'S', bases)?;
        w.write_string_line(b'I', label)?;
    }
    Ok(())
}

/// Creates the file at `path` and writes the sample records into it,
/// passing `is_binary` through to `OneWriter::new`.
///
/// Any failure (file creation, writer setup, writing, closing) panics.
fn write_seq_to_file(path: &Path, is_binary: bool) {
    let entry = seq_schema();
    let out = std::fs::File::create(path).unwrap();

    let mut writer = OneWriter::new(out, &entry, None, is_binary).unwrap();
    write_test_seq(&mut writer).unwrap();
    writer.close().unwrap();
}

// --- tests ---

/// Writes the sample records as an ASCII `.seq` file with the Rust writer
/// and requires the C `ONEstat` tool to exit successfully on it.
#[test]
fn c_validates_rust_ascii() {
    let stat_tool = require_onestat();
    let scratch = tempfile::tempdir().unwrap();
    let seq_file = scratch.path().join("rust.seq");

    write_seq_to_file(&seq_file, false);

    let mut cmd = Command::new(&stat_tool);
    cmd.arg(&seq_file);
    let result = cmd.output().expect("failed to run ONEstat");

    // Surface both output streams of the tool when it rejects the file.
    if !result.status.success() {
        panic!(
            "ONEstat rejected Rust ASCII file:\nstdout: {}\nstderr: {}",
            String::from_utf8_lossy(&result.stdout),
            String::from_utf8_lossy(&result.stderr),
        );
    }
}

/// Writes the sample records as a binary `.1seq` file with the Rust writer
/// and requires the C `ONEstat` tool to exit successfully on it.
#[test]
fn c_validates_rust_binary() {
    let stat_tool = require_onestat();
    let scratch = tempfile::tempdir().unwrap();
    let seq_file = scratch.path().join("rust.1seq");

    write_seq_to_file(&seq_file, true);

    let mut cmd = Command::new(&stat_tool);
    cmd.arg(&seq_file);
    let result = cmd.output().expect("failed to run ONEstat");

    // Surface both output streams of the tool when it rejects the file.
    if !result.status.success() {
        panic!(
            "ONEstat rejected Rust binary file:\nstdout: {}\nstderr: {}",
            String::from_utf8_lossy(&result.stdout),
            String::from_utf8_lossy(&result.stderr),
        );
    }
}

/// Round trip: Rust writes ASCII, C `ONEview -b` converts it to binary,
/// and Rust reads the binary back and checks every record.
#[test]
fn rust_ascii_to_c_binary_round_trip() {
    let view_tool = require_oneview();
    let scratch = tempfile::tempdir().unwrap();
    let text_file = scratch.path().join("test.seq");
    let packed_file = scratch.path().join("test.1seq");

    write_seq_to_file(&text_file, false);

    // Invoke `ONEview -b -o <binary> <ascii>` to produce the binary file.
    let conversion = Command::new(&view_tool)
        .arg("-b")
        .arg("-o")
        .arg(&packed_file)
        .arg(&text_file)
        .output()
        .expect("failed to run ONEview");
    if !conversion.status.success() {
        panic!(
            "ONEview -b failed:\nstderr: {}",
            String::from_utf8_lossy(&conversion.stderr),
        );
    }
    if !packed_file.exists() {
        panic!("ONEview did not create {:?}", packed_file);
    }

    // Load the C-produced binary into memory and open it with the Rust reader.
    let bytes = std::fs::read(&packed_file).unwrap();
    let mut reader = OneReader::open(Cursor::new(bytes), None, None).unwrap();
    assert!(reader.is_binary());
    assert_eq!(reader.file_type, "seq");

    // Expected (line type, payload) sequence, mirroring `write_test_seq`.
    let expected: [(u8, &str); 6] = [
        (b'S', "acgtacgtaacc"),
        (b'I', "seq1"),
        (b'S', "tgcatgca"),
        (b'I', "seq2"),
        (b'S', "aaaa"),
        (b'I', "seq3"),
    ];
    for &(line_type, payload) in expected.iter() {
        let t = reader.read_line().unwrap();
        assert_eq!(t, Some(line_type));
        if line_type == b'S' {
            assert_eq!(reader.dna_chars(), payload);
        } else {
            assert_eq!(reader.string(), payload);
        }
    }

    // Nothing may follow the six expected lines.
    assert_eq!(reader.read_line().unwrap(), None);
}

/// Write binary with Rust, convert to ASCII with C ONEview, then read the
/// C-produced ASCII back with Rust and verify every record.
///
/// All six lines (three `S`/`I` pairs) are checked for both line type and
/// payload, so content corruption in any record fails the test, not only
/// corruption in the first one.
#[test]
fn rust_binary_to_c_ascii_round_trip() {
    let oneview = require_oneview();
    let dir = tempfile::tempdir().unwrap();
    let binary_path = dir.path().join("test.1seq");

    write_seq_to_file(&binary_path, true);

    // Convert to ASCII with C ONEview -o.
    let ascii_path = dir.path().join("test.seq");
    let output = Command::new(&oneview)
        .arg("-o")
        .arg(&ascii_path)
        .arg(&binary_path)
        .output()
        .expect("failed to run ONEview");
    assert!(
        output.status.success(),
        "ONEview failed on Rust binary:\nstderr: {}",
        String::from_utf8_lossy(&output.stderr),
    );
    assert!(ascii_path.exists(), "ONEview did not create {ascii_path:?}");

    // Read the C-produced ASCII with Rust and verify data.
    let data = std::fs::read(&ascii_path).unwrap();
    let mut reader = OneReader::open(Cursor::new(data), None, None).unwrap();
    assert!(!reader.is_binary());

    // Expected (line type, payload) sequence, mirroring `write_test_seq`.
    let expected: [(u8, &str); 6] = [
        (b'S', "acgtacgtaacc"),
        (b'I', "seq1"),
        (b'S', "tgcatgca"),
        (b'I', "seq2"),
        (b'S', "aaaa"),
        (b'I', "seq3"),
    ];
    for (idx, &(line_type, payload)) in expected.iter().enumerate() {
        let t = reader.read_line().unwrap();
        assert_eq!(t, Some(line_type), "unexpected line type at line {idx}");
        if line_type == b'S' {
            assert_eq!(reader.dna_chars(), payload, "DNA mismatch at line {idx}");
        } else {
            assert_eq!(reader.string(), payload, "identifier mismatch at line {idx}");
        }
    }

    // Exactly three S and three I lines were written; nothing may follow.
    assert_eq!(
        reader.read_line().unwrap(),
        None,
        "unexpected extra line after the expected records"
    );
}

/// Writes 1000 records with long identifiers (intended to push the string
/// data past the Huffman codec training threshold), requires C `ONEstat`
/// to accept the binary file, then attempts a C `ONEview` conversion to
/// ASCII.
///
/// A failing `ONEview` run is tolerated and only logged to stderr; when it
/// succeeds, every identifier read back must match what was written.
#[test]
fn c_validates_huffman_compressed_binary() {
    const RECORD_COUNT: usize = 1000;
    const DNA: &str = "acgtacgtacgtacgtacgtacgtacgtacgt"; // 32 bases

    let stat_tool = require_onestat();
    let view_tool = require_oneview();
    let scratch = tempfile::tempdir().unwrap();
    let packed_file = scratch.path().join("big.1seq");
    let text_file = scratch.path().join("big.seq");

    let entry = seq_schema();
    let out = std::fs::File::create(&packed_file).unwrap();
    let mut writer = OneWriter::new(out, &entry, None, true).unwrap();

    // Long identifiers so the string data exceeds the 100KB training threshold.
    let identifiers: Vec<String> = (0..RECORD_COUNT)
        .map(|i| {
            format!("sequence_{i:04}_with_a_long_identifier_to_fill_the_histogram_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx_{i}")
        })
        .collect();
    for id in &identifiers {
        writer.write_dna_line(b'S', DNA).unwrap();
        writer.write_string_line(b'I', id).unwrap();
    }
    writer.close().unwrap();

    // The C validator must accept the file.
    let stat_run = Command::new(&stat_tool)
        .arg(&packed_file)
        .output()
        .expect("failed to run ONEstat");
    if !stat_run.status.success() {
        panic!(
            "ONEstat rejected Huffman-compressed binary:\nstdout: {}\nstderr: {}",
            String::from_utf8_lossy(&stat_run.stdout),
            String::from_utf8_lossy(&stat_run.stderr),
        );
    }

    // Ask C ONEview to convert the binary back to ASCII.
    let view_run = Command::new(&view_tool)
        .arg("-o")
        .arg(&text_file)
        .arg(&packed_file)
        .output()
        .expect("failed to run ONEview");

    if !view_run.status.success() {
        // Expected: C can't decode our Huffman bitstream format.
        eprintln!(
            "ONEview failed on Huffman-compressed file (expected):\n{}",
            String::from_utf8_lossy(&view_run.stderr)
        );
        return;
    }

    // Read the C-produced ASCII and compare identifiers in write order.
    let bytes = std::fs::read(&text_file).unwrap();
    let mut reader = OneReader::open(Cursor::new(bytes), None, None).unwrap();
    let mut seen = 0;
    loop {
        match reader.read_line().unwrap() {
            None => break,
            Some(b'I') => {
                assert_eq!(
                    reader.string(),
                    identifiers[seen],
                    "identifier mismatch at index {}",
                    seen
                );
                seen += 1;
            }
            // Other line types (the `S` lines) are not inspected here.
            Some(_) => {}
        }
    }
    assert_eq!(seen, RECORD_COUNT);
}