// infotheory 1.1.1
//
// The algorithmic information theory library.
// Documentation
#![cfg(all(feature = "cli", feature = "backend-rosa", feature = "backend-zpaq"))]

use std::io::Write;
use std::process::{Command, Stdio};

use infotheory::{
    NcdVariant, biased_entropy_rate_bytes, cross_entropy_rate_bytes, entropy_rate_bytes,
    marginal_entropy_bytes, ncd_matrix_bytes, ncd_paths,
};
use serde_json::Value;

/// Runs the `infotheory` binary with the `batch` argument, sends `input` to it
/// as one JSON line on stdin, and returns the first line of its stdout parsed
/// as JSON.
///
/// # Panics
///
/// Panics if the child cannot be spawned, its stdin cannot be opened or
/// written, it exits with a non-success status (the captured stderr is included
/// in the message), its stdout is not UTF-8, or the first stdout line is not
/// valid JSON.
fn run_batch(input: &Value) -> Value {
    let mut command = Command::new(env!("CARGO_BIN_EXE_infotheory"));
    command
        .arg("batch")
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let mut process = command.spawn().expect("failed to spawn infotheory batch");

    // Send exactly one newline-terminated request.
    let pipe = process.stdin.as_mut().expect("failed to open stdin");
    let request = serde_json::to_string(input).expect("failed to encode json input");
    writeln!(pipe, "{request}").expect("failed to write json input");

    let finished = process
        .wait_with_output()
        .expect("failed to read batch output");
    if !finished.status.success() {
        panic!(
            "batch exited non-zero: {}",
            String::from_utf8_lossy(&finished.stderr)
        );
    }

    // Only the first output line is considered; an empty stdout falls through
    // to the JSON parser as "" and is reported as invalid JSON.
    let text = String::from_utf8(finished.stdout).expect("stdout is not utf-8");
    let first_line = text.lines().next().unwrap_or("");
    serde_json::from_str(first_line).expect("invalid batch json")
}

/// Looks up `key` in `obj` and returns its value as an `f64`.
///
/// # Panics
///
/// Panics, printing both the key and the whole JSON value, when the key is
/// absent or its value cannot be read as an `f64`.
fn as_f64(obj: &Value, key: &str) -> f64 {
    match obj.get(key).and_then(Value::as_f64) {
        Some(number) => number,
        None => panic!("missing/invalid key '{key}' in {obj}"),
    }
}

/// Asserts that `actual` is within `tol` (inclusive) of `expected`.
///
/// `label` names the compared quantity in the failure message.
///
/// # Panics
///
/// Panics when the absolute difference exceeds `tol`, or when the difference
/// is NaN (the comparison is then false).
fn assert_close(actual: f64, expected: f64, tol: f64, label: &str) {
    let delta = f64::abs(actual - expected);
    let within_tolerance = delta <= tol;
    if within_tolerance {
        return;
    }
    panic!("{label} mismatch: actual={actual}, expected={expected}, delta={delta}, tol={tol}");
}

/// Reference computation of the pairwise distance that the `rosa_matrix` test
/// compares against the CLI output.
///
/// Computes `0.5 * (C(x, y) / B(x) + C(y, x) / B(y)) - 1`, clamped to
/// `[0, 1]`, where `B` is `biased_entropy_rate_bytes` and `C` is
/// `cross_entropy_rate_bytes`, both called with `max_order`. Returns `1.0`
/// when either `B(x)` or `B(y)` is below `1e-9`, which avoids dividing by a
/// near-zero rate.
fn rosa_distance_like_cli(x: &[u8], y: &[u8], max_order: i64) -> f64 {
    const MIN_RATE: f64 = 1e-9;

    let self_rate_x = biased_entropy_rate_bytes(x, max_order);
    let self_rate_y = biased_entropy_rate_bytes(y, max_order);
    let cross_from_xy = cross_entropy_rate_bytes(x, y, max_order);
    let cross_from_yx = cross_entropy_rate_bytes(y, x, max_order);

    let degenerate = self_rate_x < MIN_RATE || self_rate_y < MIN_RATE;
    if degenerate {
        1.0
    } else {
        let mean_ratio = 0.5 * (cross_from_xy / self_rate_x + cross_from_yx / self_rate_y);
        (mean_ratio - 1.0).clamp(0.0, 1.0)
    }
}

/// Checks that the batch `metrics` op reports the same `h0`, `h_rate`, `id`
/// and `len` for an inline text as the library functions compute directly.
#[test]
fn metrics_text_parity_with_library() {
    let text = "entropy parity text";
    let max_order = 5;
    let out = run_batch(&serde_json::json!({
        "op": "metrics",
        "text": text,
        "max_order": max_order,
    }));

    let data = text.as_bytes();
    let h0 = marginal_entropy_bytes(data);
    let h_rate = entropy_rate_bytes(data, max_order);
    // Guard the division the same way the file-based and batch metrics tests
    // in this file do, so a zero-entropy input yields 0.0 instead of NaN.
    let id = if h0 < 1e-9 {
        0.0
    } else {
        ((h0 - h_rate) / h0).clamp(0.0, 1.0)
    };

    assert_close(as_f64(&out, "h0"), h0, 1e-6, "h0");
    assert_close(as_f64(&out, "h_rate"), h_rate, 1e-6, "h_rate");
    assert_close(as_f64(&out, "id"), id, 1e-6, "id");
    assert_eq!(
        out.get("len").and_then(Value::as_u64),
        Some(data.len() as u64),
        "length mismatch"
    );
}

/// Checks that the batch `metrics_file` op, pointed at `fixture_a.txt`,
/// reports the same `h0`, `h_rate`, `id` and `len` as the library functions
/// compute on the file's bytes.
#[test]
fn metrics_file_parity_with_library() {
    let max_order = 3;
    let fixture = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/fixture_a.txt");
    let bytes = std::fs::read(fixture).expect("failed to read fixture");

    let request = serde_json::json!({
        "op": "metrics_file",
        "path": fixture,
        "max_order": max_order,
    });
    let out = run_batch(&request);

    let h0 = marginal_entropy_bytes(&bytes);
    let h_rate = entropy_rate_bytes(&bytes, max_order);
    // A (near-)zero marginal entropy would make the ratio undefined; use 0.0.
    let id = if h0 >= 1e-9 {
        ((h0 - h_rate) / h0).clamp(0.0, 1.0)
    } else {
        0.0
    };

    // Each JSON key doubles as the label in the failure message.
    for (key, expected) in [("h0", h0), ("h_rate", h_rate), ("id", id)] {
        assert_close(as_f64(&out, key), expected, 1e-6, key);
    }

    let reported_len = out.get("len").and_then(Value::as_u64);
    assert_eq!(reported_len, Some(bytes.len() as u64), "length mismatch");
}

/// Checks that the batch `ncd_files` op (method `"5"`, variant `"vitanyi"`)
/// returns the same value as `ncd_paths` called with `NcdVariant::Vitanyi` on
/// the same two fixture files.
#[test]
fn ncd_file_parity_with_library() {
    let first = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/fixture_a.txt");
    let second = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/fixture_b.txt");

    let request = serde_json::json!({
        "op": "ncd_files",
        "path1": first,
        "path2": second,
        "method": "5",
        "variant": "vitanyi",
    });
    let response = run_batch(&request);

    let from_library = ncd_paths(first, second, "5", NcdVariant::Vitanyi);
    let from_cli = as_f64(&response, "ncd");
    assert_close(from_cli, from_library, 1e-6, "ncd");
}

/// Checks that the batch `cross_entropy` op returns the same value as
/// `cross_entropy_rate_bytes` called on the same two texts and `max_order`.
#[test]
fn cross_entropy_parity_with_library() {
    let (x, y) = ("abracadabra", "alakazam");
    let max_order = 3;

    let request = serde_json::json!({
        "op": "cross_entropy",
        "text_x": x,
        "text_y": y,
        "max_order": max_order,
    });
    let response = run_batch(&request);

    let from_library = cross_entropy_rate_bytes(x.as_bytes(), y.as_bytes(), max_order);
    let from_cli = as_f64(&response, "cross_entropy");
    assert_close(from_cli, from_library, 1e-6, "cross_entropy");
}

/// Checks that the batch `batch_metrics` op returns one result row per input
/// text, and that each row's `h0`, `h_rate`, `id` and `len` match what the
/// library functions compute for that text.
#[test]
fn batch_metrics_parity_with_library() {
    let texts = vec!["abracadabra", "alakazam", "xyzxyz"];
    let max_order = 4;

    let request = serde_json::json!({
        "op": "batch_metrics",
        "texts": texts,
        "max_order": max_order,
    });
    let out = run_batch(&request);

    let rows = out
        .get("results")
        .and_then(Value::as_array)
        .expect("missing results array");
    assert_eq!(rows.len(), texts.len());

    // Lengths were just checked to be equal, so zipping pairs every row with
    // the text it was computed from.
    for (row, text) in rows.iter().zip(texts.iter()) {
        let data = text.as_bytes();
        let h0 = marginal_entropy_bytes(data);
        let h_rate = entropy_rate_bytes(data, max_order);
        // A (near-)zero marginal entropy would make the ratio undefined.
        let id = if h0 >= 1e-9 {
            ((h0 - h_rate) / h0).clamp(0.0, 1.0)
        } else {
            0.0
        };

        assert_close(as_f64(row, "h0"), h0, 1e-6, "h0");
        assert_close(as_f64(row, "h_rate"), h_rate, 1e-6, "h_rate");
        assert_close(as_f64(row, "id"), id, 1e-6, "id");

        let reported_len = row.get("len").and_then(Value::as_u64);
        assert_eq!(reported_len, Some(data.len() as u64));
    }
}

/// Checks that the batch `ncd_matrix` op (method `"5"`, variant `"sym"`)
/// returns an `n x n` matrix whose entries match the flat result of
/// `ncd_matrix_bytes` called with `NcdVariant::SymVitanyi` on the same inputs.
#[test]
fn ncd_matrix_parity_with_library() {
    let texts = vec!["abracadabra", "alakazam", "xyzxyz"];
    let datas: Vec<Vec<u8>> = texts.iter().map(|s| s.as_bytes().to_vec()).collect();
    let out = run_batch(&serde_json::json!({
        "op": "ncd_matrix",
        "texts": texts,
        "method": "5",
        "variant": "sym",
    }));
    let n = out.get("n").and_then(Value::as_u64).expect("missing n") as usize;
    // `n` comes from the CLI; without this check an output of `n = 0` with an
    // empty matrix would make the comparison loop pass without checking anything.
    assert_eq!(n, texts.len(), "reported n does not match number of inputs");
    let matrix = out
        .get("matrix")
        .and_then(Value::as_array)
        .expect("missing matrix");
    assert_eq!(matrix.len(), n);
    let rust_flat = ncd_matrix_bytes(&datas, "5", NcdVariant::SymVitanyi);
    for (i, row) in matrix.iter().enumerate() {
        let row = row.as_array().expect("row must be array");
        // Reject short or over-long rows explicitly instead of hitting an
        // index panic or silently ignoring extra entries.
        assert_eq!(row.len(), n, "row {i} has wrong length");
        for (j, cell) in row.iter().enumerate() {
            let val = cell.as_f64().expect("entry must be number");
            // The library result is indexed as a row-major `n x n` layout.
            assert_close(val, rust_flat[i * n + j], 1e-6, "ncd_matrix");
        }
    }
}

/// Checks that the batch `rosa_matrix` op returns an `n x n` matrix whose
/// diagonal is `0.0` and whose off-diagonal entries match
/// `rosa_distance_like_cli` evaluated on the corresponding pair of texts.
#[test]
fn rosa_matrix_parity_with_library_formula() {
    let texts = vec!["abracadabra", "alakazam", "xyzxyz"];
    let max_order = 3;
    let out = run_batch(&serde_json::json!({
        "op": "rosa_matrix",
        "texts": texts,
        "max_order": max_order,
    }));
    let n = out.get("n").and_then(Value::as_u64).expect("missing n") as usize;
    // `n` comes from the CLI; without this check an output of `n = 0` with an
    // empty matrix would pass without comparing anything, and a too-large `n`
    // would index `texts` out of bounds.
    assert_eq!(n, texts.len(), "reported n does not match number of inputs");
    let matrix = out
        .get("matrix")
        .and_then(Value::as_array)
        .expect("missing matrix");
    assert_eq!(matrix.len(), n);

    for (i, row) in matrix.iter().enumerate() {
        let row = row.as_array().expect("row must be array");
        // Reject short or over-long rows explicitly instead of hitting an
        // index panic or silently ignoring extra entries.
        assert_eq!(row.len(), n, "row {i} has wrong length");
        for (j, cell) in row.iter().enumerate() {
            let val = cell.as_f64().expect("entry must be number");
            // The diagonal is expected to be exactly zero; only off-diagonal
            // pairs go through the reference formula.
            let expected = if i == j {
                0.0
            } else {
                rosa_distance_like_cli(texts[i].as_bytes(), texts[j].as_bytes(), max_order)
            };
            assert_close(val, expected, 1e-6, "rosa_matrix");
        }
    }
}