mod parser;
use parser::{parse_csv, parse_f64, parse_i32};
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList};
// Raw CSV text for every bundled dataset, embedded into the binary at compile
// time by `include_str!`. The paths are resolved relative to this source file,
// so a missing file under `data/` is a build error rather than a runtime one.
// Each constant is consumed by the `load_*` function of the matching name.
const LUNG_CSV: &str = include_str!("data/lung.csv");
const AML_CSV: &str = include_str!("data/aml.csv");
const VETERAN_CSV: &str = include_str!("data/veteran.csv");
const OVARIAN_CSV: &str = include_str!("data/ovarian.csv");
const COLON_CSV: &str = include_str!("data/colon.csv");
const PBC_CSV: &str = include_str!("data/pbc.csv");
const CGD_CSV: &str = include_str!("data/cgd.csv");
const BLADDER_CSV: &str = include_str!("data/bladder.csv");
const HEART_CSV: &str = include_str!("data/heart.csv");
const KIDNEY_CSV: &str = include_str!("data/kidney.csv");
const RATS_CSV: &str = include_str!("data/rats.csv");
const STANFORD2_CSV: &str = include_str!("data/stanford2.csv");
const UDCA_CSV: &str = include_str!("data/udca.csv");
const MYELOID_CSV: &str = include_str!("data/myeloid.csv");
const FLCHAIN_CSV: &str = include_str!("data/flchain.csv");
const TRANSPLANT_CSV: &str = include_str!("data/transplant.csv");
const MGUS_CSV: &str = include_str!("data/mgus.csv");
const MGUS2_CSV: &str = include_str!("data/mgus2.csv");
const DIABETIC_CSV: &str = include_str!("data/diabetic.csv");
const RETINOPATHY_CSV: &str = include_str!("data/retinopathy.csv");
const GBSG_CSV: &str = include_str!("data/gbsg.csv");
const ROTTERDAM_CSV: &str = include_str!("data/rotterdam.csv");
const LOGAN_CSV: &str = include_str!("data/logan.csv");
const NWTCO_CSV: &str = include_str!("data/nwtco.csv");
const SOLDER_CSV: &str = include_str!("data/solder.csv");
const TOBIN_CSV: &str = include_str!("data/tobin.csv");
const RATS2_CSV: &str = include_str!("data/rats2.csv");
const NAFLD_CSV: &str = include_str!("data/nafld.csv");
// Note the name mismatch: this constant holds `cgd0.csv` and is used by `load_cgd0`.
const CGDRAW_CSV: &str = include_str!("data/cgd0.csv");
const PBCSEQ_CSV: &str = include_str!("data/pbcseq.csv");
const HOEL_CSV: &str = include_str!("data/hoel.csv");
const MYELOMA_CSV: &str = include_str!("data/myeloma.csv");
// The file name is mixed-case on disk (`rhDNase.csv`); the path is case-sensitive
// on case-sensitive file systems.
const RHDNASE_CSV: &str = include_str!("data/rhDNase.csv");
/// Conversion applied by `csv_to_dict` to the cells of one CSV column.
///
/// Used as the second element of each `(column name, type)` schema entry.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ColType {
    /// Cells are parsed with `parse_f64`, giving an optional `f64` per cell.
    Float,
    /// Cells are parsed with `parse_i32`, giving an optional `i32` per cell.
    Int,
    /// Cells are passed through as text without conversion.
    Str,
}
/// Converts CSV text into a column-oriented Python dictionary.
///
/// `csv_data` is parsed with `parse_csv`. For every `(name, type)` entry in
/// `schema`, the header equal to `name` is located and that column is stored
/// in the dictionary under `name` as a Python list:
///
/// * `ColType::Float` - each cell is passed to `parse_f64`; a `None` result
///   becomes Python `None`.
/// * `ColType::Int` - each cell is passed to `parse_i32`; a `None` result
///   becomes Python `None`.
/// * `ColType::Str` - each cell is stored as a Python string, unchanged.
///
/// Columns present in the CSV but absent from `schema` are ignored. Two
/// bookkeeping keys are added: `"_nrow"` (number of parsed data rows) and
/// `"_ncol"` (number of entries in `schema`, not the number of CSV headers).
///
/// # Errors
///
/// Returns a `PyValueError` when `parse_csv` fails, when a schema column is
/// missing from the header, or when a data row has no cell for a requested
/// column. Errors raised while building the Python list or dictionary are
/// propagated unchanged.
fn csv_to_dict(py: Python<'_>, csv_data: &str, schema: &[(&str, ColType)]) -> PyResult<Py<PyDict>> {
    use pyo3::exceptions::PyValueError;

    let (headers, rows) = parse_csv(csv_data).map_err(PyValueError::new_err)?;
    let dict = PyDict::new(py);

    for &(col_name, col_type) in schema {
        let idx = headers
            .iter()
            .position(|h| h == col_name)
            .ok_or_else(|| {
                PyValueError::new_err(format!("Column '{}' not found in CSV", col_name))
            })?;

        // `row[idx]` below would panic on a row that is shorter than the
        // header line; surface that as a Python exception instead.
        // NOTE(review): redundant if `parse_csv` already guarantees
        // rectangular rows - confirm against the parser.
        if let Some(bad_row) = rows.iter().position(|row| row.len() <= idx) {
            return Err(PyValueError::new_err(format!(
                "Row {} has no value for column '{}'",
                bad_row + 1,
                col_name
            )));
        }

        // Build each list straight from the row iterator; no intermediate
        // `Vec` is needed because the iterator already knows its length.
        let list = match col_type {
            ColType::Float => PyList::new(py, rows.iter().map(|row| parse_f64(&row[idx])))?,
            ColType::Int => PyList::new(py, rows.iter().map(|row| parse_i32(&row[idx])))?,
            ColType::Str => PyList::new(py, rows.iter().map(|row| row[idx].as_str()))?,
        };
        dict.set_item(col_name, list)?;
    }

    dict.set_item("_nrow", rows.len())?;
    dict.set_item("_ncol", schema.len())?;
    Ok(dict.into())
}
#[pyfunction]
pub fn load_lung(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("inst", ColType::Int),
("time", ColType::Int),
("status", ColType::Int),
("age", ColType::Int),
("sex", ColType::Int),
("ph.ecog", ColType::Int),
("ph.karno", ColType::Int),
("pat.karno", ColType::Int),
("meal.cal", ColType::Int),
("wt.loss", ColType::Int),
];
csv_to_dict(py, LUNG_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_aml(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("time", ColType::Int),
("cens", ColType::Int),
("group", ColType::Int),
];
csv_to_dict(py, AML_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_veteran(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("trt", ColType::Int),
("celltype", ColType::Str),
("time", ColType::Float),
("status", ColType::Int),
("karno", ColType::Int),
("diagtime", ColType::Int),
("age", ColType::Int),
("prior", ColType::Int),
];
csv_to_dict(py, VETERAN_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_ovarian(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("futime", ColType::Float),
("fustat", ColType::Int),
("age", ColType::Float),
("resid.ds", ColType::Int),
("rx", ColType::Int),
("ecog.ps", ColType::Int),
];
csv_to_dict(py, OVARIAN_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_colon(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("study", ColType::Int),
("rx", ColType::Str),
("sex", ColType::Int),
("age", ColType::Int),
("obstruct", ColType::Int),
("perfor", ColType::Int),
("adhere", ColType::Int),
("nodes", ColType::Int),
("time", ColType::Int),
("status", ColType::Int),
("differ", ColType::Int),
("extent", ColType::Int),
("surg", ColType::Int),
("node4", ColType::Int),
("etype", ColType::Int),
];
csv_to_dict(py, COLON_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_pbc(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("time", ColType::Int),
("status", ColType::Int),
("trt", ColType::Int),
("age", ColType::Float),
("sex", ColType::Str),
("ascites", ColType::Int),
("hepato", ColType::Int),
("spiders", ColType::Int),
("edema", ColType::Float),
("bili", ColType::Float),
("chol", ColType::Int),
("albumin", ColType::Float),
("copper", ColType::Int),
("alk.phos", ColType::Float),
("ast", ColType::Float),
("trig", ColType::Int),
("platelet", ColType::Int),
("protime", ColType::Float),
("stage", ColType::Int),
];
csv_to_dict(py, PBC_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_cgd(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("center", ColType::Int),
("random", ColType::Str),
("treat", ColType::Str),
("sex", ColType::Str),
("age", ColType::Float),
("height", ColType::Float),
("weight", ColType::Float),
("inherit", ColType::Str),
("steroids", ColType::Int),
("propylac", ColType::Int),
("hos.cat", ColType::Str),
("tstart", ColType::Int),
("enum", ColType::Int),
("tstop", ColType::Int),
("status", ColType::Int),
];
csv_to_dict(py, CGD_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_bladder(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("rx", ColType::Int),
("number", ColType::Int),
("size", ColType::Int),
("stop", ColType::Int),
("event", ColType::Int),
("enum", ColType::Int),
];
csv_to_dict(py, BLADDER_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_heart(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("start", ColType::Int),
("stop", ColType::Int),
("event", ColType::Int),
("age", ColType::Float),
("year", ColType::Float),
("surgery", ColType::Int),
("transplant", ColType::Int),
("id", ColType::Int),
];
csv_to_dict(py, HEART_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_kidney(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("time", ColType::Float),
("status", ColType::Int),
("age", ColType::Int),
("sex", ColType::Int),
("disease", ColType::Str),
("frail", ColType::Float),
];
csv_to_dict(py, KIDNEY_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_rats(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("group", ColType::Int),
("n", ColType::Int),
("y", ColType::Int),
];
csv_to_dict(py, RATS_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_stanford2(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("time", ColType::Float),
("status", ColType::Int),
("age", ColType::Float),
("t5", ColType::Float),
];
csv_to_dict(py, STANFORD2_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_udca(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("trt", ColType::Int),
("entry.dt", ColType::Str),
("last.dt", ColType::Str),
("stage", ColType::Int),
("bili", ColType::Float),
("riskscore", ColType::Float),
("death.dt", ColType::Str),
("tx.dt", ColType::Str),
("hprogress.dt", ColType::Str),
("varices.dt", ColType::Str),
("ascites.dt", ColType::Str),
("enceph.dt", ColType::Str),
("double.dt", ColType::Str),
("worsen.dt", ColType::Str),
];
csv_to_dict(py, UDCA_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_myeloid(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("trt", ColType::Str),
("sex", ColType::Str),
("flt3", ColType::Str),
("futime", ColType::Int),
("death", ColType::Int),
("txtime", ColType::Int),
("crtime", ColType::Int),
("rltime", ColType::Int),
];
csv_to_dict(py, MYELOID_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_flchain(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("age", ColType::Int),
("sex", ColType::Str),
("sample.yr", ColType::Int),
("kappa", ColType::Float),
("lambda", ColType::Float),
("flc.grp", ColType::Int),
("creatinine", ColType::Float),
("mgus", ColType::Int),
("futime", ColType::Int),
("death", ColType::Int),
("chapter", ColType::Str),
];
csv_to_dict(py, FLCHAIN_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_transplant(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("age", ColType::Float),
("sex", ColType::Str),
("abo", ColType::Str),
("year", ColType::Int),
("futime", ColType::Int),
("event", ColType::Str),
];
csv_to_dict(py, TRANSPLANT_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_mgus(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("age", ColType::Int),
("sex", ColType::Str),
("dxyr", ColType::Int),
("pcdx", ColType::Float),
("pctime", ColType::Int),
("futime", ColType::Int),
("death", ColType::Int),
("alb", ColType::Float),
("creat", ColType::Float),
("hgb", ColType::Float),
("mspike", ColType::Float),
];
csv_to_dict(py, MGUS_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_mgus2(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("age", ColType::Int),
("sex", ColType::Str),
("dxyr", ColType::Int),
("hgb", ColType::Float),
("creat", ColType::Float),
("mspike", ColType::Float),
("ptime", ColType::Int),
("pstat", ColType::Int),
("futime", ColType::Int),
("death", ColType::Int),
];
csv_to_dict(py, MGUS2_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_diabetic(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("laser", ColType::Str),
("age", ColType::Int),
("eye", ColType::Str),
("trt", ColType::Int),
("risk", ColType::Int),
("time", ColType::Float),
("status", ColType::Int),
];
csv_to_dict(py, DIABETIC_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_retinopathy(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("laser", ColType::Str),
("eye", ColType::Str),
("age", ColType::Int),
("type", ColType::Str),
("trt", ColType::Int),
("futime", ColType::Float),
("status", ColType::Int),
("risk", ColType::Int),
];
csv_to_dict(py, RETINOPATHY_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_gbsg(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("pid", ColType::Int),
("age", ColType::Int),
("meno", ColType::Int),
("size", ColType::Int),
("grade", ColType::Int),
("nodes", ColType::Int),
("pgr", ColType::Int),
("er", ColType::Int),
("hormon", ColType::Int),
("rfstime", ColType::Int),
("status", ColType::Int),
];
csv_to_dict(py, GBSG_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_rotterdam(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("pid", ColType::Int),
("year", ColType::Int),
("age", ColType::Int),
("meno", ColType::Int),
("size", ColType::Int),
("grade", ColType::Int),
("nodes", ColType::Int),
("pgr", ColType::Int),
("er", ColType::Int),
("hormon", ColType::Int),
("chemo", ColType::Int),
("rtime", ColType::Int),
("recur", ColType::Int),
("dtime", ColType::Int),
("death", ColType::Int),
];
csv_to_dict(py, ROTTERDAM_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_logan(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("occupation", ColType::Int),
("focc", ColType::Int),
("education", ColType::Int),
("race", ColType::Str),
];
csv_to_dict(py, LOGAN_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_nwtco(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("seqno", ColType::Int),
("instit", ColType::Int),
("histol", ColType::Int),
("stage", ColType::Int),
("study", ColType::Int),
("rel", ColType::Int),
("edrel", ColType::Float),
("age", ColType::Int),
("in.subcohort", ColType::Int),
];
csv_to_dict(py, NWTCO_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_solder(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("Opening", ColType::Str),
("Solder", ColType::Str),
("Mask", ColType::Str),
("PadType", ColType::Str),
("Panel", ColType::Int),
("skips", ColType::Int),
];
csv_to_dict(py, SOLDER_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_tobin(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("durable", ColType::Float),
("age", ColType::Int),
("quant", ColType::Int),
];
csv_to_dict(py, TOBIN_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_rats2(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("trt", ColType::Int),
("obs", ColType::Int),
("time1", ColType::Int),
("time2", ColType::Int),
("status", ColType::Int),
];
csv_to_dict(py, RATS2_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_nafld(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("age", ColType::Float),
("male", ColType::Int),
("weight", ColType::Float),
("height", ColType::Float),
("bmi", ColType::Float),
("case.id", ColType::Int),
("futime", ColType::Int),
("status", ColType::Int),
];
csv_to_dict(py, NAFLD_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_cgd0(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("center", ColType::Int),
("random", ColType::Str),
("treat", ColType::Str),
("sex", ColType::Str),
("age", ColType::Float),
("height", ColType::Float),
("weight", ColType::Float),
("inherit", ColType::Str),
("steroids", ColType::Int),
("propylac", ColType::Int),
("hos.cat", ColType::Str),
("futime", ColType::Int),
("etime1", ColType::Int),
("etime2", ColType::Int),
("etime3", ColType::Int),
("etime4", ColType::Int),
("etime5", ColType::Int),
("etime6", ColType::Int),
("etime7", ColType::Int),
];
csv_to_dict(py, CGDRAW_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_pbcseq(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("futime", ColType::Int),
("status", ColType::Int),
("trt", ColType::Int),
("age", ColType::Float),
("sex", ColType::Str),
("day", ColType::Int),
("ascites", ColType::Int),
("hepato", ColType::Int),
("spiders", ColType::Int),
("edema", ColType::Float),
("bili", ColType::Float),
("chol", ColType::Int),
("albumin", ColType::Float),
("alk.phos", ColType::Float),
("ast", ColType::Float),
("platelet", ColType::Int),
("protime", ColType::Float),
("stage", ColType::Int),
];
csv_to_dict(py, PBCSEQ_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_hoel(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("time", ColType::Int),
("status", ColType::Int),
("cause", ColType::Int),
];
csv_to_dict(py, HOEL_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_myeloma(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("time", ColType::Int),
("status", ColType::Int),
("hgb", ColType::Float),
("bun", ColType::Int),
("ca", ColType::Int),
("protein", ColType::Int),
("pcells", ColType::Int),
("age", ColType::Int),
];
csv_to_dict(py, MYELOMA_CSV, SCHEMA)
}
#[pyfunction]
pub fn load_rhdnase(py: Python<'_>) -> PyResult<Py<PyDict>> {
const SCHEMA: &[(&str, ColType)] = &[
("id", ColType::Int),
("inst", ColType::Int),
("trt", ColType::Int),
("fev", ColType::Float),
("entry", ColType::Int),
("fev.last", ColType::Float),
("ivstart", ColType::Str),
("ivstop", ColType::Str),
];
csv_to_dict(py, RHDNASE_CSV, SCHEMA)
}