use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use serde::{Deserialize, Serialize};
use thiserror::Error;
/// Error type carried by this module's [`Result`] alias.
///
/// The first four variants wrap errors from other libraries and are marked
/// `#[from]`, so the `?` operator converts those errors into `EtlError`
/// automatically. The remaining variants carry a free-form `String` that is
/// interpolated into the display message.
#[derive(Debug, Error)]
pub enum EtlError {
    /// Wraps a [`std::io::Error`].
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Wraps an [`arrow::error::ArrowError`].
    #[error("Arrow error: {0}")]
    Arrow(#[from] arrow::error::ArrowError),

    /// Wraps a [`csv::Error`].
    #[error("CSV error: {0}")]
    Csv(#[from] csv::Error),

    /// Wraps a [`parquet::errors::ParquetError`].
    #[error("Parquet error: {0}")]
    Parquet(#[from] parquet::errors::ParquetError),

    /// A column was not found. The payload is shown after `Missing column: `
    /// (presumably the column name; confirm against the call sites).
    #[error("Missing column: {0}")]
    MissingColumn(String),

    /// Input data did not have the expected format. The payload is shown after
    /// `Invalid data format: `.
    #[error("Invalid data format: {0}")]
    InvalidFormat(String),

    /// A configuration problem. The payload is shown after
    /// `Configuration error: `.
    #[error("Configuration error: {0}")]
    Config(String),
}
/// Settings describing a dataset and how it should be processed.
///
/// The type is serializable and deserializable through `serde`, and a
/// [`Default`] implementation in this file supplies a ready-made value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetConfig {
    /// Root location of the dataset. The `Default` impl uses `"data/mimic-iv"`.
    pub root_path: String,

    /// Names of the tables to process.
    // NOTE(review): how a name maps to a file under `root_path` is not defined
    // in this file — confirm against the loader.
    pub tables: Vec<String>,

    /// Batch size. The `Default` impl uses `128_000`.
    // NOTE(review): presumably rows per record batch — confirm.
    pub batch_size: usize,

    /// Degree of parallelism. The `Default` impl uses `num_cpus::get()`.
    // NOTE(review): presumably the number of worker threads — confirm.
    pub parallelism: usize,

    /// Memory-map flag. The `Default` impl uses `true`.
    // NOTE(review): presumably enables memory-mapped file reads — confirm.
    pub use_memmap: bool,
}
impl DatasetConfig {
    /// Consumes the configuration and reports success.
    ///
    /// No field is inspected or validated; the value is simply dropped.
    ///
    /// # Errors
    ///
    /// The current implementation never returns an error.
    pub fn finish(self) -> Result<()> {
        // Taking `self` by value means the configuration cannot be used after
        // this call; make the drop explicit.
        drop(self);
        Ok(())
    }
}
impl Default for DatasetConfig {
    /// Returns the default configuration.
    ///
    /// It uses root path `"data/mimic-iv"`, six table names, a batch size of
    /// `128_000`, parallelism of `num_cpus::get()`, and memory mapping
    /// switched on.
    fn default() -> Self {
        // Default table names, in the order they are stored in `tables`.
        const DEFAULT_TABLES: [&str; 6] = [
            "admissions",
            "patients",
            "diagnoses_icd",
            "procedures_icd",
            "prescriptions",
            "labevents",
        ];

        let tables: Vec<String> = DEFAULT_TABLES
            .iter()
            .map(|name| (*name).to_owned())
            .collect();

        Self {
            root_path: String::from("data/mimic-iv"),
            tables,
            batch_size: 128_000,
            parallelism: num_cpus::get(),
            use_memmap: true,
        }
    }
}
#[must_use]
pub fn clinical_event_schema() -> Schema {
Schema::new(vec![
Field::new("subject_id", DataType::Int64, false),
Field::new("hadm_id", DataType::Int64, true),
Field::new("stay_id", DataType::Int64, true),
Field::new(
"charttime",
DataType::Timestamp(TimeUnit::Microsecond, None),
true,
),
Field::new("event_type", DataType::Utf8, false),
Field::new("event_id", DataType::Utf8, true),
Field::new("value", DataType::Utf8, true),
Field::new("value_num", DataType::Float64, true),
Field::new("units", DataType::Utf8, true),
])
}
/// One clinical event record.
///
/// The fields match the columns declared by `clinical_event_schema` in name
/// and order; every column that schema marks nullable is an `Option` here.
///
/// `PartialEq` is derived so events can be compared with `==` and
/// `assert_eq!`. `Eq` is not derivable because `value_num` is an `f64`, so an
/// event whose `value_num` is `Some(NaN)` does not compare equal to itself.
#[derive(Debug, Clone, PartialEq)]
pub struct ClinicalEvent {
    /// Subject identifier (required).
    pub subject_id: i64,

    /// Optional `hadm_id` value.
    pub hadm_id: Option<i64>,

    /// Optional `stay_id` value.
    pub stay_id: Option<i64>,

    /// Optional chart time.
    // NOTE(review): the schema column is a microsecond-precision timestamp
    // with no timezone, so this is presumably microseconds since the Unix
    // epoch — confirm against the code that fills it.
    pub charttime: Option<i64>,

    /// Event type label (required).
    pub event_type: String,

    /// Optional event identifier, kept as text.
    pub event_id: Option<String>,

    /// Optional value as text.
    pub value: Option<String>,

    /// Optional numeric value.
    // NOTE(review): presumably the parsed form of `value` — confirm.
    pub value_num: Option<f64>,

    /// Optional units string.
    pub units: Option<String>,
}
/// Alias for [`std::result::Result`] with the error type fixed to [`EtlError`].
pub type Result<T> = std::result::Result<T, EtlError>;