use std::path::Path;
use anyhow::{Context, Result};
use serde::Deserialize;
use std::collections::HashMap;
use crate::{config::MAX_SERIES_LEN, data::batch::{normalize, Sample}};
/// Lead names in the order `row_to_sample` looks them up in a row's `leads` map.
///
/// The order of this table is the order in which series are emitted into a sample.
/// Names missing from a row are skipped, and row keys not listed here are never read.
const LEAD_ORDER: &[&str] = &[
"I", "II", "III", "aVR", "aVL", "aVF",
"V1", "V2", "V3", "V4", "V5", "V6",
];
/// One record of a split file, deserialized from a single JSON line by `load_split`.
#[derive(Debug, Deserialize)]
struct EcgRow {
/// Question text; interpolated into the prompt built by `row_to_sample`.
question: String,
/// Reference rationale; `row_to_sample` trims it and stores it as the sample's answer.
rationale: String,
/// Clinical context interpolated into the prompt. Falls back to the empty string
/// when the key is missing from the JSON object.
#[serde(default)]
clinical_context: String,
/// Optional field, `None` when the key is missing. Not read by `row_to_sample`.
#[serde(default)]
template_id: Option<u32>,
/// Optional field, `None` when the key is missing. Not read by `row_to_sample`.
#[serde(default)]
question_type: Option<String>,
/// Signal values keyed by lead name. Only keys listed in `LEAD_ORDER` are consumed.
leads: HashMap<String, Vec<f32>>,
}
/// The three dataset splits produced by `load_ecg_splits`.
pub struct EcgSplits {
/// Samples loaded from `train.jsonl`.
pub train: Vec<Sample>,
/// Samples loaded from `val.jsonl`.
pub val: Vec<Sample>,
/// Samples loaded from `test.jsonl`.
pub test: Vec<Sample>,
}
/// Loads the train, validation and test splits found under `data_dir`.
///
/// The splits are read in that order from `train.jsonl`, `val.jsonl` and `test.jsonl`
/// via `load_split`; loading stops at the first split that fails.
///
/// # Errors
///
/// Propagates whatever error `load_split` reports for the first failing split.
pub fn load_ecg_splits(data_dir: &Path) -> Result<EcgSplits> {
    let train = load_split(data_dir, "train.jsonl")?;
    let val = load_split(data_dir, "val.jsonl")?;
    let test = load_split(data_dir, "test.jsonl")?;

    Ok(EcgSplits { train, val, test })
}
/// Reads one JSONL split file and converts every record into a [`Sample`].
///
/// The file is expected at `<base>/ecg_qa_cot/<filename>` with one JSON object per line.
/// Lines containing only whitespace are skipped.
///
/// # Errors
///
/// Returns an error when the file cannot be read, when a line is not a valid `EcgRow`,
/// or when `row_to_sample` rejects a row. Line-level errors name the file and the
/// 1-based line number as it appears in the file.
fn load_split(base: &Path, filename: &str) -> Result<Vec<Sample>> {
    let file = base.join("ecg_qa_cot").join(filename);
    let text = std::fs::read_to_string(&file)
        .with_context(|| format!("Cannot read ECG-QA file {file:?}"))?;

    text.lines()
        // Number the lines before dropping blank ones so that the reported position
        // matches the physical line in the file.
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(idx, line)| {
            let line_no = idx + 1;
            let row: EcgRow = serde_json::from_str(line)
                .with_context(|| format!("{filename} line {line_no}: parse error"))?;
            row_to_sample(&row)
                .with_context(|| format!("{filename} line {line_no}: invalid ECG row"))
        })
        .collect()
}
/// Converts one parsed dataset row into a prompt/answer [`Sample`].
///
/// Each name in `LEAD_ORDER` that is a key of `row.leads` contributes one series: the
/// signal is cut to at most `MAX_SERIES_LEN` values, passed through `normalize`, and
/// paired with a caption quoting the mean and std that `normalize` returned. The row's
/// trimmed rationale becomes the sample's answer and `label` is left as `None`.
///
/// # Errors
///
/// Fails when none of the names in `LEAD_ORDER` is a key of `row.leads`.
fn row_to_sample(row: &EcgRow) -> Result<Sample> {
    // Build captions and normalized signals side by side, in `LEAD_ORDER` order.
    let (ts_texts, ts_data): (Vec<String>, Vec<_>) = LEAD_ORDER
        .iter()
        .filter_map(|&lead_name| {
            let signal = row.leads.get(lead_name)?;
            let keep = signal.len().min(MAX_SERIES_LEN);
            let (normed, mean, std) = normalize(&signal[..keep]);
            let caption = format!(
                "This is ECG Lead {lead_name}, it has mean {mean:.4} and std {std:.4}:"
            );
            Some((caption, normed))
        })
        .unzip();

    anyhow::ensure!(!ts_data.is_empty(), "ECG row has no lead data");

    // The continuation lines of the prompt literals below start at column 0 on purpose:
    // any leading whitespace inside the literal would become part of the prompt text.
    let pre_prompt = format!(
"You are an expert cardiologist analyzing an ECG (electrocardiogram).
Clinical Context: {}
Your task is to examine the ECG signal and answer the following medical question:
Question: {}
Instructions:
- Begin by analyzing the time series without assuming a specific answer.
- Think step-by-step about what the observed patterns suggest regarding the cardiac condition.
- Write your rationale as a single, natural paragraph — do not use bullet points.
- Do **not** mention any final answer until the very end.",
        row.clinical_context, row.question
    );

    let post_prompt = String::from(
        "\
Based on your analysis of the ECG data, provide your answer.
Make sure that your last word is the answer. You MUST end your response with \"Answer: \"",
    );

    let answer = row.rationale.trim().to_owned();

    Ok(Sample {
        pre_prompt,
        time_series_text: ts_texts,
        time_series: ts_data,
        post_prompt,
        answer,
        label: None,
    })
}