use std::path::Path;
use anyhow::{Context, Result};
use serde::Deserialize;
use crate::data::batch::{normalize, Sample};
/// Activity class labels offered to the model.
///
/// The labels are joined with `", "` in this order and appended to the
/// classification prompt built by `row_to_sample`.
const ACTIVITIES: &[&str] = &[
    "biking",
    "lying",
    "running",
    "sitting",
    "standing",
    "walking",
    "walking_down",
    "walking_up",
];
/// One record of a HAR JSONL file, as deserialized by `load_split`.
///
/// Each non-blank line of a split file is parsed into this struct and then
/// converted into a `Sample` by `row_to_sample`.
#[derive(Debug, Deserialize)]
struct HarRow {
/// Raw accelerometer readings for the x axis (normalized later).
x_axis: Vec<f32>,
/// Raw accelerometer readings for the y axis (normalized later).
y_axis: Vec<f32>,
/// Raw accelerometer readings for the z axis (normalized later).
z_axis: Vec<f32>,
/// Class label of the row; trimmed and stored as the sample's `label`.
label: String,
/// Free-text rationale; trimmed and stored as the sample's `answer`.
rationale: String,
}
/// The three HAR dataset splits returned by `load_har_splits`.
pub struct HarSplits {
/// Samples loaded from `train.jsonl`.
pub train: Vec<Sample>,
/// Samples loaded from `val.jsonl`.
pub val: Vec<Sample>,
/// Samples loaded from `test.jsonl`.
pub test: Vec<Sample>,
}
/// Loads the train, validation and test splits of the HAR dataset.
///
/// The files `train.jsonl`, `val.jsonl` and `test.jsonl` are read, in that
/// order, from the `har_cot` sub-directory of `data_dir`.
///
/// # Errors
///
/// Returns the first error produced while loading any of the three files;
/// later files are not touched once one fails.
pub fn load_har_splits(data_dir: &Path) -> Result<HarSplits> {
    let train = load_split(data_dir, "train.jsonl")?;
    let val = load_split(data_dir, "val.jsonl")?;
    let test = load_split(data_dir, "test.jsonl")?;

    Ok(HarSplits { train, val, test })
}
/// Reads `<base>/har_cot/<filename>` as JSON Lines and converts every
/// non-blank line into a `Sample`.
///
/// Lines that are empty or contain only whitespace are skipped.
///
/// # Errors
///
/// Returns an error if the file cannot be read, if a line cannot be
/// deserialized into a `HarRow` (the context names the file and the 1-based
/// line number within it), or if `row_to_sample` fails for a row.
fn load_split(base: &Path, filename: &str) -> Result<Vec<Sample>> {
    let file = base.join("har_cot").join(filename);
    let text = std::fs::read_to_string(&file)
        .with_context(|| format!("Cannot read HAR file {file:?}"))?;

    text.lines()
        // Number the lines *before* dropping blank ones so that the number
        // reported on a parse error matches the physical line in the file.
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(idx, line)| {
            // Editors and most tools count lines from 1.
            let line_no = idx + 1;
            let row: HarRow = serde_json::from_str(line)
                .with_context(|| format!("{filename} line {line_no}: parse error"))?;
            row_to_sample(&row)
        })
        .collect()
}
/// Converts one parsed HAR row into a prompt/answer `Sample`.
///
/// For each of the x, y and z axes (in that order) the raw series is passed
/// through `normalize`; the normalized series becomes one entry of
/// `time_series`, and a caption quoting the reported mean and std with four
/// decimals becomes the matching entry of `time_series_text`. The pre-prompt
/// carries the task instructions followed by the comma-separated
/// `ACTIVITIES` list. The trimmed rationale is the answer and the trimmed
/// label is stored as `Some(label)`.
///
/// # Errors
///
/// The current body has no failing path and always returns `Ok`.
fn row_to_sample(row: &HarRow) -> Result<Sample> {
    let axes = [
        ("x-axis", &row.x_axis),
        ("y-axis", &row.y_axis),
        ("z-axis", &row.z_axis),
    ];

    // One caption and one normalized series per axis, kept in axis order.
    let (time_series_text, time_series): (Vec<String>, Vec<_>) = axes
        .iter()
        .map(|(axis_name, raw)| {
            let (scaled, mu, sigma) = normalize(raw);
            let caption = format!(
                "The following is the accelerometer data on the {axis_name}, \
                 it has mean {mu:.4} and std {sigma:.4}:"
            );
            (caption, scaled)
        })
        .unzip();

    // The continuation lines of this literal must stay at column 0: after a
    // newline that is not escaped with `\`, leading whitespace would become
    // part of the prompt text.
    let pre_prompt = format!(
        "\
You are given accelerometer data in all three dimensions. Your task is to classify the activity \
based on analysis of the data.
Instructions:
- Begin by analyzing the time series without assuming a specific label.
- Think step-by-step about what the observed patterns suggest regarding movement intensity and behavior.
- Write your rationale as a single, natural paragraph — do not use bullet points, numbered steps, or section headings.
- Do **not** mention any class label until the final sentence.
Possible activity labels are:
{activity_list}.",
        activity_list = ACTIVITIES.join(", ")
    );

    Ok(Sample {
        pre_prompt,
        time_series_text,
        time_series,
        post_prompt: String::from("Rationale:"),
        answer: row.rationale.trim().to_owned(),
        label: Some(row.label.trim().to_owned()),
    })
}