// opentslm 0.1.0 - Docs.rs

//! HAR chain-of-thought dataset loader — mirrors `HARCoTQADataset` in Python.
//!
//! Human Activity Recognition from 3-axis wrist accelerometer data
//! (WISDM-W dataset, 12 activity classes, 50 Hz, 4-second windows = 200 samples).
//! Each sample pairs x/y/z acceleration windows with a chain-of-thought
//! rationale ending in `"Answer: <label>"`.
//!
//! # JSONL row schema
//!
//! ```json
//! { "x_axis": [...], "y_axis": [...], "z_axis": [...],
//!   "label": "walking",
//!   "rationale": "The wrist accelerometer shows ... Answer: walking" }
//! ```
//!
//! # Curriculum stage
//!
//! Stage 3 — HAR chain-of-thought reasoning.

use std::path::Path;
use anyhow::{Context, Result};
use serde::Deserialize;

use crate::data::batch::{normalize, Sample};

/// Candidate activity labels, joined with `", "` and listed at the end of the
/// classification prompt built for every sample.
///
/// NOTE(review): the module docs mention 12 activity classes, while this list
/// has 8 entries — confirm which is intended.
const ACTIVITIES: &[&str] = &[
    "biking",
    "lying",
    "running",
    "sitting",
    "standing",
    "walking",
    "walking_down",
    "walking_up",
];

/// Deserialised row from a HAR CoT JSONL file.
///
/// Field names match the JSON keys of the row schema shown in the module docs;
/// each non-blank line of a split file is parsed into one `HarRow`.
#[derive(Debug, Deserialize)]
struct HarRow {
    /// Accelerometer readings for the x axis (normalised before use).
    x_axis: Vec<f32>,
    /// Accelerometer readings for the y axis (normalised before use).
    y_axis: Vec<f32>,
    /// Accelerometer readings for the z axis (normalised before use).
    z_axis: Vec<f32>,
    /// Activity label for the window; trimmed and stored as the sample's `label`.
    label: String,
    /// Chain-of-thought rationale; trimmed and stored as the sample's `answer`.
    rationale: String,
}

/// Train / val / test splits for the HAR CoT dataset.
///
/// Returned by [`load_har_splits`]; each field holds the samples read from the
/// correspondingly named JSONL file.
pub struct HarSplits {
    /// Samples loaded from `train.jsonl`.
    pub train: Vec<Sample>,
    /// Samples loaded from `val.jsonl`.
    pub val:   Vec<Sample>,
    /// Samples loaded from `test.jsonl`.
    pub test:  Vec<Sample>,
}

/// Read the three HAR CoT splits stored under `<data_dir>/har_cot/`.
///
/// The files `train.jsonl`, `val.jsonl` and `test.jsonl` are loaded in that
/// order, each one independently of the others; no in-memory re-splitting is
/// performed.
///
/// # Errors
///
/// Returns the first error produced while loading a split; splits after the
/// failing one are not read.
pub fn load_har_splits(data_dir: &Path) -> Result<HarSplits> {
    let train = load_split(data_dir, "train.jsonl")?;
    let val = load_split(data_dir, "val.jsonl")?;
    let test = load_split(data_dir, "test.jsonl")?;

    Ok(HarSplits { train, val, test })
}

/// Load one split file, `<base>/har_cot/<filename>`, into a list of samples.
///
/// The file is read fully into memory and processed line by line. Lines that
/// are empty or whitespace-only are skipped; every other line is parsed as a
/// JSON [`HarRow`] and converted with [`row_to_sample`].
///
/// # Errors
///
/// Fails if the file cannot be read, or if any non-blank line cannot be parsed
/// or converted. Line numbers in error messages are 1-based and refer to the
/// physical line in the file (blank lines included), so they can be looked up
/// directly in an editor.
fn load_split(base: &Path, filename: &str) -> Result<Vec<Sample>> {
    let file = base.join("har_cot").join(filename);
    let text = std::fs::read_to_string(&file)
        .with_context(|| format!("Cannot read HAR file {file:?}"))?;

    text.lines()
        // Number the lines *before* dropping blank ones so the reported
        // position matches the file rather than the filtered sequence.
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(idx, line)| {
            let line_no = idx + 1;
            let row: HarRow = serde_json::from_str(line)
                .with_context(|| format!("{filename} line {line_no}: parse error"))?;
            row_to_sample(&row)
                .with_context(|| format!("{filename} line {line_no}: invalid sample"))
        })
        .collect()
}

/// Turn one parsed JSONL row into a prompt/answer [`Sample`].
///
/// Each of the three axes is passed through `normalize`, which yields the
/// normalised series together with a mean and std; the latter two are quoted
/// (four decimal places) in a per-axis caption. The sample's pre-prompt lists
/// every entry of `ACTIVITIES`, the post-prompt is `"Rationale:"`, the answer
/// is the trimmed rationale and the label is the trimmed row label.
///
/// The visible code path always returns `Ok`.
fn row_to_sample(row: &HarRow) -> Result<Sample> {
    let channels = [
        ("x-axis", &row.x_axis),
        ("y-axis", &row.y_axis),
        ("z-axis", &row.z_axis),
    ];

    // Build the caption and the normalised series for each axis in one pass,
    // then split the pairs into the two parallel vectors `Sample` expects.
    let (time_series_text, time_series): (Vec<_>, Vec<_>) = channels
        .iter()
        .map(|(axis_name, samples)| {
            let (normalized, mean, std) = normalize(samples);
            let caption = format!(
                "The following is the accelerometer data on the {axis_name}, \
                 it has mean {mean:.4} and std {std:.4}:"
            );
            (caption, normalized)
        })
        .unzip();

    let activity_list = ACTIVITIES.join(", ");
    let pre_prompt = format!(
        "\
You are given accelerometer data in all three dimensions. Your task is to classify the activity \
based on analysis of the data.

Instructions:
- Begin by analyzing the time series without assuming a specific label.
- Think step-by-step about what the observed patterns suggest regarding movement intensity and behavior.
- Write your rationale as a single, natural paragraph — do not use bullet points, numbered steps, or section headings.
- Do **not** mention any class label until the final sentence.

Possible activity labels are:
{activity_list}."
    );

    let answer = row.rationale.trim().to_owned();
    let label = Some(row.label.trim().to_owned());

    Ok(Sample {
        pre_prompt,
        time_series_text,
        time_series,
        post_prompt: String::from("Rationale:"),
        answer,
        label,
    })
}