// opentslm 0.1.0 - Docs.rs

//! opentslm — Rust / Burn / llama.cpp port of the OpenTSLM time-series
//! language model.
//!
//! # Quick start
//!
//! ```text
//! cargo run --release -- train
//! ```
//!
//! Model weights (~2.5 GB GGUF) and all training datasets are downloaded
//! automatically on first run; no manual setup is needed.
//!
//! # Architecture overview
//!
//! ```text
//! Raw time series  [B, L]
//!       │
//!       ▼
//! TransformerCnnEncoder  (Burn, trainable)
//!   Conv1d patch embedding (stride = PATCH_SIZE)
//!   Learnable positional embeddings
//!   TransformerEncoder (6 layers, 8 heads, GELU)
//!       │  [B, N_patches, 128]
//!       ▼
//! mean-pool + LogitBiasHead (Linear 128 → vocab)
//!       │  [B, vocab]  — additive logit bias
//!       ▼
//! Frozen Qwen3/Llama GGUF  (llama-cpp-4, NOT in autodiff graph)
//! ```
//!
//! Only the encoder and logit-head carry learnable parameters.  The LLM
//! backbone is a quantised GGUF file loaded through `llama-cpp-4`; it is
//! **never** part of Burn's autodiff graph.
//!
//! # Curriculum stages
//!
//! | Stage | Task | Dataset |
//! |-------|------|---------|
//! | 1 | MCQ on time series | TSQA (WISDM-W) |
//! | 2 | Time-series captioning | M4 (WISDM-W) |
//! | 3 | HAR chain-of-thought | HAR CoT (WISDM-W) |
//! | 4 | Sleep-stage CoT | SleepEDF |
//! | 5 | ECG QA CoT | Synthetic PTB-XL-style |
//!
//! # GPU acceleration
//!
//! The default GPU backend is selected automatically by platform:
//!
//! | Platform | Default | Override |
//! |----------|---------|---------|
//! | macOS | Metal *(always on Apple)* | `--features vulkan` (needs MoltenVK) |
//! | Linux / Windows | Vulkan | `--features cuda` (needs CUDA toolkit) |
//!
//! ```text
//! cargo build --release                            # defaults (Metal / Vulkan)
//! cargo build --release --features cuda            # + CUDA on Linux/Windows
//! cargo build --release --features vulkan          # + Vulkan on macOS via MoltenVK
//! cargo build --release --features vulkan --features cuda  # Vulkan + CUDA
//! ```
//!
//! See [`model::llm::llama_cpp`] for full details including the MoltenVK
//! setup steps required for `--features vulkan` on macOS.
//!
//! # Modules
//!
//! - [`config`]   — all hyper-parameters and compile-time constants
//! - [`data`]     — dataset loaders, batch utilities, optional auto-downloader
//! - [`model`]    — encoder, MLP projector, llama.cpp wrapper, OpenTSLMSP
//! - [`training`] — curriculum trainer, metrics collection, SVG/HTML plots

// wgpu-core 26 has deeply-nested Send/Sync chains that overflow the default
// recursion limit of 128 in the Rust trait solver.
#![recursion_limit = "256"]

// opentslm — zero-configuration quick start:
//
//   cargo run --release -- train
//
// Model  : downloaded automatically from HuggingFace on first run (~2.5 GB)
// Data   : downloaded automatically if `data/` is missing or incomplete
//
// All flags are optional — sensible defaults are chosen for everything.
#![allow(dead_code, unused_imports, unused_variables)]

mod config;
mod data;
mod model;
mod training;

use std::path::PathBuf;

use anyhow::Result;
use burn::backend::{wgpu::WgpuDevice, Autodiff, Wgpu};
use clap::{Parser, Subcommand};
use tracing_subscriber::EnvFilter;

// Concrete Burn backends used by the sub-commands in this file.
// `WgpuBackend` is the wgpu backend parameterised with `f32` and `i32`
// element types; `AutoWgpu` wraps it in `Autodiff` and is the backend the
// trainer and the inference model below are instantiated with.
type WgpuBackend = Wgpu<f32, i32>;
type AutoWgpu    = Autodiff<WgpuBackend>;

// ── CLI ───────────────────────────────────────────────────────────────────────

// Top-level command-line interface, parsed once at the start of `main`.
// NOTE: in a clap derive struct the `///` comments and the `about` /
// `long_about` strings are user-visible help text, so maintainer notes in
// this block are written as plain `//` comments.
#[derive(Parser)]
#[command(
    name = "opentslm",
    about = "OpenTSLM — Time-Series Language Model (Rust / Burn / llama.cpp)",
    long_about = "\
Zero-configuration quick start:

  opentslm train

Model  — downloaded automatically from HuggingFace on first run and cached in
         ~/.cache/huggingface/hub/.  Set $HF_TOKEN for private repos.
Data   — downloaded automatically if data/ is missing or incomplete.
         All sources are public; no login required."
)]
struct Cli {
    // The selected sub-command; `main` matches on it and calls the
    // corresponding `cmd_*` function.
    #[command(subcommand)]
    command: Commands,

    // `global = true` lets the flag appear after the sub-command as well.
    // The value is handed to `EnvFilter::new` in `main` and is only used
    // when `$RUST_LOG` is not set.
    /// Log level: error | warn | info | debug | trace
    #[arg(long, global = true, default_value = "info")]
    log_level: String,
}

// One variant per CLI sub-command.  `main` destructures each variant and
// forwards its fields to the matching `cmd_*` function.
// NOTE: the `///` comments below are rendered by clap as `--help` text;
// maintainer notes are therefore written as plain `//` comments.
#[derive(Subcommand)]
enum Commands {
    /// Run curriculum training (all stages, or a subset).
    /// Model and data are downloaded automatically if needed.
    Train {
        // `None` makes `cmd_train` fall back to `resolve_model`'s default.
        /// Path to a GGUF model file.
        /// Default: Qwen3-4B-Q4_K_M downloaded from HuggingFace into the HF cache.
        #[arg(long)]
        model: Option<PathBuf>,

        /// Directory containing (or to receive) the JSONL training datasets.
        /// Default: data/  — created and populated automatically if missing.
        #[arg(long, default_value = "data/")]
        data_dir: PathBuf,

        // `num_args = 1..` accepts several values after one `--stages`;
        // `value_delimiter = ' '` additionally splits a single quoted
        // argument such as "stage1_mcq stage2_captioning".
        /// Stages to run, e.g. --stages stage1_mcq stage2_captioning
        /// Default: all five stages.
        #[arg(long, num_args = 1.., value_delimiter = ' ')]
        stages: Option<Vec<String>>,

        // When set, overwrites `trainer.batch_size` in `cmd_train`.
        /// Override batch size (default: 4).
        #[arg(long)]
        batch_size: Option<usize>,
    },

    /// Evaluate a trained checkpoint on a stage's test set.
    /// Model and data are downloaded automatically if needed.
    Eval {
        /// Path to a GGUF model file.  Default: HF-cached Qwen3-4B Q4_K_M.
        #[arg(long)]
        model: Option<PathBuf>,

        /// Dataset directory.  Default: data/
        #[arg(long, default_value = "data/")]
        data_dir: PathBuf,

        // Required (no default); passed unchanged to `run_stage`.
        /// Stage to evaluate, e.g. stage1_mcq
        #[arg(long)]
        stage: String,
    },

    /// Ask the model a question about a time series.
    /// Model is downloaded automatically if needed (no data required).
    Infer {
        /// Path to a GGUF model file.  Default: HF-cached Qwen3-4B Q4_K_M.
        #[arg(long)]
        model: Option<PathBuf>,

        // Kept as a raw string here; `cmd_infer` splits it on ',' and
        // parses each piece as `f32`.
        /// Comma-separated float values, e.g. "0.1,0.2,-0.1,0.5"
        #[arg(long)]
        series: String,

        /// Text prompt / question
        #[arg(long)]
        prompt: String,

        /// Maximum tokens to generate (default: 200)
        #[arg(long, default_value_t = 200)]
        max_tokens: usize,
    },

    /// Re-generate metric plots from previously saved metrics.csv files.
    /// Does not require a model, GPU, or training data.
    ///
    /// Example:
    ///   opentslm plot
    ///   opentslm plot --stages stage1_mcq stage2_captioning
    ///   opentslm plot --figures-dir /tmp/my_figures
    Plot {
        // Same multi-value parsing as `Train::stages`.
        /// Stages to plot.  Default: all five curriculum stages.
        #[arg(long, num_args = 1.., value_delimiter = ' ')]
        stages: Option<Vec<String>>,

        /// Root directory that contains the per-stage metrics.csv files.
        /// Default: figures/
        #[arg(long, default_value = "figures/")]
        figures_dir: PathBuf,
    },

    // Compiled in only with the `download` feature; `main` has a matching
    // cfg-gated match arm.
    /// Download and convert wearable datasets from HuggingFace.
    /// Called automatically by train/eval when data is missing.
    /// Requires the `download` feature (enabled by default in release builds).
    #[cfg(feature = "download")]
    DownloadData {
        /// Output directory.  Default: data/
        #[arg(long, default_value = "data/")]
        out_dir: PathBuf,

        /// Cap each source at N rows (e.g. 500 for a quick smoke-test)
        #[arg(long)]
        limit: Option<usize>,

        // clap rejects any value outside this fixed list at parse time.
        /// Download only one source: har | sleep | ecg
        #[arg(long, value_parser = ["har", "sleep", "ecg"])]
        only: Option<String>,
    },
}

// ── Main ──────────────────────────────────────────────────────────────────────

/// Program entry point: parse the CLI, install the tracing subscriber, and
/// dispatch to the selected sub-command.
fn main() -> Result<()> {
    let cli = Cli::parse();

    // Choosing the log filter:
    //   * `$RUST_LOG`, when set and valid, wins outright.
    //   * Otherwise `--log-level` is used.  Unless the crate was compiled
    //     with the `verbose` feature, the cubecl autotune / cache targets are
    //     additionally capped at WARN so their chatter does not bury the
    //     training progress output.
    let env_filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| {
        #[cfg(not(feature = "verbose"))]
        {
            // Base level first, then the per-target caps for cubecl.
            let directives = format!(
                "{level},\
                 cubecl_runtime::tune=warn,\
                 cubecl_wgpu=warn",
                level = cli.log_level,
            );
            EnvFilter::new(directives)
        }
        #[cfg(feature = "verbose")]
        {
            EnvFilter::new(&cli.log_level)
        }
    });

    tracing_subscriber::fmt().with_env_filter(env_filter).init();

    // Each arm hands the variant's fields straight to its handler.
    match cli.command {
        Commands::Train { model, data_dir, stages, batch_size } => {
            cmd_train(model, data_dir, stages, batch_size)
        }
        Commands::Eval { model, data_dir, stage } => {
            cmd_eval(model, data_dir, stage)
        }
        Commands::Infer { model, series, prompt, max_tokens } => {
            cmd_infer(model, series, prompt, max_tokens)
        }
        Commands::Plot { stages, figures_dir } => {
            cmd_plot(stages, figures_dir)
        }
        #[cfg(feature = "download")]
        Commands::DownloadData { out_dir, limit, only } => {
            cmd_download_data(out_dir, limit, only)
        }
    }
}

// ── Sub-commands ──────────────────────────────────────────────────────────────

/// `train` sub-command: resolve the model, make sure the datasets exist, run
/// the requested curriculum stages in order, then refresh the combined
/// curriculum overview plot.
///
/// When `stages` is `None`, every entry of `config::CURRICULUM_STAGES` is
/// run.  A failing stage aborts the command; a failure to write the overview
/// plot is only logged as a warning.
fn cmd_train(
    model:      Option<PathBuf>,
    data_dir:   PathBuf,
    stages:     Option<Vec<String>>,
    batch_size: Option<usize>,
) -> Result<()> {
    use training::curriculum::CurriculumTrainer;
    use training::metrics::{StageMetrics, plot_curriculum_overview};

    let model = resolve_model(model)?;
    ensure_data(&data_dir)?;

    tracing::info!("Model    : {}", model.display());
    tracing::info!("Data dir : {}", data_dir.display());

    let mut trainer = CurriculumTrainer::new(&model, &data_dir, "wgpu");
    if let Some(bs) = batch_size {
        trainer.batch_size = bs;
    }

    let selected: Vec<String> = match stages {
        Some(list) => list,
        None => config::CURRICULUM_STAGES.iter().map(|s| s.to_string()).collect(),
    };
    for name in &selected {
        trainer.run_stage::<AutoWgpu>(name)?;
    }

    // Rebuild the overview from whatever per-stage CSV files are on disk, so
    // it covers stages from earlier invocations as well as this one.  Stages
    // whose CSV cannot be loaded are simply left out.
    let figures_root = PathBuf::from("figures");
    let saved: Vec<StageMetrics> = config::CURRICULUM_STAGES
        .iter()
        .filter_map(|name| StageMetrics::from_csv(name, &figures_root).ok())
        .collect();
    if saved.is_empty() {
        return Ok(());
    }

    let saved_refs: Vec<&StageMetrics> = saved.iter().collect();
    if let Err(e) = plot_curriculum_overview(&saved_refs, &figures_root) {
        tracing::warn!("Could not write curriculum overview: {e}");
    }
    Ok(())
}

/// `eval` sub-command: resolve the model, make sure the datasets exist, then
/// run the single named stage through a fresh `CurriculumTrainer`.
///
/// NOTE(review): this goes through the same `run_stage` entry point that
/// `cmd_train` uses — confirm that `run_stage` performs evaluation here.
fn cmd_eval(model: Option<PathBuf>, data_dir: PathBuf, stage: String) -> Result<()> {
    use training::curriculum::CurriculumTrainer;

    let model_path = resolve_model(model)?;
    ensure_data(&data_dir)?;

    CurriculumTrainer::new(&model_path, &data_dir, "wgpu").run_stage::<AutoWgpu>(&stage)
}

fn cmd_infer(
    model:      Option<PathBuf>,
    series_str: String,
    prompt:     String,
    max_tokens: usize,
) -> Result<()> {
    use model::llm::{llama_cpp::LlamaCppBackend, opentslm_sp::OpenTslmSp};

    let model  = resolve_model(model)?;
    let device = WgpuDevice::default();
    let llm    = LlamaCppBackend::load(&model, config::N_GPU_LAYERS, config::CTX_SIZE)?;

    let sp_model: OpenTslmSp<AutoWgpu> = OpenTslmSp::new(&llm, &device);

    let series: Vec<f32> = series_str
        .split(',')
        .map(|s| s.trim().parse::<f32>().unwrap_or(0.0))
        .collect();

    let sample = data::batch::Sample {
        pre_prompt:       prompt,
        time_series_text: vec!["Time series data:".to_string()],
        time_series:      vec![series],
        post_prompt:      String::new(),
        answer:           String::new(),
        label:            None,
    };

    let outputs = sp_model.generate(&[sample], &llm, max_tokens, &device);
    println!("\n─── Model Output ────────────────────────────────────────");
    println!("{}", outputs.into_iter().next().unwrap_or_default());
    println!("─────────────────────────────────────────────────────────");
    Ok(())
}

/// `plot` sub-command: rebuild the per-stage plots and HTML index from the
/// saved metrics CSV files under `figures_dir`, then the combined overview.
///
/// Stages whose CSV cannot be loaded are skipped with a warning.  If no stage
/// could be loaded at all the command fails; an error while saving a loaded
/// stage's plots or HTML index also aborts.  A failure to write the combined
/// overview is only logged.
fn cmd_plot(stages: Option<Vec<String>>, figures_dir: PathBuf) -> Result<()> {
    use training::metrics::{StageMetrics, plot_curriculum_overview, write_html_index};

    let wanted: Vec<String> = match stages {
        Some(list) => list,
        None => config::CURRICULUM_STAGES.iter().map(|s| s.to_string()).collect(),
    };

    let mut rendered: Vec<StageMetrics> = Vec::new();
    for stage in &wanted {
        let metrics = match StageMetrics::from_csv(stage, &figures_dir) {
            Ok(m) => m,
            Err(e) => {
                tracing::warn!("{stage}: skipped — {e}");
                continue;
            }
        };
        metrics.save(&figures_dir)?;
        write_html_index(&metrics, &figures_dir)?;
        tracing::info!("{stage}: plots written → {}/", figures_dir.join(stage).display());
        rendered.push(metrics);
    }

    // Nothing loaded means there is nothing to plot.
    if rendered.is_empty() {
        anyhow::bail!(
            "No metrics.csv files found in {}.\n\
             Run training first:  cargo run --release -- train",
            figures_dir.display()
        );
    }

    // Overview across every stage that loaded successfully.
    let rendered_refs: Vec<&StageMetrics> = rendered.iter().collect();
    if let Err(e) = plot_curriculum_overview(&rendered_refs, &figures_dir) {
        tracing::warn!("Could not write curriculum overview: {e}");
    }
    Ok(())
}

/// `download-data` sub-command: log the target directory and hand the three
/// CLI options to the downloader unchanged.
///
/// Only compiled when the `download` feature is enabled.
#[cfg(feature = "download")]
fn cmd_download_data(
    out_dir: PathBuf,
    limit:   Option<usize>,
    only:    Option<String>,
) -> Result<()> {
    use data::downloader::{run, DownloadConfig};

    tracing::info!("Downloading wearable datasets → {}", out_dir.display());

    let request = DownloadConfig { out_dir, limit, only };
    run(&request)
}

// ── Data auto-provisioning ────────────────────────────────────────────────────

/// The JSONL files that must exist for a complete dataset.
///
/// Paths are relative to the data directory.  [`ensure_data`] only tests each
/// path for existence; file contents are not inspected.
const REQUIRED_DATA_FILES: &[&str] = &[
    // Only a training file is required for these two datasets.
    "tsqa/train.jsonl",
    "m4/train_samples.jsonl",
    // The chain-of-thought datasets each require train / val / test splits.
    "har_cot/train.jsonl",
    "har_cot/val.jsonl",
    "har_cot/test.jsonl",
    "sleep_cot/train.jsonl",
    "sleep_cot/val.jsonl",
    "sleep_cot/test.jsonl",
    "ecg_qa_cot/train.jsonl",
    "ecg_qa_cot/val.jsonl",
    "ecg_qa_cot/test.jsonl",
];

/// Return the entries of [`REQUIRED_DATA_FILES`] that do not exist under `data_dir`.
fn missing_data_files(data_dir: &std::path::Path) -> Vec<&'static str> {
    REQUIRED_DATA_FILES
        .iter()
        .copied()
        .filter(|rel| !data_dir.join(rel).exists())
        .collect()
}

/// Check that all required JSONL files exist under `data_dir`.
///
/// If every file is present, returns immediately.  If one or more are missing:
/// - When the crate was compiled with the `download` feature, the downloader
///   is invoked for `data_dir`.  Afterwards the directory is checked again and
///   any file that is still absent is reported with a warning (the call still
///   succeeds, leaving it to the stage that needs the file to fail).
/// - Otherwise an [`anyhow::Error`] is returned explaining how to build with
///   `--features download` or provide the files manually; the missing files
///   themselves are listed in the log.
///
/// # Errors
///
/// Propagates any downloader error, or fails as described above when the
/// `download` feature is disabled.
fn ensure_data(data_dir: &std::path::Path) -> Result<()> {
    let missing = missing_data_files(data_dir);
    if missing.is_empty() {
        return Ok(());
    }

    // Only announce a download when this build can actually perform one.
    if cfg!(feature = "download") {
        tracing::info!(
            "Data directory {:?} is missing {} file(s) — downloading now …",
            data_dir,
            missing.len(),
        );
    } else {
        tracing::warn!(
            "Data directory {:?} is missing {} file(s)",
            data_dir,
            missing.len(),
        );
    }
    for f in &missing {
        tracing::info!("  missing: {f}");
    }

    #[cfg(feature = "download")]
    {
        data::downloader::run(&data::downloader::DownloadConfig {
            out_dir: data_dir.to_path_buf(),
            limit:   None,
            only:    None,
        })?;

        // A successful return from the downloader does not prove that every
        // required file was produced; surface anything still absent now
        // rather than as an obscure failure in a later stage.
        let still_missing = missing_data_files(data_dir);
        if !still_missing.is_empty() {
            tracing::warn!(
                "Download finished but {} required file(s) are still missing from {:?}",
                still_missing.len(),
                data_dir,
            );
            for f in &still_missing {
                tracing::warn!("  still missing: {f}");
            }
        }
        Ok(())
    }

    #[cfg(not(feature = "download"))]
    {
        anyhow::bail!(
            "Dataset files are missing and the binary was built without the \
             `download` feature.\n\
             Re-build with: cargo build --features download\n\
             Or place the required JSONL files in {:?} manually.",
            data_dir
        )
    }
}

// ── HuggingFace model resolution ─────────────────────────────────────────────

/// Decide which GGUF file to use.
///
/// A caller-supplied path (`Some`) is handed back untouched — no existence
/// check and no network access.  With `None`, the default model
/// ([`DEFAULT_MODEL_REPO`](config::DEFAULT_MODEL_REPO) /
/// [`DEFAULT_MODEL_FILE`](config::DEFAULT_MODEL_FILE)) is obtained via
/// [`hf_get`].
fn resolve_model(explicit: Option<PathBuf>) -> Result<PathBuf> {
    match explicit {
        Some(path) => Ok(path),
        None => hf_get(config::DEFAULT_MODEL_REPO, config::DEFAULT_MODEL_FILE),
    }
}

/// Fetch `filename` from HuggingFace `repo` through the local hub cache.
///
/// The file is requested with hf-hub's `get`, which is expected to return
/// the cached copy when one exists and download it otherwise.  The download
/// location follows the standard HuggingFace cache convention:
/// `~/.cache/huggingface/hub/models--<repo>/snapshots/<hash>/<filename>`.
///
/// If the `HF_TOKEN` environment variable is set to a non-empty value it is
/// used as the access token (needed for private or gated repositories);
/// otherwise the hub client's default token lookup applies.
///
/// # Errors
///
/// Returns an error if the hub client cannot be initialised or the file
/// cannot be fetched; the latter message includes manual-download hints.
fn hf_get(repo: &str, filename: &str) -> Result<PathBuf> {
    use hf_hub::api::sync::ApiBuilder;

    // `ApiBuilder::new()` is what `Api::new()` uses internally.  Forward
    // `$HF_TOKEN` explicitly so the documented way of authenticating works
    // regardless of the hub client's own defaults, and only override the
    // default token when the variable actually carries one.
    let mut builder = ApiBuilder::new();
    let env_token = std::env::var("HF_TOKEN")
        .ok()
        .map(|t| t.trim().to_string())
        .filter(|t| !t.is_empty());
    if let Some(token) = env_token {
        builder = builder.with_token(Some(token));
    }

    let api = builder
        .build()
        .map_err(|e| anyhow::anyhow!("HF Hub init failed: {e}"))?;

    tracing::info!("Resolving {repo}/{filename} (downloading to HF cache if needed) …");

    let path = api
        .model(repo.to_string())
        .get(filename)
        .map_err(|e| anyhow::anyhow!(
            "Could not fetch {repo}/{filename}: {e}\n\
             \n\
             To download manually:\n  huggingface-cli download {repo} {filename}\n\
             To use a local file:\n  --model /path/to/{filename}"
        ))?;

    tracing::info!("Model  : {}", path.display());
    Ok(path)
}