taxa-core 0.1.0

//! The query-capable binding schema.
//!
//! There are two types here, with a deliberate split:
//!
//! - [`Dataset`] is the PUBLIC, on-disk manifest: the SHARED fields (title,
//!   axes, filters, entity nouns, loading knobs) plus the named `sources`, the
//!   `frames` map (REQUIRED — the only place per-frame source/id/label/metrics/
//!   timestamp data lives) and the optional `views` map. It has NO flat or
//!   `series_*` fields — there is ONE canonical manifest shape.
//!
//! - [`FrameDataset`] is the INTERNAL, flat per-frame view the ENGINE consumes
//!   (treemap/series/query/adapter all take `&FrameDataset`). It is synthesized
//!   by [`Dataset::frame_dataset`] — combining one [`Frame`] with the dataset's
//!   shared fields — and is never authored or serialized into a manifest file.

use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use serde_json::Value as Json;

/// Aggregations and how they decompose (so an "Other" fold is correct).
///
/// The three lists below partition the eight aggregation names that
/// `Metric::validate` accepts as a cross-sectional `agg`/`rollup`.
///
/// Aggregation names classed as additive.
pub const ADDITIVE: &[&str] = &["sum", "count", "min", "max"];
/// Aggregation names classed as sufficient-statistic aggregations.
pub const SUFFICIENT_STAT: &[&str] = &["mean", "weighted_mean"];
/// Aggregation names classed as non-decomposable.
pub const NON_DECOMPOSABLE: &[&str] = &["count_distinct", "median"];

/// Serde default for `Metric::agg`.
fn default_sum() -> String {
    String::from("sum")
}

/// Serde default for `Metric::unit`.
fn default_number() -> String {
    String::from("number")
}

/// Serde default for `Metric::kind`.
fn default_additive() -> String {
    String::from("additive")
}

/// Serde default for `Pick::take`.
fn default_last() -> String {
    String::from("last")
}

/// Serde default for `Metric::null_policy`.
fn default_drop() -> String {
    String::from("drop")
}

/// Serde default for a filter's `type`.
fn default_categorical() -> String {
    String::from("categorical")
}

/// Serde default for the dataset `title`.
fn default_title() -> String {
    String::from("taxa")
}

/// Serde default for `entity_noun`.
fn default_entity() -> String {
    String::from("entity")
}

/// Serde default for `entity_noun_plural`.
fn default_entities() -> String {
    String::from("entities")
}

/// Which row to keep per entity-grain group (a `kind:"entity"` metric). `by` is
/// the ordering column; `take` selects the latest/earliest along it. See
/// docs/METRIC_SEMANTICS.md.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Pick {
    /// Name of the ordering column. Required — it has no serde default.
    pub by: String,
    /// Which end of the `by` ordering to keep. Defaults to `"last"` when the
    /// key is absent. NOTE(review): `Metric::validate` does not inspect this
    /// value, so a misspelled `take` is not rejected there.
    #[serde(default = "default_last")]
    pub take: String, // last | first (max | min aliases: by the `by` column)
}

/// One metric definition, as carried in a frame's `metrics` list. `kind`
/// selects the semantics; the optional fields apply to particular kinds or
/// aggregations as noted on each. `Metric::validate` checks the combination.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Metric {
    /// Identifier the metric is looked up by (see `FrameDataset::metric`).
    pub id: String,
    /// What KIND of quantity this is (docs/METRIC_SEMANTICS.md):
    /// - `additive` (default): sums along any axis; `agg` governs.
    /// - `entity`: an attribute of an entity at a declared `grain`, never
    ///   decomposed — dedup to one row per grain group (`pick`), then `rollup`
    ///   up the hierarchy. Generalizes the old `agg:"last"`+`entity_column`.
    /// - `ratio` (Stage C): a `numerator`/`denominator` over two metrics,
    ///   evaluated AFTER aggregation so it's correct at every node.
    #[serde(default = "default_additive")]
    pub kind: String,
    /// The cross-sectional aggregation of a non-entity metric (an entity metric
    /// uses `rollup` instead — see `Metric::cross_agg`). Defaults to `"sum"`.
    #[serde(default = "default_sum")]
    pub agg: String,
    /// The WITHIN-entity, ACROSS-TIME aggregation a coarse series bucket applies
    /// BEFORE the cross-sectional `agg` (`last` | `mean` | `sum` | `min` | `max`).
    /// A *stock* metric (e.g. market cap) is `last`: a month's value for an entity
    /// is its latest weekly snapshot, not the sum of its ~4 weekly rows. A *flow*
    /// metric (e.g. volume) is `sum`. Default (`None`) = the metric's `agg`, so
    /// existing single-stage behavior is unchanged. Only consulted by `series()`
    /// when the requested resolution is coarser than the source cadence.
    /// ORTHOGONAL to `kind`: `time_agg` is the across-TIME fold; `kind` governs
    /// the cross-sectional/decomposition behavior.
    ///
    /// `Metric::validate` additionally accepts `first` and `median` here, and
    /// `Metric::resolved_time_agg` defaults an entity metric to `last` rather
    /// than to its `agg`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub time_agg: Option<String>,
    /// Optional column name. NOTE(review): presumably the source column the
    /// aggregation reads — this file does not show its use; confirm in the engine.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub column: Option<String>,
    /// Expression AST (see `formula.rs`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub formula: Option<Json>,
    /// Weight column name. `Metric::validate` requires it whenever the
    /// cross-sectional aggregation is `weighted_mean`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub weight_column: Option<String>,
    /// `kind:"entity"` — the entity grain to dedup by (e.g. `["org_uuid"]` or
    /// `["country"]`). The value is taken from each group's `pick` row AMONG THE
    /// FILTERED ROWS, then `rollup`'d up the hierarchy — so an entity never
    /// double-counts across sub-axes, and a time filter yields its in-window row.
    /// `Metric::validate` rejects an entity metric whose `grain` is empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub grain: Vec<String>,
    /// `kind:"entity"` — which row to keep per grain group (default: latest by
    /// the ordering column). Omit for a single-row-per-grain source.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pick: Option<Pick>,
    /// `kind:"entity"` — how the per-entity values combine up the hierarchy
    /// (`sum` | `mean` | `min` | `max` | `count`). Default `sum`.
    ///
    /// NOTE(review): `Metric::validate` checks this against the same list as
    /// `agg`, so `weighted_mean`, `count_distinct` and `median` also pass.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub rollup: Option<String>,
    /// `kind:"ratio"` (Stage C) — metric ids of the numerator/denominator.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub numerator: Option<String>,
    /// `kind:"ratio"` — the denominator metric id. `Metric::validate` requires
    /// both `numerator` and `denominator` on a ratio metric.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub denominator: Option<String>,
    /// Unit tag for the metric's values. Defaults to `"number"`.
    #[serde(default = "default_number")]
    pub unit: String,
    /// Optional label. NOTE(review): presumably the display name shown in the
    /// UI — confirm against the frontend.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub label: Option<String>,
    /// Null handling policy. Defaults to `"drop"`; not checked by
    /// `Metric::validate`.
    #[serde(default = "default_drop")]
    pub null_policy: String, // drop | zero
}

impl Metric {
    /// The across-time fold `series()` applies within one (entity, coarse
    /// bucket) pair.
    ///
    /// An explicit `time_agg` always wins. Without one, an entity metric falls
    /// back to `"last"` (keep the latest snapshot instead of summing repeated
    /// rows) and every other metric falls back to its own `agg`.
    pub fn resolved_time_agg(&self) -> &str {
        match self.time_agg.as_deref() {
            Some(explicit) => explicit,
            None if self.is_entity() => "last",
            None => self.agg.as_str(),
        }
    }

    /// `true` when `kind` is exactly `"entity"`.
    pub fn is_entity(&self) -> bool {
        matches!(self.kind.as_str(), "entity")
    }

    /// `true` when `kind` is exactly `"ratio"`.
    pub fn is_ratio(&self) -> bool {
        matches!(self.kind.as_str(), "ratio")
    }

    /// The CROSS-SECTIONAL aggregation — how values combine across entities and
    /// up the hierarchy. Entity metrics use `rollup` (falling back to `"sum"`);
    /// everything else uses `agg`.
    pub fn cross_agg(&self) -> &str {
        if !self.is_entity() {
            return self.agg.as_str();
        }
        match &self.rollup {
            Some(rollup) => rollup.as_str(),
            None => "sum",
        }
    }

    /// Check that the metric's fields form a usable combination.
    ///
    /// Checks run in a fixed order and the first failure is returned as a
    /// human-readable message:
    /// 1. `kind` is one of `additive` / `entity` / `ratio`;
    /// 2. the cross-sectional aggregation (`rollup` for an entity metric,
    ///    `agg` otherwise) is a known aggregation (`last` is not one — a stock
    ///    metric is expressed as `kind:"entity"`);
    /// 3. an entity metric declares a non-empty `grain`;
    /// 4. `weighted_mean` comes with a `weight_column`;
    /// 5. a ratio metric names both `numerator` and `denominator`;
    /// 6. an explicit `time_agg` is a known per-entity time fold.
    pub fn validate(&self) -> Result<(), String> {
        let kind_known = matches!(self.kind.as_str(), "additive" | "entity" | "ratio");
        if !kind_known {
            return Err(format!(
                "unknown metric kind {:?} (additive|entity|ratio)",
                self.kind
            ));
        }

        let entity = self.is_entity();
        let cross = self.cross_agg();
        let cross_known = matches!(
            cross,
            "sum"
                | "count"
                | "min"
                | "max"
                | "mean"
                | "weighted_mean"
                | "count_distinct"
                | "median"
        );
        if !cross_known {
            // Name the field the offending value actually came from.
            let field = if entity { "rollup" } else { "agg" };
            return Err(format!("unknown {} {:?}", field, cross));
        }

        if entity && self.grain.is_empty() {
            return Err(format!(
                "entity metric {:?} requires a non-empty `grain`",
                self.id
            ));
        }

        if cross == "weighted_mean" && self.weight_column.is_none() {
            return Err("weighted_mean requires weight_column".into());
        }

        let ratio_operands_present = self.numerator.is_some() && self.denominator.is_some();
        if self.is_ratio() && !ratio_operands_present {
            return Err(format!(
                "ratio metric {:?} requires `numerator` and `denominator`",
                self.id
            ));
        }

        // time_agg is a simple per-entity time fold (no sufficient-stat machinery).
        match self.time_agg.as_deref() {
            Some(ta)
                if !matches!(
                    ta,
                    "last" | "first" | "mean" | "median" | "sum" | "min" | "max"
                ) =>
            {
                Err(format!("unknown time_agg {ta:?}"))
            }
            _ => Ok(()),
        }
    }
}

/// Serde default for `PathSpec::sep`: a forward slash.
fn default_sep() -> String {
    "/".to_owned()
}

/// A single column holding a delimited path (`a/b/c.txt`) split into a variable
/// number of hierarchy components. When an axis carries a `PathSpec`, its
/// `levels` are DERIVED from the data (max component count) rather than authored.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PathSpec {
    /// Name of the column holding the delimited path. Required.
    pub column: String,
    /// Separator the path is split on. Defaults to `"/"` when absent.
    #[serde(default = "default_sep")]
    pub sep: String,
}

/// One hierarchy axis shared by every frame of a dataset: either a fixed-depth
/// list of `levels` or a variable-depth `path`, plus optional per-axis UI and
/// filtering hints.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Axis {
    /// Identifier the axis is looked up by (see `Dataset::axis`).
    pub id: String,
    /// Physical column names forming a fixed-depth hierarchy. For a path axis
    /// (`path.is_some()`) this is DERIVED, not authored, and may be empty/absent.
    #[serde(default)]
    pub levels: Vec<String>,
    /// Human label per level, parallel to `levels` (e.g. ["Sector","Company","Round"]).
    /// Used by search to tag each node ("Company Stripe"). Missing/short → the column
    /// name is humanized as a fallback.
    #[serde(default)]
    pub level_labels: Vec<String>,
    /// Optional label for the axis as a whole. NOTE(review): presumably the
    /// display name shown in the axis picker — confirm against the frontend.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub label: Option<String>,
    /// When set, this is a variable-depth path axis (see `PathSpec`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub path: Option<PathSpec>,
    /// A per-axis row predicate (a `formula.rs` AST that compiles to a boolean
    /// `Expr`, e.g. `{"op": ">=", "args": [{"col": "mcap_usd"}, {"lit": 1e7}]}`).
    /// Applied — ONLY when this axis is selected — after the base filters/focus
    /// and before grouping, so each axis can carry its own row universe (e.g. the
    /// mcap axis drops sub-$10M entities while geography/gics keep everyone).
    /// Default `None` = no filter (existing behavior unchanged).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub row_filter: Option<Json>,
    /// Optional per-axis default size-by metric id. When the user selects this axis,
    /// the frontend switches the treemap's size-by to this metric (useful when axes
    /// expose disjoint metrics, e.g. a budget axis sized in $ vs an equipment axis
    /// sized in unit counts). `None` keeps the current/global size-by.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub default_size_by: Option<String>,
    /// Optional whitelist of size-by metric ids VALID for this axis. The frontend
    /// greys out (disables) any global size-by metric not in this list, since some
    /// metrics are meaningless for a given bucket (e.g. dollar amounts on an
    /// equipment-model axis). `None` = all global size-by metrics are valid.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub size_by: Option<Vec<String>>,
}

impl Axis {
    /// Display label for level `i`.
    ///
    /// Resolution order:
    /// 1. a non-empty authored `level_labels[i]`, returned verbatim;
    /// 2. otherwise `levels[i]` humanized — underscores become spaces and a
    ///    leading ASCII letter is upper-cased (`org_name` → "Org name"); a
    ///    non-ASCII first character is left as is;
    /// 3. otherwise (no such level) the empty string.
    pub fn level_label(&self, i: usize) -> String {
        let authored = self.level_labels.get(i).filter(|text| !text.is_empty());
        if let Some(text) = authored {
            return text.clone();
        }

        self.levels
            .get(i)
            .map(|column| {
                let spaced = column.replace('_', " ");
                let mut rest = spaced.chars();
                match rest.next() {
                    Some(head) => {
                        let mut humanized = String::with_capacity(spaced.len());
                        humanized.push(head.to_ascii_uppercase());
                        humanized.push_str(rest.as_str());
                        humanized
                    }
                    None => spaced,
                }
            })
            .unwrap_or_default()
    }
}

/// One filter definition shared by every frame of a dataset. The `type`
/// selects the filter family; the `tags_*`/`entity_column`/`tag_column` fields
/// apply to `tags` filters only.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Filter {
    /// Identifier of the filter. `FrameDataset::tag_indices` is keyed by it for
    /// `tags` filters.
    pub id: String,
    /// The main-frame column to filter on. For a `tags` filter this is unused
    /// for matching (tags live in a companion frame); leave it as the entity
    /// column so legacy tooling that reads `column` still gets a real column.
    #[serde(default)]
    pub column: String,
    /// Filter family, serialized under the key `type`. Defaults to
    /// `"categorical"` when absent.
    #[serde(default = "default_categorical")]
    pub r#type: String, // categorical | range | bool | tags
    /// Optional label. NOTE(review): presumably the display name of the
    /// control — confirm against the frontend.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub label: Option<String>,
    /// `tags` filter (multi-valued dimension): the named frame holding the
    /// entity↔tag long table. An entity matches if ANY of its tags is selected,
    /// and is counted ONCE (the match compiles to an `is_in` predicate on the
    /// main frame, never a row-multiplying join). See docs/METRIC_SEMANTICS.md.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tags_frame: Option<String>,
    /// `tags` filter: the join key present on BOTH the main frame and the tag
    /// frame (e.g. `org_uuid`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub entity_column: Option<String>,
    /// `tags` filter: the tag-value column in `tags_frame` (e.g. `category`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tag_column: Option<String>,
    /// Optional UI control override: "select" (typeahead dropdown), "multiselect"
    /// (button row), or "range". Defaults by `type` when unset.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub control: Option<String>,
    /// Optional default selection applied when the view first loads (before any
    /// user interaction). For `select` a scalar (e.g. a year), for `multiselect`
    /// an array. Lets a view open on a specific value instead of "Any".
    /// Stored as raw JSON; its shape is not checked in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub default: Option<serde_json::Value>,
}

/// A named raw source the plan-provider path binds `taxa://<name>` leaves to.
///
/// Tagged by variant name (`{"sql": {...}}`); add new source kinds as variants.
/// Variant names are serialized in snake_case (`Sql` → `sql`) via the
/// `rename_all` attribute below.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SourceSpec {
    /// A Postgres query ingested once at build time (see taxa-sql `SqlSource`).
    /// `dsn` and `query` are both required strings (neither has a serde default).
    Sql { dsn: String, query: String },
}

/// A single data *frame* in a manifest: its own source/transform, id and label
/// columns, metrics, and (for a timestamped frame) the timestamp column with
/// series resolution knobs.
///
/// The SHARED axes, filters, title, etc. live on the top-level [`Dataset`];
/// [`Dataset::frame_dataset`] synthesizes a per-frame [`FrameDataset`] by
/// combining a `Frame` with those shared fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Frame {
    /// The frame's raw source: a key into the dataset's `sources` map, or — for a
    /// plain single-file dataset — an absolute data-file path. `transform`
    /// overrides it (the plan-provider path).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Optional transform; copied unchanged into `FrameDataset::transform`,
    /// where it is described as a serialized `DslPlan`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub transform: Option<String>,
    /// Name of the id column. Required — it has no serde default.
    pub id_column: String,
    /// Optional — a series frame often has no separate label column; defaults to
    /// `id_column` when omitted (resolved in `frame_dataset`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub label_column: Option<String>,
    /// The frame's metrics, in authored order. Defaults to empty.
    #[serde(default)]
    pub metrics: Vec<Metric>,
    /// Name of the timestamp column, if the frame is timestamped. Its presence
    /// is what makes the default views bind a `series` view to a sole frame
    /// (see `Dataset::resolved_views`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timestamp: Option<String>,
    /// Series resolutions; copied into `FrameDataset::series_resolutions`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub resolutions: Option<Vec<String>>,
    /// Default series resolution; copied into
    /// `FrameDataset::series_default_resolution`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub default_resolution: Option<String>,
}

/// A *view* (treemap / scatter / detail / series) binds to a [`Frame`] by name and
/// optionally references other frames for cross-frame behaviors:
/// - `series_frame`: the detail page's time chart reads this (timestamped) frame.
/// - `dims_from`: the series frame is narrow `{id,ts,metric}`; enrich it at serve
///   time with the named frame's axis-level columns (joined on the id column).
/// - `branch_set`: the series Line tab's branch set comes from the named view
///   (`"treemap"`) — the snapshot ranking — not the series frame's own ranking.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct View {
    /// Name of the frame this view is bound to. Required. This file does not
    /// check that the name exists in the dataset's `frames` map.
    pub frame: String,
    /// Name of the frame whose axis-level columns enrich this view's frame.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dims_from: Option<String>,
    /// Name of the view that supplies the branch set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub branch_set: Option<String>,
    /// Name of the timestamped frame the detail page's time chart reads. The
    /// default views set it to the sole frame when that frame has a timestamp.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub series_frame: Option<String>,
}

/// The PUBLIC, on-disk manifest. ONE canonical shape: SHARED fields + named
/// `sources` + a REQUIRED `frames` map + an OPTIONAL `views` map. There are no
/// flat per-frame fields here — those live on [`Frame`], reached via
/// [`Dataset::frame_dataset`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
    /// Axes shared by every frame. Required — the key has no serde default.
    pub axes: Vec<Axis>,
    /// Filters shared by every frame. Defaults to empty.
    #[serde(default)]
    pub filters: Vec<Filter>,
    /// Dataset title. Defaults to `"taxa"`.
    #[serde(default = "default_title")]
    pub title: String,
    /// Singular noun for one entity. Defaults to `"entity"`.
    #[serde(default = "default_entity")]
    pub entity_noun: String,
    /// Plural noun for entities. Defaults to `"entities"`.
    #[serde(default = "default_entities")]
    pub entity_noun_plural: String,
    /// Id of the axis selected by default, if any. Not checked against `axes`
    /// in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub default_axis: Option<String>,
    /// Metric id used for size-by when the caller supplies none (see
    /// `FrameDataset::resolve_size_by`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub default_size_by: Option<String>,

    /// Named raw sources the frames' `source`/`transform` bind to. `IndexMap`
    /// preserves authored order. Optional — a single-file frame may carry an
    /// absolute path directly in its `source`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sources: Option<IndexMap<String, SourceSpec>>,

    /// Named data frames, each with its own source/transform/id/label/metrics
    /// (+ timestamp for a series frame). REQUIRED — a single-frame dataset is
    /// just `frames: {"main": {...}}`. The shared axes/filters/title are SHARED
    /// across frames; `frame_dataset` synthesizes a per-frame `FrameDataset`.
    pub frames: IndexMap<String, Frame>,
    /// Named views (treemap/scatter/detail/series) binding to frames. OPTIONAL —
    /// when omitted, the [default views](Dataset::resolved_views) apply (a
    /// single-frame dataset binds every view to its sole frame).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub views: Option<IndexMap<String, View>>,

    /// Ordered fact columns to show on the entity-detail page. When set, the
    /// detail view emits exactly these (in order) as facts; when absent it falls
    /// back to the first 10 non-id/non-label/non-metric columns in source order.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub detail_fields: Option<Vec<String>>,

    // ── treemap loading knobs (creator-configurable) ──
    /// Levels prefetched beyond the displayed depth. `Some(n)` (default 2) =
    /// windowed/bounded fetch; **explicit `null` = load the whole tree at once**
    /// (client-side zoom, no re-fetch). serde applies the default only when the
    /// field is absent, so `null` is distinguishable from omitted.
    #[serde(default = "default_lookahead")]
    pub lookahead: Option<i64>,
    /// Per-parent branch cap for internal levels (the windowed top-K).
    /// Defaults to 12.
    #[serde(default = "default_branch_cap")]
    pub branch_cap: i64,
    /// Per-parent cap at the entity/leaf level (used in full-load mode).
    /// Defaults to 50.
    #[serde(default = "default_leaf_cap")]
    pub leaf_cap: i64,
    /// Default number of treemap levels displayed. Defaults to 2.
    #[serde(default = "default_levels")]
    pub default_levels: i64,
}

/// Serde default for `lookahead`: a bounded prefetch of two levels.
fn default_lookahead() -> Option<i64> {
    const LEVELS_AHEAD: i64 = 2;
    Some(LEVELS_AHEAD)
}

/// Serde default for `branch_cap`.
fn default_branch_cap() -> i64 {
    const BRANCHES_PER_PARENT: i64 = 12;
    BRANCHES_PER_PARENT
}

/// Serde default for `leaf_cap`.
fn default_leaf_cap() -> i64 {
    const LEAVES_PER_PARENT: i64 = 50;
    LEAVES_PER_PARENT
}

/// Serde default for `default_levels`.
fn default_levels() -> i64 {
    const LEVELS_SHOWN: i64 = 2;
    LEVELS_SHOWN
}

/// The four view names a manifest may bind. Used both for default-view synthesis
/// and for validating an authored `views` map.
///
/// NOTE(review): `Dataset::resolved_views` in this file spells the default
/// names out as literals rather than reading this list, so the two must be
/// kept in sync by hand; the validation mentioned above lives outside this file.
pub const VIEW_NAMES: &[&str] = &["treemap", "scatter", "detail", "series"];

impl Dataset {
    /// Look up a shared axis by its `id`; `None` when no axis carries that id.
    pub fn axis(&self, id: &str) -> Option<&Axis> {
        self.axes.iter().find(|a| a.id == id)
    }

    /// Look up a named frame.
    pub fn frame(&self, name: &str) -> Option<&Frame> {
        self.frames.get(name)
    }

    /// The sole frame's name when there is exactly one, else `None`.
    fn sole_frame(&self) -> Option<&String> {
        if self.frames.len() == 1 {
            self.frames.keys().next()
        } else {
            None
        }
    }

    /// The effective views map: the authored `views` if present, else the
    /// DEFAULT views. The default rule (single-frame terseness):
    ///
    /// - bind `treemap`, `scatter`, and `detail` to the sole frame;
    /// - bind `series` to the sole frame too **iff** that frame has a `timestamp`.
    ///
    /// # Errors
    ///
    /// Only when `views` is not authored:
    /// - the manifest has no frames at all — there is nothing to bind to;
    /// - the manifest has multiple frames — the binding is ambiguous.
    ///
    /// The CLI loader surfaces this at load time.
    pub fn resolved_views(&self) -> Result<IndexMap<String, View>, String> {
        if let Some(v) = &self.views {
            return Ok(v.clone());
        }
        let sole = match self.sole_frame() {
            Some(name) => name,
            // An empty `frames` map is a different mistake from an ambiguous
            // one; report it as such instead of claiming "multiple frames".
            None if self.frames.is_empty() => {
                return Err(
                    "manifest has no frames — declare at least one entry in `frames`".to_string(),
                );
            }
            None => {
                return Err(
                    "manifest has multiple frames but no `views` — bindings are ambiguous; \
                     declare a `views` map"
                        .to_string(),
                );
            }
        };
        let mk = |frame: &str| View {
            frame: frame.to_string(),
            dims_from: None,
            branch_set: None,
            series_frame: None,
        };
        let mut views: IndexMap<String, View> = IndexMap::new();
        views.insert("treemap".into(), mk(sole));
        views.insert("scatter".into(), mk(sole));
        // The detail page's time chart reads the sole frame's series iff it is
        // timestamped.
        let has_ts = self
            .frame(sole)
            .and_then(|f| f.timestamp.as_ref())
            .is_some();
        let mut detail = mk(sole);
        if has_ts {
            detail.series_frame = Some(sole.clone());
        }
        views.insert("detail".into(), detail);
        if has_ts {
            views.insert("series".into(), mk(sole));
        }
        Ok(views)
    }

    /// Look up a named view from the effective (authored-or-default) views.
    ///
    /// Returns `None` both when the view is not bound and when the default
    /// views cannot be derived (see [`Dataset::resolved_views`]).
    pub fn view(&self, name: &str) -> Option<View> {
        match &self.views {
            // Authored views: read the entry in place rather than cloning the
            // whole map just to pull one view out of it.
            Some(authored) => authored.get(name).cloned(),
            None => self
                .resolved_views()
                .ok()
                .and_then(|v| v.get(name).cloned()),
        }
    }

    /// Synthesize the INTERNAL flat per-frame [`FrameDataset`] the engine
    /// consumes: the frame's source/id/label/metrics/timestamp + the SHARED
    /// axes/filters/title/entity/loading-knobs from `self`. This is the resolver
    /// every view is served through.
    ///
    /// A frame without a `source` yields an empty `source` string, and a frame
    /// without a `label_column` reuses its `id_column`.
    ///
    /// # Errors
    ///
    /// Returns a schema error if `frame_name` is not a key of `frames`.
    pub fn frame_dataset(&self, frame_name: &str) -> crate::error::Result<FrameDataset> {
        let f = self.frame(frame_name).ok_or_else(|| {
            crate::error::Error::Schema(format!("frame {frame_name:?} not in `frames`"))
        })?;
        Ok(FrameDataset {
            source: f.source.clone().unwrap_or_default(),
            transform: f.transform.clone(),
            tag_indices: std::collections::HashMap::new(), // populated by the loader
            id_column: f.id_column.clone(),
            // A frame without a label column reuses its id column.
            label_column: f
                .label_column
                .clone()
                .unwrap_or_else(|| f.id_column.clone()),
            metrics: f.metrics.clone(),
            timestamp_column: f.timestamp.clone(),
            series_resolutions: f.resolutions.clone(),
            series_default_resolution: f.default_resolution.clone(),
            // Series routing is explicit via the views map; a synthesized
            // frame-Dataset never carries nested series_* source naming. The CLI
            // loader fills these on the MAIN frame-Dataset when a `series` view
            // exists (so `boot_manifest` exposes the Line tab).
            series_source: None,
            series_metrics: None,
            axes: self.axes.clone(),
            filters: self.filters.clone(),
            title: self.title.clone(),
            entity_noun: self.entity_noun.clone(),
            entity_noun_plural: self.entity_noun_plural.clone(),
            default_axis: self.default_axis.clone(),
            default_size_by: self.default_size_by.clone(),
            detail_fields: self.detail_fields.clone(),
            lookahead: self.lookahead,
            branch_cap: self.branch_cap,
            leaf_cap: self.leaf_cap,
            default_levels: self.default_levels,
        })
    }
}

/// The INTERNAL flat per-frame view the engine consumes (treemap/series/query/
/// adapter all take `&FrameDataset`). Synthesized by [`Dataset::frame_dataset`];
/// never authored into a manifest file. It carries one frame's flat
/// source/id/label/metrics/timestamp plus the dataset's shared fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrameDataset {
    /// The frame's `source`, or an empty string when the frame declared none.
    #[serde(default)]
    pub source: String,
    /// The frame's id column name.
    #[serde(default)]
    pub id_column: String,
    /// The frame's label column name; `Dataset::frame_dataset` substitutes the
    /// id column when the frame declared no label column.
    #[serde(default)]
    pub label_column: String,
    /// In-memory tag indices for `tags` filters (filter id → index), built at
    /// load from each filter's `tags_frame`. Runtime-only (never (de)serialized);
    /// `filter_exprs` reads it to turn a tags selection into an `is_in`
    /// predicate. Empty when no tags filter / not yet loaded.
    #[serde(skip)]
    pub tag_indices: std::collections::HashMap<String, crate::tags::TagIndex>,
    /// The dataset's shared axes (cloned from `Dataset::axes`).
    pub axes: Vec<Axis>,
    /// This frame's metrics (cloned from `Frame::metrics`).
    #[serde(default)]
    pub metrics: Vec<Metric>,
    /// The dataset's shared filters (cloned from `Dataset::filters`).
    #[serde(default)]
    pub filters: Vec<Filter>,
    /// The frame's timestamp column, if it is timestamped.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timestamp_column: Option<String>,
    /// Dataset title. Defaults to `"taxa"`.
    #[serde(default = "default_title")]
    pub title: String,
    /// Singular noun for one entity. Defaults to `"entity"`.
    #[serde(default = "default_entity")]
    pub entity_noun: String,
    /// Plural noun for entities. Defaults to `"entities"`.
    #[serde(default = "default_entities")]
    pub entity_noun_plural: String,
    /// Id of the axis selected by default, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub default_axis: Option<String>,
    /// Metric id `resolve_size_by` falls back to when no size-by is requested.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub default_size_by: Option<String>,
    /// A serialized `DslPlan` for this frame (the plan-provider path).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub transform: Option<String>,

    // ── series (Line tab) routing — set by the CLI loader on the MAIN frame
    //    when a `series` view exists, so `boot_manifest` exposes the Line tab. ──
    /// The series frame's name (a marker that the Line tab is available).
    /// Always `None` straight out of `Dataset::frame_dataset`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub series_source: Option<String>,
    /// Metric ids the Line tab offers (the series frame's metrics).
    /// Always `None` straight out of `Dataset::frame_dataset`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub series_metrics: Option<Vec<String>>,
    /// Resolutions the Line tab offers. Absent → the default `["d","w","m"]`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub series_resolutions: Option<Vec<String>>,
    /// The resolution the Line tab selects by default. Absent → `"d"`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub series_default_resolution: Option<String>,

    /// Ordered fact columns for the entity-detail page (cloned from
    /// `Dataset::detail_fields`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub detail_fields: Option<Vec<String>>,

    /// Treemap prefetch depth; same meaning and default as `Dataset::lookahead`.
    #[serde(default = "default_lookahead")]
    pub lookahead: Option<i64>,
    /// Per-parent branch cap; same meaning and default as `Dataset::branch_cap`.
    #[serde(default = "default_branch_cap")]
    pub branch_cap: i64,
    /// Per-parent leaf cap; same meaning and default as `Dataset::leaf_cap`.
    #[serde(default = "default_leaf_cap")]
    pub leaf_cap: i64,
    /// Default number of treemap levels; same as `Dataset::default_levels`.
    #[serde(default = "default_levels")]
    pub default_levels: i64,
}

impl FrameDataset {
    /// Finds the metric declared with the given `id`.
    ///
    /// Returns `None` when no metric of this frame carries that id; when ids
    /// repeat, the first one in declaration order is returned.
    pub fn metric(&self, id: &str) -> Option<&Metric> {
        self.metrics.iter().find(|candidate| candidate.id == id)
    }

    /// Finds the axis declared with the given `id`.
    ///
    /// Returns `None` when no axis carries that id; when ids repeat, the first
    /// one in declaration order is returned.
    pub fn axis(&self, id: &str) -> Option<&Axis> {
        self.axes.iter().find(|candidate| candidate.id == id)
    }

    /// Picks the metric id to size by, mirroring the Python expression
    /// `size_by or default_size_by or metrics[0].id`.
    ///
    /// Precedence: the explicit `size_by` argument, then `default_size_by`,
    /// then the id of the first metric. Returns `None` only when all three are
    /// absent (no argument, no default, and an empty metric list).
    pub fn resolve_size_by(&self, size_by: Option<&str>) -> Option<String> {
        if let Some(requested) = size_by {
            return Some(requested.to_owned());
        }
        if let Some(configured) = &self.default_size_by {
            return Some(configured.clone());
        }
        self.metrics.first().map(|leading| leading.id.clone())
    }

    /// Validates every metric of this frame, in declaration order.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error reported by a metric's own
    /// `validate`.
    pub fn validate(&self) -> Result<(), String> {
        self.metrics
            .iter()
            .try_for_each(|metric| -> Result<(), String> {
                metric.validate()?;
                Ok(())
            })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Deserializes a manifest fixture into a [`Dataset`], panicking when the
    /// fixture itself is malformed.
    fn parse_manifest(json: &str) -> Dataset {
        serde_json::from_str(json).unwrap()
    }

    /// A one-frame manifest parses with defaults filled, resolves the
    /// sole-frame default views, and flattens into a valid engine view.
    #[test]
    fn single_frame_manifest_round_trips() {
        let json = r#"{
            "axes": [{"id": "owner_repo", "levels": ["owner", "repo"]}],
            "frames": {"main": {
                "source": "repos", "id_column": "repo", "label_column": "repo",
                "metrics": [
                    {"id": "stars", "agg": "sum", "column": "stars", "unit": "count"},
                    {"id": "repos", "agg": "count", "unit": "count"}
                ]
            }},
            "default_axis": "owner_repo", "default_size_by": "stars"
        }"#;
        let manifest = parse_manifest(json);

        // Shared fields: the authored axis plus the defaulted title.
        let axis = manifest.axis("owner_repo").unwrap();
        assert_eq!(axis.levels, ["owner", "repo"]);
        assert_eq!(manifest.title, "taxa");

        // Sole-frame defaults bind treemap/scatter/detail; there is no series
        // view because the frame has no timestamp.
        let resolved = manifest.resolved_views().unwrap();
        let view_names: Vec<_> = resolved.keys().collect();
        assert_eq!(view_names, ["treemap", "scatter", "detail"]);

        // The flat engine view synthesized from the single frame.
        let flat = manifest.frame_dataset("main").unwrap();
        assert_eq!(flat.source, "repos");
        assert_eq!(flat.id_column, "repo");
        assert_eq!(flat.metric("repos").unwrap().agg, "count");
        assert_eq!(flat.metric("stars").unwrap().null_policy, "drop");
        assert_eq!(flat.resolve_size_by(None).as_deref(), Some("stars"));
        flat.validate().unwrap();

        // Asking for a frame that does not exist yields an `Err` naming it,
        // never a panic.
        let missing = manifest.frame_dataset("does-not-exist");
        assert!(missing.is_err(), "unknown frame must error, not panic");
        let message = missing.unwrap_err().to_string();
        assert!(message.contains("does-not-exist"));
    }

    /// A timestamped sole frame additionally gets a default `series` view and
    /// a detail `series_frame`, and its resolution knobs reach the flat view.
    #[test]
    fn single_frame_with_timestamp_gets_default_series_view() {
        let json = r#"{
            "axes": [{"id": "g", "levels": ["g", "id"]}],
            "frames": {"main": {
                "source": "tvl", "id_column": "id", "timestamp": "dt",
                "metrics": [{"id": "tvl", "agg": "sum", "column": "tvl"}],
                "resolutions": ["w"], "default_resolution": "w"
            }}
        }"#;
        let manifest = parse_manifest(json);

        let resolved = manifest.resolved_views().unwrap();
        let view_names: Vec<_> = resolved.keys().collect();
        assert_eq!(view_names, ["treemap", "scatter", "detail", "series"]);
        assert_eq!(resolved["series"].frame, "main");
        assert_eq!(resolved["detail"].series_frame.as_deref(), Some("main"));

        let flat = manifest.frame_dataset("main").unwrap();
        assert_eq!(flat.timestamp_column.as_deref(), Some("dt"));
        assert_eq!(flat.series_resolutions, Some(vec!["w".to_string()]));
        assert_eq!(flat.series_default_resolution.as_deref(), Some("w"));
    }

    /// `Metric::validate` must accept every `time_agg` listed below (the set
    /// the engine's series fold is expected to support) and reject anything
    /// else, rather than enforcing a narrower allowlist.
    #[test]
    fn validate_accepts_engine_time_aggs() {
        /// Builds an otherwise-plain additive metric with the given `time_agg`.
        fn metric_with_time_agg(time_agg: &str) -> Metric {
            Metric {
                id: "m".into(),
                kind: "additive".into(),
                agg: "sum".into(),
                unit: "number".into(),
                null_policy: "drop".into(),
                time_agg: Some(time_agg.into()),
                column: Some("c".into()),
                grain: Vec::new(),
                formula: None,
                weight_column: None,
                pick: None,
                rollup: None,
                numerator: None,
                denominator: None,
                label: None,
            }
        }

        let supported = ["last", "first", "mean", "median", "sum", "min", "max"];
        for ta in supported {
            let outcome = metric_with_time_agg(ta).validate();
            assert!(outcome.is_ok(), "time_agg {ta:?} must validate");
        }

        let rejected = metric_with_time_agg("bogus").validate();
        assert!(rejected.is_err());
    }

    /// With several frames and no authored `views`, default view resolution
    /// has nothing unambiguous to bind to and must fail.
    #[test]
    fn multi_frame_without_views_is_ambiguous() {
        let json = r#"{
            "axes": [{"id": "a", "levels": ["a", "id"]}],
            "frames": {
                "snapshot": {"source": "snap", "id_column": "id",
                             "metrics": [{"id": "m", "agg": "sum", "column": "m"}]},
                "series": {"source": "facts", "id_column": "id", "timestamp": "dt",
                           "metrics": [{"id": "m", "agg": "sum", "column": "m"}]}
            }
        }"#;
        let manifest = parse_manifest(json);
        let resolved = manifest.resolved_views();
        assert!(resolved.is_err());
    }

    /// A multi-frame manifest keeps shared axes/filters at the top level and
    /// per-frame source/id/label/metrics/timestamp inside `frames`, with
    /// `views` binding to frames. Checks parsing, flattening, and a text
    /// round trip.
    #[test]
    fn frames_views_round_trip_and_frame_dataset_resolves() {
        let json = r#"{
            "title": "Companies", "entity_noun": "company",
            "axes": [{"id": "sector", "levels": ["sector", "symbol"]}],
            "filters": [{"id": "sector", "column": "sector", "type": "categorical"}],
            "sources": {
                "snap": {"sql": {"dsn": "host=/tmp dbname=x", "query": "SELECT * FROM snap"}},
                "facts": {"sql": {"dsn": "host=/tmp dbname=x", "query": "SELECT * FROM facts"}}
            },
            "frames": {
                "snapshot": {"source": "snap", "id_column": "symbol", "label_column": "name",
                             "metrics": [{"id": "mcap", "agg": "sum", "column": "mcap", "unit": "money"}]},
                "series": {"source": "facts", "id_column": "symbol", "timestamp": "date",
                           "metrics": [{"id": "mcap_usd", "agg": "sum", "column": "mcap_usd"}],
                           "resolutions": ["w"], "default_resolution": "w"}
            },
            "views": {
                "treemap": {"frame": "snapshot"},
                "scatter": {"frame": "snapshot"},
                "detail":  {"frame": "snapshot", "series_frame": "series"},
                "series":  {"frame": "series", "dims_from": "snapshot", "branch_set": "treemap"}
            }
        }"#;
        let manifest = parse_manifest(json);

        // Both maps keep the order in which their entries were authored.
        let frame_names: Vec<_> = manifest.frames.keys().collect();
        assert_eq!(frame_names, ["snapshot", "series"]);
        let authored_views = manifest.views.as_ref().expect("views present");
        let view_names: Vec<_> = authored_views.keys().collect();
        assert_eq!(view_names, ["treemap", "scatter", "detail", "series"]);

        // View bindings are readable through the `view` accessor.
        let series_view = manifest.view("series").unwrap();
        assert_eq!(series_view.dims_from.as_deref(), Some("snapshot"));
        assert_eq!(series_view.branch_set.as_deref(), Some("treemap"));
        let detail_view = manifest.view("detail").unwrap();
        assert_eq!(detail_view.series_frame.as_deref(), Some("series"));

        // The snapshot frame flattens to its own source/id/label/metrics ...
        let snapshot = manifest.frame_dataset("snapshot").unwrap();
        assert_eq!(snapshot.source, "snap");
        assert_eq!(snapshot.id_column, "symbol");
        assert_eq!(snapshot.label_column, "name");
        assert_eq!(snapshot.metrics.len(), 1);
        assert_eq!(snapshot.metrics[0].id, "mcap");
        assert!(snapshot.timestamp_column.is_none());
        // ... combined with the SHARED axes/filters/title/entity noun.
        assert_eq!(snapshot.axes.len(), 1);
        assert_eq!(snapshot.axes[0].id, "sector");
        assert_eq!(snapshot.filters.len(), 1);
        assert_eq!(snapshot.title, "Companies");
        assert_eq!(snapshot.entity_noun, "company");

        // The series frame also carries its timestamp and resolution knobs,
        // and still sees the shared axes (so branches can be rolled up).
        let series = manifest.frame_dataset("series").unwrap();
        assert_eq!(series.source, "facts");
        assert_eq!(series.timestamp_column.as_deref(), Some("date"));
        assert_eq!(series.metrics[0].id, "mcap_usd");
        assert_eq!(series.series_resolutions, Some(vec!["w".to_string()]));
        assert_eq!(series.series_default_resolution.as_deref(), Some("w"));
        assert_eq!(series.axes[0].id, "sector");

        // Serialize to text and parse again: frames and views survive, in order.
        let text = serde_json::to_string(&manifest).unwrap();
        let reparsed = parse_manifest(&text);
        let reparsed_frames: Vec<_> = reparsed.frames.keys().collect();
        assert_eq!(reparsed_frames, ["snapshot", "series"]);
        let reparsed_series_view = reparsed.view("series").unwrap();
        assert_eq!(reparsed_series_view.dims_from.as_deref(), Some("snapshot"));
    }

    /// Named `sources` and a frame-level `transform` parse, and the sources
    /// keep their authored order across a serialize/deserialize cycle.
    #[test]
    fn manifest_with_sources_and_transform_round_trips() {
        let json = r#"{
            "axes": [{"id": "owner_repo", "levels": ["owner", "repo"]}],
            "sources": {
                "repos": {"sql": {"dsn": "host=/tmp dbname=investing", "query": "SELECT * FROM repos"}},
                "owners": {"sql": {"dsn": "host=/tmp dbname=investing", "query": "SELECT * FROM owners"}}
            },
            "frames": {"main": {
                "source": "repos", "transform": "plan.bin", "id_column": "repo", "label_column": "repo",
                "metrics": [{"id": "stars", "agg": "sum", "column": "stars"}]
            }}
        }"#;
        let manifest = parse_manifest(json);

        // The sources map keeps its entries in authored order.
        let declared = manifest.sources.as_ref().expect("sources present");
        let source_names: Vec<_> = declared.keys().collect();
        assert_eq!(source_names, ["repos", "owners"]);

        let SourceSpec::Sql { dsn, query } = &declared["repos"];
        assert_eq!(dsn, "host=/tmp dbname=investing");
        assert_eq!(query, "SELECT * FROM repos");

        let main_frame = manifest.frame("main").unwrap();
        assert_eq!(main_frame.transform.as_deref(), Some("plan.bin"));

        // In the serialized text, "repos" must still appear before "owners".
        let text = serde_json::to_string(&manifest).unwrap();
        let repos_at = text.find("\"repos\"").unwrap();
        let owners_at = text.find("\"owners\"").unwrap();
        assert!(repos_at < owners_at, "authored source order not preserved");

        // Parsing that text back yields the same source order.
        let reparsed = parse_manifest(&text);
        let reparsed_sources = reparsed.sources.as_ref().unwrap();
        let reparsed_names: Vec<_> = reparsed_sources.keys().collect();
        assert_eq!(reparsed_names, ["repos", "owners"]);
    }
}