// etl-unit 0.1.0 - Docs.rs

//! Group-by aggregation: collapse the subject dimension by aggregating
//! measurements across subjects that share one or more quality values.
//!
//! Group-by is the **subject-dimension** analog of the interval module's
//! **time-dimension** reduction:
//!
//! - `interval` collapses `(subject × time)` → `(subject × bucket)`
//! - `group` collapses `(subject × time)` → `(group × time)` where
//!   group is defined by one or more qualities (parish, region, …)
//!
//! They compose cleanly. When both are set on a subset request, the
//! pipeline applies interval first, then group — the statistical
//! interpretation is "aggregate over time per subject, then aggregate
//! across subjects per group".
//!
//! # Module layout
//!
//! - [`planner`] — pure decision logic: given a measurement + request
//!   aggregation overrides, produce a [`GroupAggregationPlan`]. Zero
//!   polars work, fully unit-testable.
//! - [`aggregate`] — imperative DataFrame side: join qualities, apply
//!   missing-value policy, group by qualities + time, emit stats.
//!
//! # Fairness through N
//!
//! Every group-aggregated cell carries `n_subjects_contributing` —
//! the count of subjects in the group that had a non-null value in
//! this `(group, time, measurement)` cell. For example, if one station
//! in a 12-station group has no data for a given month,
//! `n_subjects_contributing` drops from 12 to 11 for that month —
//! making the basis of the aggregate visible for analytics and UI
//! tooltips.

pub mod aggregate;
pub mod planner;

use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::{CanonicalColumnName, aggregation::Aggregate};

pub use aggregate::{
    GroupAggregateOutput, GroupStats, apply_group_by, build_group_plans, run_group_by,
};
pub use planner::{GroupAggregationPlan, GroupAggregationPlanner};

// ============================================================================
// GroupBy — request-side specification
// ============================================================================

/// Group subjects by one or more qualities and aggregate measurements
/// across group members.
///
/// The output's subject column carries the group label (concatenation
/// of the grouping qualities' values). Individual subjects disappear
/// into their group; their quality values and the per-group N can be
/// recovered from [`GroupStats`].
///
/// # Deserialization
///
/// `qualities` carries no serde default, so it must be present in the
/// input. `aggregation_override` and `missing_policy` are marked
/// `#[serde(default)]`: when omitted they deserialize to `None` and
/// [`MissingQualityPolicy::SyntheticGroup`] respectively.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroupBy {
    /// Qualities whose values define the groups. Must already exist in
    /// the schema (direct, composite, or derived). Multiple qualities
    /// produce composite groups via cartesian product — a subject with
    /// `parish = "Orleans"` and `pump_type = "Large"` lands in the
    /// `"Orleans | Large"` group.
    pub qualities: Vec<CanonicalColumnName>,

    /// Per-measurement aggregation override for the subject → group
    /// collapse, keyed by measurement name. Absent entries (or `None`
    /// for the whole map) fall back to each measurement's
    /// schema-configured `signal_aggregation`.
    #[serde(default)]
    pub aggregation_override: Option<HashMap<CanonicalColumnName, Aggregate>>,

    /// What to do with subjects whose grouping quality values are null.
    /// Defaults to [`MissingQualityPolicy::SyntheticGroup`] when the
    /// field is omitted from the serialized request.
    #[serde(default)]
    pub missing_policy: MissingQualityPolicy,
}

/// How to handle subjects whose grouping-quality values are null.
///
/// Serialized in `snake_case`: `synthetic_group`, `drop`, `error`.
/// The `Default` value is [`Self::SyntheticGroup`].
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum MissingQualityPolicy {
    /// Assign such subjects to a synthetic group labeled
    /// `"__unspecified__"` (see [`Self::SYNTHETIC_LABEL`]). Preserves
    /// every subject in the output so data gaps are visible rather
    /// than silently dropped.
    #[default]
    SyntheticGroup,
    /// Filter out subjects with missing quality values before grouping.
    Drop,
    /// Return an [`crate::error::EtlError`] listing which subjects are
    /// missing which qualities. Strictest option — appropriate when
    /// the caller is asserting a complete grouping.
    Error,
}

/// Constants associated with the missing-quality policy.
impl MissingQualityPolicy {
    /// The synthetic group label assigned under [`Self::SyntheticGroup`].
    ///
    /// Use this constant rather than repeating the string literal when
    /// matching or rendering the synthetic group.
    pub const SYNTHETIC_LABEL: &'static str = "__unspecified__";
}