nodedb 0.2.0 - Docs.rs

// SPDX-License-Identifier: BUSL-1.1

//! Aggregate handler: GROUP BY, HAVING, and aggregate function execution.
//!
//! The generic (non-columnar) path uses **streaming accumulators** — see
//! `accum.rs`.  Raw document bytes are never stored; only the extracted
//! scalar / approximate values needed by each aggregate function are kept.
//! Memory is O(num_groups × num_aggregates) instead of
//! O(total_matching_docs × avg_doc_size).

use std::collections::HashMap;

use sonic_rs;
use tracing::debug;

use super::accum::GroupState;
use super::spill::groupby::GroupBySpiller;
use crate::bridge::envelope::{ErrorCode, Response};
use crate::bridge::physical_plan::AggregateSpec;
use crate::bridge::scan_filter::ScanFilter;
use crate::data::executor::core_loop::CoreLoop;
use crate::data::executor::task::ExecutionTask;
use nodedb_query::agg_key::canonical_agg_key;
use nodedb_query::msgpack_scan;

// ── Cache key ──────────────────────────────────────────────────────────────

fn aggregate_cache_key(
    tid: u64,
    collection: &str,
    group_by: &[String],
    aggregates: &[AggregateSpec],
    sub_group_by: &[String],
    sub_aggregates: &[AggregateSpec],
) -> (crate::types::TenantId, String) {
    use std::fmt::Write;
    let mut rest = format!(
        "{collection}\0{}\0{}",
        group_by.join(","),
        aggregates
            .iter()
            .map(|agg| {
                if agg.expr.is_some() {
                    format!("{}(expr)->{}", agg.function, agg.alias)
                } else {
                    format!("{}({})->{}", agg.function, agg.field, agg.alias)
                }
            })
            .collect::<Vec<_>>()
            .join(",")
    );
    if !sub_group_by.is_empty() || !sub_aggregates.is_empty() {
        let _ = write!(
            rest,
            "\0sub:{}\0{}",
            sub_group_by.join(","),
            sub_aggregates
                .iter()
                .map(|agg| {
                    if agg.expr.is_some() {
                        format!("{}(expr)->{}", agg.function, agg.alias)
                    } else {
                        format!("{}({})->{}", agg.function, agg.field, agg.alias)
                    }
                })
                .collect::<Vec<_>>()
                .join(",")
        );
    }
    (crate::types::TenantId::new(tid), rest)
}

fn legacy_aggregate_pairs(aggregates: &[AggregateSpec]) -> Option<Vec<(String, String)>> {
    aggregates
        .iter()
        .map(|agg| {
            if agg.expr.is_some() {
                None
            } else {
                Some((agg.function.clone(), agg.field.clone()))
            }
        })
        .collect()
}

fn apply_user_aliases_to_rows(rows: &mut [serde_json::Value], aggregates: &[AggregateSpec]) {
    let renames: Vec<(&str, &str)> = aggregates
        .iter()
        .filter_map(|agg| {
            agg.user_alias
                .as_deref()
                .filter(|alias| *alias != agg.alias)
                .map(|alias| (agg.alias.as_str(), alias))
        })
        .collect();

    if renames.is_empty() {
        return;
    }

    for row in rows {
        if let Some(obj) = row.as_object_mut() {
            for (from, to) in &renames {
                if let Some(value) = obj.remove(*from) {
                    obj.insert((*to).to_string(), value);
                }
            }
        }
    }
}

/// Sort aggregated rows by `sort_keys = [(column, ascending), ...]`.
///
/// Each row is a `serde_json::Value::Object`; for every key, the
/// extracted value is converted to a comparable form (numbers compared
/// numerically, strings lexically, nulls last). Keys missing from a
/// row sort as null. The sort is stable to preserve relative order of
/// equal-key rows.
fn sort_aggregated_rows(rows: &mut [serde_json::Value], sort_keys: &[(String, bool)]) {
    if sort_keys.is_empty() {
        return;
    }
    rows.sort_by(|a, b| {
        for (column, ascending) in sort_keys {
            let av = a.get(column);
            let bv = b.get(column);
            let ord = compare_json_values(av, bv);
            if ord != std::cmp::Ordering::Equal {
                return if *ascending { ord } else { ord.reverse() };
            }
        }
        std::cmp::Ordering::Equal
    });
}

/// Compare two `Option<&serde_json::Value>` for sort. Nulls / absent
/// keys sort last; numbers compare numerically; everything else falls
/// back to string comparison.
fn compare_json_values(
    a: Option<&serde_json::Value>,
    b: Option<&serde_json::Value>,
) -> std::cmp::Ordering {
    use serde_json::Value as V;
    use std::cmp::Ordering;
    let a_is_null = matches!(a, None | Some(V::Null));
    let b_is_null = matches!(b, None | Some(V::Null));
    if a_is_null && b_is_null {
        return Ordering::Equal;
    }
    if a_is_null {
        return Ordering::Greater;
    }
    if b_is_null {
        return Ordering::Less;
    }
    match (a.unwrap(), b.unwrap()) {
        (V::Number(x), V::Number(y)) => {
            let xf = x.as_f64().unwrap_or(0.0);
            let yf = y.as_f64().unwrap_or(0.0);
            xf.partial_cmp(&yf).unwrap_or(Ordering::Equal)
        }
        (V::String(x), V::String(y)) => x.cmp(y),
        (V::Bool(x), V::Bool(y)) => x.cmp(y),
        (x, y) => x.to_string().cmp(&y.to_string()),
    }
}

// ── CoreLoop impl ──────────────────────────────────────────────────────────

impl CoreLoop {
    #[allow(clippy::too_many_arguments)]
    pub(in crate::data::executor) fn execute_aggregate(
        &mut self,
        task: &ExecutionTask,
        tid: u64,
        collection: &str,
        group_by: &[String],
        aggregates: &[AggregateSpec],
        filters: &[u8],
        having: &[u8],
        limit: usize,
        sub_group_by: &[String],
        sub_aggregates: &[AggregateSpec],
        grouping_sets: &[Vec<u32>],
        sort_keys: &[(String, bool)],
    ) -> Response {
        debug!(core = self.core_id, %collection, group_fields = group_by.len(), aggs = aggregates.len(), "aggregate");

        // ROLLUP / CUBE / GROUPING SETS path: union results from each set.
        if !grouping_sets.is_empty() {
            return super::grouping_sets_exec::execute_grouping_sets(
                self,
                task,
                tid,
                collection,
                group_by,
                aggregates,
                filters,
                having,
                limit,
                grouping_sets,
            );
        }

        // Fast path: incremental aggregate cache.
        if filters.is_empty() && having.is_empty() {
            let cache_key = aggregate_cache_key(
                tid,
                collection,
                group_by,
                aggregates,
                sub_group_by,
                sub_aggregates,
            );
            if let Some(cached) = self.aggregate_cache.get(&cache_key) {
                debug!(core = self.core_id, %collection, "aggregate cache hit");
                return self.response_with_payload(task, cached.clone());
            }
        }

        // Fast path: index-backed COUNT/GROUP BY.
        if group_by.len() == 1
            && filters.is_empty()
            && having.is_empty()
            && aggregates.len() == 1
            && aggregates[0].expr.is_none()
            && aggregates[0].function == "count"
        {
            let field = &group_by[0];
            if let Ok(groups) = self.sparse.scan_index_groups(tid, collection, field)
                && !groups.is_empty()
            {
                let mut payload_buf = Vec::with_capacity(groups.len() * 64);
                let row_count = groups.len().min(limit);
                let count_key = aggregates[0]
                    .user_alias
                    .clone()
                    .unwrap_or_else(|| canonical_agg_key("count", "*"));
                msgpack_scan::write_array_header(&mut payload_buf, row_count);
                for (value, count) in groups.into_iter().take(limit) {
                    msgpack_scan::write_map_header(&mut payload_buf, 2);
                    msgpack_scan::write_kv_str(&mut payload_buf, field, &value);
                    msgpack_scan::write_kv_i64(&mut payload_buf, &count_key, count as i64);
                }
                return match Ok::<Vec<u8>, crate::Error>(payload_buf) {
                    Ok(payload) => self.response_with_payload(task, payload),
                    Err(e) => self.response_error(
                        task,
                        ErrorCode::Internal {
                            detail: e.to_string(),
                        },
                    ),
                };
            }
        }

        let scan_limit = self.query_tuning.aggregate_scan_cap;

        let mt_key = (crate::types::TenantId::new(tid), collection.to_string());
        let columnar_mt = self
            .columnar_memtables
            .get(&mt_key)
            .filter(|mt| !mt.is_empty());

        // Fast path: native columnar aggregation.
        if let Some(mt) =
            columnar_mt.filter(|_| sub_group_by.is_empty() && sub_aggregates.is_empty())
        {
            let filter_predicates: Vec<ScanFilter> = if filters.is_empty() {
                Vec::new()
            } else {
                match zerompk::from_msgpack(filters) {
                    Ok(f) => f,
                    Err(e) => {
                        tracing::warn!(core = self.core_id, error = %e, "filter predicate deserialization failed");
                        Vec::new()
                    }
                }
            };

            let legacy_aggs = legacy_aggregate_pairs(aggregates);
            let columnar_spill_dir = self
                .data_dir
                .join("groupby-spill")
                .join(format!("core-{}-columnar", self.core_id));
            let columnar_spill_cap = self.query_tuning.groupby_max_groups_in_mem;
            if let Some(mut agg_result) = legacy_aggs.and_then(|pairs| {
                super::columnar_agg::try_columnar_aggregate(
                    &super::columnar_agg::ColumnarAggParams {
                        mt,
                        group_by,
                        aggregates: &pairs,
                        filters: &filter_predicates,
                        limit,
                        scan_limit,
                        spill_dir: &columnar_spill_dir,
                        spill_cap: columnar_spill_cap,
                        governor: self.governor.clone(),
                        db: task.request.database_id,
                        tenant: task.request.tenant_id,
                    },
                )
            }) {
                if !having.is_empty() {
                    let having_predicates: Vec<ScanFilter> = match zerompk::from_msgpack(having) {
                        Ok(h) => h,
                        Err(e) => {
                            tracing::warn!(core = self.core_id, error = %e, "having predicate deserialization failed");
                            Vec::new()
                        }
                    };
                    if !having_predicates.is_empty() {
                        agg_result.rows.retain(|row| {
                            let mp = nodedb_types::json_to_msgpack_or_empty(row);
                            having_predicates.iter().all(|f| f.matches_binary(&mp))
                        });
                    }
                }

                apply_user_aliases_to_rows(&mut agg_result.rows, aggregates);
                // Post-aggregate ORDER BY: sort the finalised group rows
                // before truncating to LIMIT so the visible top-N
                // reflects the requested sort, not hash-map iteration
                // order.
                sort_aggregated_rows(&mut agg_result.rows, sort_keys);
                agg_result.rows.truncate(limit);

                return match super::super::response_codec::encode_json_vec(&agg_result.rows) {
                    Ok(payload) => {
                        if filters.is_empty() && having.is_empty() {
                            let cache_key = aggregate_cache_key(
                                tid,
                                collection,
                                group_by,
                                aggregates,
                                sub_group_by,
                                sub_aggregates,
                            );
                            if self.aggregate_cache.len() < 256 {
                                self.aggregate_cache.insert(cache_key, payload.clone());
                            }
                        }
                        self.response_with_payload(task, payload)
                    }
                    Err(e) => self.response_error(
                        task,
                        ErrorCode::Internal {
                            detail: e.to_string(),
                        },
                    ),
                };
            }
        }

        // ── Streaming aggregation ──────────────────────────────────────────
        // Documents are processed one at a time.  Per-group accumulators hold
        // only the derived scalar / approximate state needed for the final
        // result — no raw document bytes are retained.
        // Memory: O(num_groups × num_aggregates) instead of O(all_docs).

        let filter_predicates: Vec<ScanFilter> = if filters.is_empty() {
            Vec::new()
        } else {
            match zerompk::from_msgpack(filters) {
                Ok(f) => f,
                Err(e) => {
                    tracing::warn!(core = self.core_id, error = %e, "filter predicate deserialization failed");
                    Vec::new()
                }
            }
        };

        let use_field_index = filter_predicates.len() + group_by.len() >= 2;
        let need_sub = !sub_group_by.is_empty() && !sub_aggregates.is_empty();

        // Spill-to-disk GROUP BY accumulator.
        //
        // Sub-groups are flattened into the same spiller using composite keys:
        //   outer_key + '\x1F' + sub_key
        // U+001F (ASCII Unit Separator) cannot appear in JSON-encoded string
        // values, so the composite key is unambiguous.  At finalize time, keys
        // containing '\x1F' are split to reconstruct outer/sub structure.
        let spill_dir = self
            .data_dir
            .join("groupby-spill")
            .join(format!("core-{}", self.core_id));
        let cap = self.query_tuning.groupby_max_groups_in_mem;

        let mut spiller = match GroupBySpiller::new(spill_dir, cap, self.governor.clone()) {
            Ok(s) => s,
            Err(e) => {
                return self.response_error(
                    task,
                    ErrorCode::Internal {
                        detail: e.to_string(),
                    },
                );
            }
        };

        // Accumulate all matching documents. Spill errors are collected and
        // surfaced after the scan completes.
        let mut spill_err: Option<crate::Error> = None;
        let chunk_size = 10_000;

        let scan_result = self
            .scan_collection(tid, collection, scan_limit)
            .map(|docs| {
                for chunk in docs.chunks(chunk_size) {
                    if spill_err.is_some() {
                        break;
                    }
                    for (_, value) in chunk {
                        let outer_key = if use_field_index {
                            let idx = msgpack_scan::FieldIndex::build(value, 0)
                                .unwrap_or_else(msgpack_scan::FieldIndex::empty);
                            if !filter_predicates
                                .iter()
                                .all(|f| f.matches_binary_indexed(value, &idx))
                            {
                                continue;
                            }
                            msgpack_scan::group_key::build_group_key_indexed(value, group_by, &idx)
                        } else {
                            if !filter_predicates.iter().all(|f| f.matches_binary(value)) {
                                continue;
                            }
                            msgpack_scan::build_group_key(value, group_by)
                        };

                        if let Err(e) = spiller.feed(outer_key.clone(), aggregates, value) {
                            spill_err = Some(e);
                            break;
                        }

                        if need_sub {
                            let sub_key = msgpack_scan::build_group_key(value, sub_group_by);
                            // Composite key: outer + U+001F + sub.
                            let composite = format!("{outer_key}\x1F{sub_key}");
                            if let Err(e) = spiller.feed(composite, sub_aggregates, value) {
                                spill_err = Some(e);
                                break;
                            }
                        }
                    }
                }
            });

        // Surface scan-level or spill-level errors before proceeding.
        if let Some(e) = spill_err {
            return self.response_error(
                task,
                ErrorCode::Internal {
                    detail: e.to_string(),
                },
            );
        }

        match scan_result {
            Ok(()) => {
                // Merge all spill runs into the consolidated map.
                let consolidated = match spiller.finalize() {
                    Ok(m) => m,
                    Err(e) => {
                        return self.response_error(
                            task,
                            ErrorCode::Internal {
                                detail: e.to_string(),
                            },
                        );
                    }
                };

                // Separate outer groups from sub-group composite entries.
                let mut groups: HashMap<String, GroupState> = HashMap::new();
                // outer_key → sub_key → GroupState
                let mut sub_groups: HashMap<String, HashMap<String, GroupState>> = HashMap::new();

                for (key, state) in consolidated {
                    if let Some(sep_pos) = key.find('\x1F') {
                        // Sub-group composite key.
                        let outer = key[..sep_pos].to_string();
                        let sub = key[sep_pos + 1..].to_string();
                        sub_groups.entry(outer).or_default().insert(sub, state);
                    } else {
                        groups.insert(key, state);
                    }
                }

                let mut results: Vec<serde_json::Value> = Vec::new();

                for (group_key, state) in groups {
                    let mut row = serde_json::Map::new();

                    if !group_by.is_empty()
                        && let Ok(parts) = sonic_rs::from_str::<Vec<serde_json::Value>>(&group_key)
                    {
                        for (i, field) in group_by.iter().enumerate() {
                            let val = parts.get(i).cloned().unwrap_or(serde_json::Value::Null);
                            row.insert(field.clone(), val);
                        }
                    }

                    for (alias, val) in state.finalize(aggregates) {
                        let json_val: serde_json::Value = val.into();
                        row.insert(alias, json_val);
                    }

                    if need_sub {
                        let sub_map = sub_groups.remove(&group_key).unwrap_or_default();
                        let mut sub_results: Vec<serde_json::Value> = Vec::new();
                        for (sub_key, sub_state) in sub_map {
                            let mut sub_row = serde_json::Map::new();
                            if let Ok(parts) =
                                sonic_rs::from_str::<Vec<serde_json::Value>>(&sub_key)
                            {
                                for (i, field) in sub_group_by.iter().enumerate() {
                                    let val =
                                        parts.get(i).cloned().unwrap_or(serde_json::Value::Null);
                                    sub_row.insert(field.clone(), val);
                                }
                            }
                            for (alias, val) in sub_state.finalize(sub_aggregates) {
                                let json_val: serde_json::Value = val.into();
                                sub_row.insert(alias, json_val);
                            }
                            let mut sub_value = serde_json::Value::Object(sub_row);
                            apply_user_aliases_to_rows(
                                std::slice::from_mut(&mut sub_value),
                                sub_aggregates,
                            );
                            sub_results.push(sub_value);
                        }
                        row.insert(
                            "sub_groups".to_string(),
                            serde_json::Value::Array(sub_results),
                        );
                    }

                    results.push(serde_json::Value::Object(row));
                }

                if !having.is_empty() {
                    let having_predicates: Vec<ScanFilter> = match zerompk::from_msgpack(having) {
                        Ok(f) => f,
                        Err(e) => {
                            tracing::warn!(
                                core = self.core_id,
                                error = %e,
                                "HAVING predicate deserialization failed (schemaless)"
                            );
                            Vec::new()
                        }
                    };
                    if !having_predicates.is_empty() {
                        results.retain(|row| {
                            let mp = nodedb_types::json_to_msgpack_or_empty(row);
                            having_predicates.iter().all(|f| f.matches_binary(&mp))
                        });
                    }
                }

                apply_user_aliases_to_rows(&mut results, aggregates);
                // Post-aggregate ORDER BY: sort group rows before
                // truncating so LIMIT picks the requested top-N.
                sort_aggregated_rows(&mut results, sort_keys);
                results.truncate(limit);

                match super::super::response_codec::encode_json_vec(&results) {
                    Ok(payload) => {
                        if filters.is_empty() && having.is_empty() {
                            let cache_key = aggregate_cache_key(
                                tid,
                                collection,
                                group_by,
                                aggregates,
                                sub_group_by,
                                sub_aggregates,
                            );
                            if self.aggregate_cache.len() < 256 {
                                self.aggregate_cache.insert(cache_key, payload.clone());
                            }
                        }
                        self.response_with_payload(task, payload)
                    }
                    Err(e) => self.response_error(
                        task,
                        ErrorCode::Internal {
                            detail: e.to_string(),
                        },
                    ),
                }
            }
            Err(e) => self.response_error(
                task,
                ErrorCode::Internal {
                    detail: e.to_string(),
                },
            ),
        }
    }
}