nodedb 0.2.0

Local-first, real-time, edge-to-cloud hybrid database for multi-modal workloads
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
// SPDX-License-Identifier: BUSL-1.1

//! Aggregate handler: GROUP BY, HAVING, and aggregate function execution.
//!
//! The generic (non-columnar) path uses **streaming accumulators** — see
//! `accum.rs`.  Raw document bytes are never stored; only the extracted
//! scalar / approximate values needed by each aggregate function are kept.
//! Memory is O(num_groups × num_aggregates) instead of
//! O(total_matching_docs × avg_doc_size).

use std::collections::HashMap;

use sonic_rs;
use tracing::debug;

use super::accum::GroupState;
use super::spill::groupby::GroupBySpiller;
use crate::bridge::envelope::{ErrorCode, Response};
use crate::bridge::physical_plan::AggregateSpec;
use crate::bridge::scan_filter::ScanFilter;
use crate::data::executor::core_loop::CoreLoop;
use crate::data::executor::task::ExecutionTask;
use nodedb_query::agg_key::canonical_agg_key;
use nodedb_query::msgpack_scan;

// ── Cache key ──────────────────────────────────────────────────────────────

fn aggregate_cache_key(
    tid: u64,
    collection: &str,
    group_by: &[String],
    aggregates: &[AggregateSpec],
    sub_group_by: &[String],
    sub_aggregates: &[AggregateSpec],
) -> (crate::types::TenantId, String) {
    use std::fmt::Write;
    let mut rest = format!(
        "{collection}\0{}\0{}",
        group_by.join(","),
        aggregates
            .iter()
            .map(|agg| {
                if agg.expr.is_some() {
                    format!("{}(expr)->{}", agg.function, agg.alias)
                } else {
                    format!("{}({})->{}", agg.function, agg.field, agg.alias)
                }
            })
            .collect::<Vec<_>>()
            .join(",")
    );
    if !sub_group_by.is_empty() || !sub_aggregates.is_empty() {
        let _ = write!(
            rest,
            "\0sub:{}\0{}",
            sub_group_by.join(","),
            sub_aggregates
                .iter()
                .map(|agg| {
                    if agg.expr.is_some() {
                        format!("{}(expr)->{}", agg.function, agg.alias)
                    } else {
                        format!("{}({})->{}", agg.function, agg.field, agg.alias)
                    }
                })
                .collect::<Vec<_>>()
                .join(",")
        );
    }
    (crate::types::TenantId::new(tid), rest)
}

fn legacy_aggregate_pairs(aggregates: &[AggregateSpec]) -> Option<Vec<(String, String)>> {
    aggregates
        .iter()
        .map(|agg| {
            if agg.expr.is_some() {
                None
            } else {
                Some((agg.function.clone(), agg.field.clone()))
            }
        })
        .collect()
}

fn apply_user_aliases_to_rows(rows: &mut [serde_json::Value], aggregates: &[AggregateSpec]) {
    let renames: Vec<(&str, &str)> = aggregates
        .iter()
        .filter_map(|agg| {
            agg.user_alias
                .as_deref()
                .filter(|alias| *alias != agg.alias)
                .map(|alias| (agg.alias.as_str(), alias))
        })
        .collect();

    if renames.is_empty() {
        return;
    }

    for row in rows {
        if let Some(obj) = row.as_object_mut() {
            for (from, to) in &renames {
                if let Some(value) = obj.remove(*from) {
                    obj.insert((*to).to_string(), value);
                }
            }
        }
    }
}

/// Sort aggregated rows by `sort_keys = [(column, ascending), ...]`.
///
/// Each row is a `serde_json::Value::Object`; for every key, the
/// extracted value is converted to a comparable form (numbers compared
/// numerically, strings lexically, nulls last). Keys missing from a
/// row sort as null. The sort is stable to preserve relative order of
/// equal-key rows.
fn sort_aggregated_rows(rows: &mut [serde_json::Value], sort_keys: &[(String, bool)]) {
    if sort_keys.is_empty() {
        return;
    }
    rows.sort_by(|a, b| {
        for (column, ascending) in sort_keys {
            let av = a.get(column);
            let bv = b.get(column);
            let ord = compare_json_values(av, bv);
            if ord != std::cmp::Ordering::Equal {
                return if *ascending { ord } else { ord.reverse() };
            }
        }
        std::cmp::Ordering::Equal
    });
}

/// Compare two `Option<&serde_json::Value>` for sort. Nulls / absent
/// keys sort last; numbers compare numerically; everything else falls
/// back to string comparison.
fn compare_json_values(
    a: Option<&serde_json::Value>,
    b: Option<&serde_json::Value>,
) -> std::cmp::Ordering {
    use serde_json::Value as V;
    use std::cmp::Ordering;
    let a_is_null = matches!(a, None | Some(V::Null));
    let b_is_null = matches!(b, None | Some(V::Null));
    if a_is_null && b_is_null {
        return Ordering::Equal;
    }
    if a_is_null {
        return Ordering::Greater;
    }
    if b_is_null {
        return Ordering::Less;
    }
    match (a.unwrap(), b.unwrap()) {
        (V::Number(x), V::Number(y)) => {
            let xf = x.as_f64().unwrap_or(0.0);
            let yf = y.as_f64().unwrap_or(0.0);
            xf.partial_cmp(&yf).unwrap_or(Ordering::Equal)
        }
        (V::String(x), V::String(y)) => x.cmp(y),
        (V::Bool(x), V::Bool(y)) => x.cmp(y),
        (x, y) => x.to_string().cmp(&y.to_string()),
    }
}

// ── CoreLoop impl ──────────────────────────────────────────────────────────

impl CoreLoop {
    #[allow(clippy::too_many_arguments)]
    pub(in crate::data::executor) fn execute_aggregate(
        &mut self,
        task: &ExecutionTask,
        tid: u64,
        collection: &str,
        group_by: &[String],
        aggregates: &[AggregateSpec],
        filters: &[u8],
        having: &[u8],
        limit: usize,
        sub_group_by: &[String],
        sub_aggregates: &[AggregateSpec],
        grouping_sets: &[Vec<u32>],
        sort_keys: &[(String, bool)],
    ) -> Response {
        debug!(core = self.core_id, %collection, group_fields = group_by.len(), aggs = aggregates.len(), "aggregate");

        // ROLLUP / CUBE / GROUPING SETS path: union results from each set.
        if !grouping_sets.is_empty() {
            return super::grouping_sets_exec::execute_grouping_sets(
                self,
                task,
                tid,
                collection,
                group_by,
                aggregates,
                filters,
                having,
                limit,
                grouping_sets,
            );
        }

        // Fast path: incremental aggregate cache.
        if filters.is_empty() && having.is_empty() {
            let cache_key = aggregate_cache_key(
                tid,
                collection,
                group_by,
                aggregates,
                sub_group_by,
                sub_aggregates,
            );
            if let Some(cached) = self.aggregate_cache.get(&cache_key) {
                debug!(core = self.core_id, %collection, "aggregate cache hit");
                return self.response_with_payload(task, cached.clone());
            }
        }

        // Fast path: index-backed COUNT/GROUP BY.
        if group_by.len() == 1
            && filters.is_empty()
            && having.is_empty()
            && aggregates.len() == 1
            && aggregates[0].expr.is_none()
            && aggregates[0].function == "count"
        {
            let field = &group_by[0];
            if let Ok(groups) = self.sparse.scan_index_groups(tid, collection, field)
                && !groups.is_empty()
            {
                let mut payload_buf = Vec::with_capacity(groups.len() * 64);
                let row_count = groups.len().min(limit);
                let count_key = aggregates[0]
                    .user_alias
                    .clone()
                    .unwrap_or_else(|| canonical_agg_key("count", "*"));
                msgpack_scan::write_array_header(&mut payload_buf, row_count);
                for (value, count) in groups.into_iter().take(limit) {
                    msgpack_scan::write_map_header(&mut payload_buf, 2);
                    msgpack_scan::write_kv_str(&mut payload_buf, field, &value);
                    msgpack_scan::write_kv_i64(&mut payload_buf, &count_key, count as i64);
                }
                return match Ok::<Vec<u8>, crate::Error>(payload_buf) {
                    Ok(payload) => self.response_with_payload(task, payload),
                    Err(e) => self.response_error(
                        task,
                        ErrorCode::Internal {
                            detail: e.to_string(),
                        },
                    ),
                };
            }
        }

        let scan_limit = self.query_tuning.aggregate_scan_cap;

        let mt_key = (crate::types::TenantId::new(tid), collection.to_string());
        let columnar_mt = self
            .columnar_memtables
            .get(&mt_key)
            .filter(|mt| !mt.is_empty());

        // Fast path: native columnar aggregation.
        if let Some(mt) =
            columnar_mt.filter(|_| sub_group_by.is_empty() && sub_aggregates.is_empty())
        {
            let filter_predicates: Vec<ScanFilter> = if filters.is_empty() {
                Vec::new()
            } else {
                match zerompk::from_msgpack(filters) {
                    Ok(f) => f,
                    Err(e) => {
                        tracing::warn!(core = self.core_id, error = %e, "filter predicate deserialization failed");
                        Vec::new()
                    }
                }
            };

            let legacy_aggs = legacy_aggregate_pairs(aggregates);
            let columnar_spill_dir = self
                .data_dir
                .join("groupby-spill")
                .join(format!("core-{}-columnar", self.core_id));
            let columnar_spill_cap = self.query_tuning.groupby_max_groups_in_mem;
            if let Some(mut agg_result) = legacy_aggs.and_then(|pairs| {
                super::columnar_agg::try_columnar_aggregate(
                    &super::columnar_agg::ColumnarAggParams {
                        mt,
                        group_by,
                        aggregates: &pairs,
                        filters: &filter_predicates,
                        limit,
                        scan_limit,
                        spill_dir: &columnar_spill_dir,
                        spill_cap: columnar_spill_cap,
                        governor: self.governor.clone(),
                        db: task.request.database_id,
                        tenant: task.request.tenant_id,
                    },
                )
            }) {
                if !having.is_empty() {
                    let having_predicates: Vec<ScanFilter> = match zerompk::from_msgpack(having) {
                        Ok(h) => h,
                        Err(e) => {
                            tracing::warn!(core = self.core_id, error = %e, "having predicate deserialization failed");
                            Vec::new()
                        }
                    };
                    if !having_predicates.is_empty() {
                        agg_result.rows.retain(|row| {
                            let mp = nodedb_types::json_to_msgpack_or_empty(row);
                            having_predicates.iter().all(|f| f.matches_binary(&mp))
                        });
                    }
                }

                apply_user_aliases_to_rows(&mut agg_result.rows, aggregates);
                // Post-aggregate ORDER BY: sort the finalised group rows
                // before truncating to LIMIT so the visible top-N
                // reflects the requested sort, not hash-map iteration
                // order.
                sort_aggregated_rows(&mut agg_result.rows, sort_keys);
                agg_result.rows.truncate(limit);

                return match super::super::response_codec::encode_json_vec(&agg_result.rows) {
                    Ok(payload) => {
                        if filters.is_empty() && having.is_empty() {
                            let cache_key = aggregate_cache_key(
                                tid,
                                collection,
                                group_by,
                                aggregates,
                                sub_group_by,
                                sub_aggregates,
                            );
                            if self.aggregate_cache.len() < 256 {
                                self.aggregate_cache.insert(cache_key, payload.clone());
                            }
                        }
                        self.response_with_payload(task, payload)
                    }
                    Err(e) => self.response_error(
                        task,
                        ErrorCode::Internal {
                            detail: e.to_string(),
                        },
                    ),
                };
            }
        }

        // ── Streaming aggregation ──────────────────────────────────────────
        // Documents are processed one at a time.  Per-group accumulators hold
        // only the derived scalar / approximate state needed for the final
        // result — no raw document bytes are retained.
        // Memory: O(num_groups × num_aggregates) instead of O(all_docs).

        let filter_predicates: Vec<ScanFilter> = if filters.is_empty() {
            Vec::new()
        } else {
            match zerompk::from_msgpack(filters) {
                Ok(f) => f,
                Err(e) => {
                    tracing::warn!(core = self.core_id, error = %e, "filter predicate deserialization failed");
                    Vec::new()
                }
            }
        };

        let use_field_index = filter_predicates.len() + group_by.len() >= 2;
        let need_sub = !sub_group_by.is_empty() && !sub_aggregates.is_empty();

        // Spill-to-disk GROUP BY accumulator.
        //
        // Sub-groups are flattened into the same spiller using composite keys:
        //   outer_key + '\x1F' + sub_key
        // U+001F (ASCII Unit Separator) cannot appear in JSON-encoded string
        // values, so the composite key is unambiguous.  At finalize time, keys
        // containing '\x1F' are split to reconstruct outer/sub structure.
        let spill_dir = self
            .data_dir
            .join("groupby-spill")
            .join(format!("core-{}", self.core_id));
        let cap = self.query_tuning.groupby_max_groups_in_mem;

        let mut spiller = match GroupBySpiller::new(spill_dir, cap, self.governor.clone()) {
            Ok(s) => s,
            Err(e) => {
                return self.response_error(
                    task,
                    ErrorCode::Internal {
                        detail: e.to_string(),
                    },
                );
            }
        };

        // Accumulate all matching documents. Spill errors are collected and
        // surfaced after the scan completes.
        let mut spill_err: Option<crate::Error> = None;
        let chunk_size = 10_000;

        let scan_result = self
            .scan_collection(tid, collection, scan_limit)
            .map(|docs| {
                for chunk in docs.chunks(chunk_size) {
                    if spill_err.is_some() {
                        break;
                    }
                    for (_, value) in chunk {
                        let outer_key = if use_field_index {
                            let idx = msgpack_scan::FieldIndex::build(value, 0)
                                .unwrap_or_else(msgpack_scan::FieldIndex::empty);
                            if !filter_predicates
                                .iter()
                                .all(|f| f.matches_binary_indexed(value, &idx))
                            {
                                continue;
                            }
                            msgpack_scan::group_key::build_group_key_indexed(value, group_by, &idx)
                        } else {
                            if !filter_predicates.iter().all(|f| f.matches_binary(value)) {
                                continue;
                            }
                            msgpack_scan::build_group_key(value, group_by)
                        };

                        if let Err(e) = spiller.feed(outer_key.clone(), aggregates, value) {
                            spill_err = Some(e);
                            break;
                        }

                        if need_sub {
                            let sub_key = msgpack_scan::build_group_key(value, sub_group_by);
                            // Composite key: outer + U+001F + sub.
                            let composite = format!("{outer_key}\x1F{sub_key}");
                            if let Err(e) = spiller.feed(composite, sub_aggregates, value) {
                                spill_err = Some(e);
                                break;
                            }
                        }
                    }
                }
            });

        // Surface scan-level or spill-level errors before proceeding.
        if let Some(e) = spill_err {
            return self.response_error(
                task,
                ErrorCode::Internal {
                    detail: e.to_string(),
                },
            );
        }

        match scan_result {
            Ok(()) => {
                // Merge all spill runs into the consolidated map.
                let consolidated = match spiller.finalize() {
                    Ok(m) => m,
                    Err(e) => {
                        return self.response_error(
                            task,
                            ErrorCode::Internal {
                                detail: e.to_string(),
                            },
                        );
                    }
                };

                // Separate outer groups from sub-group composite entries.
                let mut groups: HashMap<String, GroupState> = HashMap::new();
                // outer_key → sub_key → GroupState
                let mut sub_groups: HashMap<String, HashMap<String, GroupState>> = HashMap::new();

                for (key, state) in consolidated {
                    if let Some(sep_pos) = key.find('\x1F') {
                        // Sub-group composite key.
                        let outer = key[..sep_pos].to_string();
                        let sub = key[sep_pos + 1..].to_string();
                        sub_groups.entry(outer).or_default().insert(sub, state);
                    } else {
                        groups.insert(key, state);
                    }
                }

                let mut results: Vec<serde_json::Value> = Vec::new();

                for (group_key, state) in groups {
                    let mut row = serde_json::Map::new();

                    if !group_by.is_empty()
                        && let Ok(parts) = sonic_rs::from_str::<Vec<serde_json::Value>>(&group_key)
                    {
                        for (i, field) in group_by.iter().enumerate() {
                            let val = parts.get(i).cloned().unwrap_or(serde_json::Value::Null);
                            row.insert(field.clone(), val);
                        }
                    }

                    for (alias, val) in state.finalize(aggregates) {
                        let json_val: serde_json::Value = val.into();
                        row.insert(alias, json_val);
                    }

                    if need_sub {
                        let sub_map = sub_groups.remove(&group_key).unwrap_or_default();
                        let mut sub_results: Vec<serde_json::Value> = Vec::new();
                        for (sub_key, sub_state) in sub_map {
                            let mut sub_row = serde_json::Map::new();
                            if let Ok(parts) =
                                sonic_rs::from_str::<Vec<serde_json::Value>>(&sub_key)
                            {
                                for (i, field) in sub_group_by.iter().enumerate() {
                                    let val =
                                        parts.get(i).cloned().unwrap_or(serde_json::Value::Null);
                                    sub_row.insert(field.clone(), val);
                                }
                            }
                            for (alias, val) in sub_state.finalize(sub_aggregates) {
                                let json_val: serde_json::Value = val.into();
                                sub_row.insert(alias, json_val);
                            }
                            let mut sub_value = serde_json::Value::Object(sub_row);
                            apply_user_aliases_to_rows(
                                std::slice::from_mut(&mut sub_value),
                                sub_aggregates,
                            );
                            sub_results.push(sub_value);
                        }
                        row.insert(
                            "sub_groups".to_string(),
                            serde_json::Value::Array(sub_results),
                        );
                    }

                    results.push(serde_json::Value::Object(row));
                }

                if !having.is_empty() {
                    let having_predicates: Vec<ScanFilter> = match zerompk::from_msgpack(having) {
                        Ok(f) => f,
                        Err(e) => {
                            tracing::warn!(
                                core = self.core_id,
                                error = %e,
                                "HAVING predicate deserialization failed (schemaless)"
                            );
                            Vec::new()
                        }
                    };
                    if !having_predicates.is_empty() {
                        results.retain(|row| {
                            let mp = nodedb_types::json_to_msgpack_or_empty(row);
                            having_predicates.iter().all(|f| f.matches_binary(&mp))
                        });
                    }
                }

                apply_user_aliases_to_rows(&mut results, aggregates);
                // Post-aggregate ORDER BY: sort group rows before
                // truncating so LIMIT picks the requested top-N.
                sort_aggregated_rows(&mut results, sort_keys);
                results.truncate(limit);

                match super::super::response_codec::encode_json_vec(&results) {
                    Ok(payload) => {
                        if filters.is_empty() && having.is_empty() {
                            let cache_key = aggregate_cache_key(
                                tid,
                                collection,
                                group_by,
                                aggregates,
                                sub_group_by,
                                sub_aggregates,
                            );
                            if self.aggregate_cache.len() < 256 {
                                self.aggregate_cache.insert(cache_key, payload.clone());
                            }
                        }
                        self.response_with_payload(task, payload)
                    }
                    Err(e) => self.response_error(
                        task,
                        ErrorCode::Internal {
                            detail: e.to_string(),
                        },
                    ),
                }
            }
            Err(e) => self.response_error(
                task,
                ErrorCode::Internal {
                    detail: e.to_string(),
                },
            ),
        }
    }
}