Skip to main content

reddb_server/runtime/
config_matrix.rs

1//! Performance / operational config matrix.
2//!
3//! Two tiers:
4//!
5//! - **Tier A (`Critical`)** — self-healing on boot. If the key is
6//!   missing from `red_config`, the loader writes the default in.
7//!   Operators always see these via `SHOW CONFIG` so they know what
8//!   guarantees and tuning they have.
9//! - **Tier B (`Optional`)** — in-memory default. Never self-populated.
10//!   Appears in `SHOW CONFIG` only after an explicit `SET CONFIG`.
11//!
//! The matrix is the single source of truth for perf / durability /
//! concurrency / storage keys introduced by the perf-parity push.
//! It intentionally does **not** cover the pre-existing `red.*`
//! trees (ai, server, storage, search, etc.) — those have their own
//! lifecycle in `impl_core`. Keys here live under the `ask.*`,
//! `ai.ner.*`, `cache.*`, `concurrency.*`, `durability.*`,
//! `runtime.*`, and `storage.*` namespaces.
18
19use crate::serde_json::Value as JsonValue;
20use crate::storage::UnifiedStore;
21
22#[inline]
23fn num(v: f64) -> JsonValue {
24    JsonValue::Number(v)
25}
26
27#[inline]
28fn text(s: &str) -> JsonValue {
29    JsonValue::String(s.to_string())
30}
31
32/// Default value encoded as JSON so the loader can delegate to
33/// `set_config_tree` which already handles every `Value` variant.
34#[derive(Debug, Clone)]
35pub struct ConfigDefault {
36    pub key: &'static str,
37    pub tier: Tier,
38    /// Lazily produced JSON default. A closure because `bgwriter.delay_ms`
39    /// etc. are unsigned and `serde_json::Value::from(u64)` is fine, but
40    /// we want the option of composing richer defaults later.
41    pub default: fn() -> JsonValue,
42}
43
/// Visibility / self-healing class of a matrix key.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Tier {
    /// Tier A: written into `red_config` on boot when missing, so it
    /// always shows up in `SHOW CONFIG`.
    Critical,
    /// Tier B: default lives in memory only; shows up in `SHOW CONFIG`
    /// only after a user writes it.
    Optional,
}
51
52/// The full matrix. Keep sorted by namespace for readability.
53pub const MATRIX: &[ConfigDefault] = &[
54    // ask.*
55    ConfigDefault {
56        key: "ask.max_prompt_tokens",
57        tier: Tier::Optional,
58        default: || num(8192.0),
59    },
60    ConfigDefault {
61        key: "ask.max_completion_tokens",
62        tier: Tier::Optional,
63        default: || num(1024.0),
64    },
65    ConfigDefault {
66        key: "ask.max_sources_bytes",
67        tier: Tier::Optional,
68        default: || num(262_144.0),
69    },
70    ConfigDefault {
71        key: "ask.timeout_ms",
72        tier: Tier::Optional,
73        default: || num(30_000.0),
74    },
75    ConfigDefault {
76        key: "ask.daily_cost_cap_usd",
77        tier: Tier::Optional,
78        default: || text(""),
79    },
80    ConfigDefault {
81        key: "ask.audit.include_answer",
82        tier: Tier::Optional,
83        default: || JsonValue::Bool(false),
84    },
85    ConfigDefault {
86        key: "ask.audit.retention_days",
87        tier: Tier::Optional,
88        default: || num(90.0),
89    },
90    ConfigDefault {
91        key: "ask.cache.enabled",
92        tier: Tier::Optional,
93        default: || JsonValue::Bool(false),
94    },
95    ConfigDefault {
96        key: "ask.cache.default_ttl",
97        tier: Tier::Optional,
98        default: || text(""),
99    },
100    ConfigDefault {
101        key: "ask.cache.max_entries",
102        tier: Tier::Optional,
103        default: || num(1024.0),
104    },
105    // cache.blob.*
106    ConfigDefault {
107        key: "cache.blob.l1_bytes_max",
108        tier: Tier::Critical,
109        default: || num(crate::storage::cache::DEFAULT_BLOB_L1_BYTES_MAX as f64),
110    },
111    ConfigDefault {
112        key: "cache.blob.l2_bytes_max",
113        tier: Tier::Critical,
114        default: || num(crate::storage::cache::DEFAULT_BLOB_L2_BYTES_MAX as f64),
115    },
116    ConfigDefault {
117        key: "cache.blob.max_namespaces",
118        tier: Tier::Critical,
119        default: || num(crate::storage::cache::DEFAULT_BLOB_MAX_NAMESPACES as f64),
120    },
121    // durability.*
122    ConfigDefault {
123        key: "durability.mode",
124        tier: Tier::Critical,
125        default: || text("sync"),
126    },
127    // runtime.result_cache.*
128    ConfigDefault {
129        key: "runtime.result_cache.backend",
130        tier: Tier::Critical,
131        default: || text("legacy"),
132    },
133    // Kill-switch (issue #802). Critical so it self-heals to `true` on
134    // boot and is always visible in SHOW CONFIG — operators flip it to
135    // `false` to disable result caching wholesale for debugging.
136    ConfigDefault {
137        key: "runtime.result_cache.enabled",
138        tier: Tier::Critical,
139        default: || JsonValue::Bool(true),
140    },
141    // Per-entry freshness window in seconds (issue #802). Mirrors the
142    // former `RESULT_CACHE_TTL_SECS` constant.
143    ConfigDefault {
144        key: "runtime.result_cache.ttl_seconds",
145        tier: Tier::Optional,
146        default: || num(30.0),
147    },
148    // LRU capacity in entries (issue #802). Mirrors the former
149    // `RESULT_CACHE_MAX_ENTRIES` constant.
150    ConfigDefault {
151        key: "runtime.result_cache.capacity_entries",
152        tier: Tier::Optional,
153        default: || num(1000.0),
154    },
155    // concurrency.*
156    ConfigDefault {
157        key: "concurrency.locking.enabled",
158        tier: Tier::Critical,
159        default: || JsonValue::Bool(true),
160    },
161    ConfigDefault {
162        key: "concurrency.locking.deadlock_timeout_ms",
163        tier: Tier::Optional,
164        default: || num(5000.0),
165    },
166    // storage.wal.*
167    ConfigDefault {
168        key: "storage.wal.max_interval_ms",
169        tier: Tier::Critical,
170        default: || num(10.0),
171    },
172    ConfigDefault {
173        key: "storage.wal.min_batch_size",
174        tier: Tier::Optional,
175        default: || num(4.0),
176    },
177    // storage.bgwriter.*
178    ConfigDefault {
179        key: "storage.bgwriter.delay_ms",
180        tier: Tier::Critical,
181        default: || num(200.0),
182    },
183    ConfigDefault {
184        key: "storage.bgwriter.max_pages_per_round",
185        tier: Tier::Optional,
186        default: || num(100.0),
187    },
188    ConfigDefault {
189        key: "storage.bgwriter.lru_multiplier",
190        tier: Tier::Optional,
191        default: || num(2.0),
192    },
193    // storage.bulk_insert.*
194    ConfigDefault {
195        key: "storage.bulk_insert.max_buffered_rows",
196        tier: Tier::Optional,
197        default: || num(1000.0),
198    },
199    ConfigDefault {
200        key: "storage.bulk_insert.max_buffered_bytes",
201        tier: Tier::Optional,
202        default: || num(65536.0),
203    },
204    // storage.hot_update.*
205    ConfigDefault {
206        key: "storage.hot_update.max_chain_hops",
207        tier: Tier::Optional,
208        default: || num(32.0),
209    },
210    // storage.btree.*
211    ConfigDefault {
212        key: "storage.btree.lehman_yao",
213        tier: Tier::Critical,
214        default: || JsonValue::Bool(true),
215    },
216    // ai.ner.* — opt-in LLM backend for AskPipeline Stage 1 (issue #189).
217    // Default backend stays heuristic so existing deployments keep
218    // their current behaviour without operator action.
219    ConfigDefault {
220        key: "ai.ner.backend",
221        tier: Tier::Optional,
222        default: || text("heuristic"),
223    },
224    ConfigDefault {
225        key: "ai.ner.endpoint",
226        tier: Tier::Optional,
227        default: || text(""),
228    },
229    ConfigDefault {
230        key: "ai.ner.model",
231        tier: Tier::Optional,
232        default: || text(""),
233    },
234    ConfigDefault {
235        key: "ai.ner.timeout_ms",
236        tier: Tier::Optional,
237        default: || num(5000.0),
238    },
239    ConfigDefault {
240        key: "ai.ner.fallback",
241        tier: Tier::Optional,
242        default: || text("use_heuristic"),
243    },
244    // runtime.ai.transport.* — shared outbound AI HTTP client foundation
245    // (issue #274). Provider rewiring can opt into these defaults
246    // incrementally.
247    ConfigDefault {
248        key: "runtime.ai.transport_pool_size",
249        tier: Tier::Optional,
250        default: || num(16.0),
251    },
252    ConfigDefault {
253        key: "runtime.ai.transport_timeout_ms",
254        tier: Tier::Optional,
255        default: || num(30000.0),
256    },
257    ConfigDefault {
258        key: "runtime.ai.transport_retry_max_attempts",
259        tier: Tier::Optional,
260        default: || num(3.0),
261    },
262    ConfigDefault {
263        key: "runtime.ai.transport_retry_base_ms",
264        tier: Tier::Optional,
265        default: || num(500.0),
266    },
267    // cache.blob.policy.* — extended TTL hot-path opt-in (issue #189).
268    ConfigDefault {
269        key: "cache.blob.policy.extended",
270        tier: Tier::Optional,
271        default: || text("off"),
272    },
273    // cache.blob.async_promotion — async L2->L1 promotion pool opt-in
274    // (issue #193). When "on", L2 hits return bytes to the caller
275    // immediately and the L1 install runs on a background worker.
276    // Default "off" for safe rollout — legacy synchronous promotion path.
277    ConfigDefault {
278        key: "cache.blob.async_promotion",
279        tier: Tier::Optional,
280        default: || text("off"),
281    },
282];
283
284/// Fetch the JSON default for a matrix key. Returns `None` when the
285/// key is not in the matrix (callers should treat that as a
286/// programming error — unknown key, unknown tier, unknown semantics).
287pub fn default_for(key: &str) -> Option<JsonValue> {
288    MATRIX
289        .iter()
290        .find(|entry| entry.key == key)
291        .map(|entry| (entry.default)())
292}
293
294/// Tier lookup — useful for tests and for introspection commands
295/// that want to report whether a key is expected to self-heal.
296pub fn tier_for(key: &str) -> Option<Tier> {
297    MATRIX
298        .iter()
299        .find(|entry| entry.key == key)
300        .map(|entry| entry.tier)
301}
302
303/// Boot-time self-healing pass: for every `Tier::Critical` key, if
304/// `red_config` does not already contain the key, write the default
305/// in. Idempotent — re-running produces no writes.
306///
307/// `Tier::Optional` keys are never touched here; they stay
308/// transparent-default until a user `SET CONFIG` elevates them.
309pub fn heal_critical_keys(store: &UnifiedStore) {
310    // `set_config_tree` dot-splits the key and stores one row per
311    // leaf, so we handle each matrix entry individually.
312    for entry in MATRIX {
313        if entry.tier != Tier::Critical {
314            continue;
315        }
316        if is_key_present(store, entry.key) {
317            continue;
318        }
319        store.set_config_tree(entry.key, &(entry.default)());
320    }
321}
322
323/// Lightweight presence probe. Avoids loading the whole red_config
324/// collection; scans until the first hit.
325fn is_key_present(store: &UnifiedStore, key: &str) -> bool {
326    let Some(manager) = store.get_collection("red_config") else {
327        return false;
328    };
329    let mut found = false;
330    manager.for_each_entity(|entity| {
331        if let Some(row) = entity.data.as_row() {
332            let entry_key = row.get_field("key").and_then(|v| match v {
333                crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
334                _ => None,
335            });
336            if entry_key == Some(key) {
337                found = true;
338                return false; // short-circuit
339            }
340        }
341        true
342    });
343    found
344}
345
#[cfg(test)]
mod tests {
    use super::*;

    /// A null default would make the boot-time self-heal write nothing
    /// useful, so every row must produce a concrete JSON value.
    #[test]
    fn every_matrix_entry_has_a_default_that_resolves() {
        for entry in MATRIX {
            let value = (entry.default)();
            assert!(
                !matches!(value, JsonValue::Null),
                "matrix key {} has a null default, defeats self-heal",
                entry.key
            );
        }
    }

    /// `default_for` / `tier_for` return the first matching row, so a
    /// duplicated key would silently shadow the later definition.
    #[test]
    fn matrix_keys_are_unique() {
        let mut seen = std::collections::HashSet::new();
        for entry in MATRIX {
            assert!(
                seen.insert(entry.key),
                "matrix key {} is listed more than once",
                entry.key
            );
        }
    }

    #[test]
    fn critical_keys_cover_the_core_guarantees() {
        // This list is a tripwire — if someone drops one of these
        // from Tier A without updating callers, the test catches it.
        let required_critical = [
            "cache.blob.l1_bytes_max",
            "cache.blob.l2_bytes_max",
            "cache.blob.max_namespaces",
            "durability.mode",
            "runtime.result_cache.backend",
            // Kill-switch: must stay self-healing and always visible.
            "runtime.result_cache.enabled",
            "concurrency.locking.enabled",
            "storage.wal.max_interval_ms",
            "storage.bgwriter.delay_ms",
            "storage.btree.lehman_yao",
        ];
        for key in required_critical {
            assert_eq!(
                tier_for(key),
                Some(Tier::Critical),
                "{key} must be a Tier A (Critical) key",
            );
        }
    }

    #[test]
    fn optional_keys_are_not_self_healed() {
        let must_be_optional = [
            "concurrency.locking.deadlock_timeout_ms",
            "storage.wal.min_batch_size",
            "storage.bgwriter.max_pages_per_round",
            "storage.bgwriter.lru_multiplier",
            "storage.bulk_insert.max_buffered_rows",
            "storage.bulk_insert.max_buffered_bytes",
            "storage.hot_update.max_chain_hops",
        ];
        for key in must_be_optional {
            assert_eq!(tier_for(key), Some(Tier::Optional), "{key} tier mismatch");
        }
    }

    /// Known keys resolve to the value their matrix row produces.
    #[test]
    fn known_key_returns_its_default() {
        assert!(matches!(
            default_for("durability.mode"),
            Some(JsonValue::String(ref mode)) if mode == "sync"
        ));
        assert!(matches!(
            default_for("runtime.result_cache.enabled"),
            Some(JsonValue::Bool(true))
        ));
    }

    #[test]
    fn unknown_key_returns_none() {
        assert!(default_for("nonexistent.key").is_none());
        assert!(tier_for("nonexistent.key").is_none());
    }
}