uni_common/config.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2024-2026 Dragonscale Team
3
4use std::path::{Path, PathBuf};
5use std::thread;
6use std::time::Duration;
7
/// Tuning knobs for background compaction.
///
/// The [`Default`] implementation supplies the values quoted in the
/// per-field documentation.
#[derive(Debug, Clone)]
pub struct CompactionConfig {
    /// Whether background compaction runs at all (default: `true`).
    pub enabled: bool,

    /// Number of uncompacted flush generations tolerated before compaction
    /// is triggered (default: 8).
    pub max_l1_runs: usize,

    /// L1 size in bytes beyond which compaction is triggered
    /// (default: 256 MiB).
    pub max_l1_size_bytes: u64,

    /// Age of the oldest L1 run beyond which compaction is triggered
    /// (default: 1 hour).
    pub max_l1_age: Duration,

    /// Interval between background checks (default: 10 s).
    pub check_interval: Duration,

    /// Number of compaction worker threads (default: 1).
    pub worker_threads: usize,

    /// How many frozen L0-csr overlay segments must pile up before
    /// `AdjacencyManager::compact` is spawned post-flush (default: 2).
    ///
    /// Every frozen segment costs extra work on each read until it is merged
    /// back into the Main CSR. A lower value compacts sooner; a higher value
    /// batches more segments per compaction but makes reads slower while the
    /// segments accumulate. The default of 2 keeps the read-side overhead
    /// bounded across a wide range of write rates. See issue #55.
    pub frozen_segments_compact_threshold: usize,
}

impl Default for CompactionConfig {
    /// Returns the defaults documented on each field.
    fn default() -> Self {
        const MIB: u64 = 1024 * 1024;
        const SECS_PER_HOUR: u64 = 60 * 60;

        CompactionConfig {
            enabled: true,
            max_l1_runs: 8,
            max_l1_size_bytes: 256 * MIB,
            max_l1_age: Duration::from_secs(SECS_PER_HOUR),
            check_interval: Duration::from_secs(10),
            worker_threads: 1,
            frozen_segments_compact_threshold: 2,
        }
    }
}
52
/// Settings that govern background index rebuilding.
///
/// The [`Default`] implementation supplies the values quoted in the
/// per-field documentation.
#[derive(Debug, Clone)]
pub struct IndexRebuildConfig {
    /// How many times a failed index build is retried (default: 3).
    pub max_retries: u32,

    /// Pause between two retry attempts (default: 60 s).
    pub retry_delay: Duration,

    /// Polling interval for pending index rebuild tasks (default: 5 s).
    pub worker_check_interval: Duration,

    /// Row growth ratio that triggers a rebuild (default: 0.5, i.e. 50%).
    /// Set to 0.0 to disable.
    pub growth_trigger_ratio: f64,

    /// Maximum index age before a rebuild. `None` (the default) disables the
    /// time-based trigger.
    pub max_index_age: Option<Duration>,

    /// Whether rebuilds are scheduled automatically after a flush
    /// (default: `false`).
    pub auto_rebuild_enabled: bool,
}

impl Default for IndexRebuildConfig {
    /// Returns the defaults documented on each field.
    fn default() -> Self {
        let retry_delay = Duration::from_secs(60);
        let worker_check_interval = Duration::from_secs(5);

        IndexRebuildConfig {
            max_retries: 3,
            retry_delay,
            worker_check_interval,
            growth_trigger_ratio: 0.5,
            max_index_age: None,
            auto_rebuild_enabled: false,
        }
    }
}
87
/// Write back-pressure thresholds, expressed in uncompacted flush generations.
#[derive(Debug, Clone, Copy)]
pub struct WriteThrottleConfig {
    /// Number of uncompacted flush generations at which throttling starts
    /// (default: 16).
    pub soft_limit: usize,

    /// Number of uncompacted flush generations at which writes stop entirely
    /// (default: 32).
    pub hard_limit: usize,

    /// Base delay applied while throttling (default: 10 ms).
    pub base_delay: Duration,
}

impl Default for WriteThrottleConfig {
    /// Returns the defaults documented on each field.
    fn default() -> Self {
        let base_delay = Duration::from_millis(10);

        WriteThrottleConfig {
            soft_limit: 16,
            hard_limit: 32,
            base_delay,
        }
    }
}
109
/// Timeout and retry settings for object-store access.
///
/// NOTE(review): the code that consumes these values is not part of this
/// file, so the per-field descriptions follow the field names. Confirm the
/// exact semantics against the object-store client.
#[derive(Debug, Clone)]
pub struct ObjectStoreConfig {
    /// Timeout for establishing a connection (default: 10 s).
    pub connect_timeout: Duration,
    /// Timeout for read operations (default: 30 s).
    pub read_timeout: Duration,
    /// Timeout for write operations (default: 60 s).
    pub write_timeout: Duration,
    /// Maximum number of retries (default: 3).
    pub max_retries: u32,
    /// Base value for the retry backoff (default: 100 ms).
    pub retry_backoff_base: Duration,
    /// Upper bound for the retry backoff (default: 10 s).
    pub retry_backoff_max: Duration,
}

impl Default for ObjectStoreConfig {
    /// Returns the defaults documented on each field.
    fn default() -> Self {
        let secs = Duration::from_secs;

        ObjectStoreConfig {
            connect_timeout: secs(10),
            read_timeout: secs(30),
            write_timeout: secs(60),
            max_retries: 3,
            retry_backoff_base: Duration::from_millis(100),
            retry_backoff_max: secs(10),
        }
    }
}
132
/// File-system sandbox settings.
///
/// Determines which paths the BACKUP, COPY, and EXPORT commands may touch.
///
/// The derived [`Default`] leaves the sandbox disabled with no allowed paths,
/// which keeps embedded-mode behavior backward compatible. Server mode with
/// untrusted clients MUST enable it.
#[derive(Debug, Default, Clone)]
pub struct FileSandboxConfig {
    /// `true` restricts file operations to `allowed_paths`; `false` permits
    /// every path (NOT RECOMMENDED for server mode).
    pub enabled: bool,

    /// Base directories inside which file operations are permitted.
    /// Entries must be absolute, canonical paths.
    pub allowed_paths: Vec<PathBuf>,
}
149
/// How the database is deployed.
///
/// The mode selects the security defaults that suit the deployment.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum DeploymentMode {
    /// Embedded/library use: the host application is in charge of access
    /// control, so the file sandbox stays off by default for backward
    /// compatibility.
    #[default]
    Embedded,
    /// Server use with untrusted clients: the file sandbox is on by default
    /// and limited to a restricted set of paths.
    Server,
}
163
/// HTTP server configuration.
///
/// Controls CORS, authentication, and other HTTP-related security settings.
///
/// # Security
///
/// **CWE-942 (Overly Permissive CORS)**, **CWE-306 (Missing Authentication)**:
/// Production deployments should configure explicit `allowed_origins` and
/// enable API key authentication.
///
/// The [`Debug`](std::fmt::Debug) implementation is written by hand so that
/// the API key never reaches logs or panic messages (CWE-532).
#[derive(Clone)]
pub struct ServerConfig {
    /// Allowed CORS origins.
    ///
    /// - Empty vector: No CORS headers (most restrictive)
    /// - `["*"]`: Allow all origins (NOT RECOMMENDED for production)
    /// - Explicit list: Only allow specified origins (RECOMMENDED)
    ///
    /// # Security
    ///
    /// **CWE-942**: Using `["*"]` allows any website to make requests to
    /// your server, potentially exposing sensitive data.
    pub allowed_origins: Vec<String>,

    /// Optional API key for request authentication.
    ///
    /// When set, all API requests must include the header:
    /// `X-API-Key: <key>`
    ///
    /// # Security
    ///
    /// **CWE-306**: Without authentication, any client can execute queries.
    /// Enable this for any deployment accessible beyond localhost.
    pub api_key: Option<String>,

    /// Whether to require API key for metrics endpoint.
    ///
    /// Default: false (metrics are public for observability tooling)
    pub require_auth_for_metrics: bool,
}

impl std::fmt::Debug for ServerConfig {
    /// Formats the config like the derived `Debug` would, except that a
    /// configured API key is shown as `Some("<redacted>")`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Keep the Some/None distinction visible without printing the key.
        let api_key = self.api_key.as_ref().map(|_| "<redacted>");
        f.debug_struct("ServerConfig")
            .field("allowed_origins", &self.allowed_origins)
            .field("api_key", &api_key)
            .field("require_auth_for_metrics", &self.require_auth_for_metrics)
            .finish()
    }
}

impl Default for ServerConfig {
    /// Returns a config that allows only the `http://localhost:3000` origin,
    /// has no API key, and leaves the metrics endpoint unauthenticated.
    fn default() -> Self {
        Self {
            // Default to localhost-only origin for development safety
            allowed_origins: vec!["http://localhost:3000".to_string()],
            api_key: None,
            require_auth_for_metrics: false,
        }
    }
}

impl ServerConfig {
    /// Create a permissive config for local development only.
    ///
    /// # Security
    ///
    /// **WARNING**: Do not use in production. This config allows all CORS origins
    /// and has no authentication.
    #[must_use]
    pub fn development() -> Self {
        Self {
            allowed_origins: vec!["*".to_string()],
            api_key: None,
            require_auth_for_metrics: false,
        }
    }

    /// Create a production config with explicit origins and required API key.
    ///
    /// The metrics endpoint also requires the key in the returned config.
    ///
    /// # Panics
    ///
    /// Panics if `api_key` is empty.
    #[must_use]
    pub fn production(allowed_origins: Vec<String>, api_key: String) -> Self {
        assert!(
            !api_key.is_empty(),
            "API key must not be empty for production"
        );
        Self {
            allowed_origins,
            api_key: Some(api_key),
            require_auth_for_metrics: true,
        }
    }

    /// Returns a security warning if the config is insecure.
    ///
    /// A warning is produced when the origin list contains `"*"`, when no API
    /// key is set, or both. `None` means neither condition holds.
    pub fn security_warning(&self) -> Option<&'static str> {
        // Compare in place instead of allocating a `String` per lookup.
        let wildcard_cors = self.allowed_origins.iter().any(|origin| origin == "*");
        let unauthenticated = self.api_key.is_none();

        match (wildcard_cors, unauthenticated) {
            (true, true) => Some(
                "Server config has permissive CORS (allow all origins) and no API key. \
                 This is insecure for production deployments.",
            ),
            (true, false) => Some(
                "Server config has permissive CORS (allow all origins). \
                 Consider restricting to specific origins for production.",
            ),
            (false, true) => Some(
                "Server config has no API key authentication. \
                 Enable api_key for production deployments.",
            ),
            (false, false) => None,
        }
    }
}
271
272impl FileSandboxConfig {
273 /// Creates a sandboxed config that only allows operations in the specified directories.
274 pub fn sandboxed(paths: Vec<PathBuf>) -> Self {
275 Self {
276 enabled: true,
277 allowed_paths: paths,
278 }
279 }
280
281 /// Creates a config with appropriate defaults for the deployment mode.
282 ///
283 /// # Security
284 ///
285 /// - **Embedded mode**: Sandbox disabled (host application controls access)
286 /// - **Server mode**: Sandbox enabled with default paths `/var/lib/uni/data` and
287 /// `/var/lib/uni/backups`
288 ///
289 /// **CWE-22 (Path Traversal)**: Server deployments MUST enable the sandbox to
290 /// prevent arbitrary file read/write via BACKUP, COPY, and EXPORT commands.
291 pub fn default_for_mode(mode: DeploymentMode) -> Self {
292 match mode {
293 DeploymentMode::Embedded => Self {
294 enabled: false,
295 allowed_paths: vec![],
296 },
297 DeploymentMode::Server => Self {
298 enabled: true,
299 allowed_paths: vec![
300 PathBuf::from("/var/lib/uni/data"),
301 PathBuf::from("/var/lib/uni/backups"),
302 ],
303 },
304 }
305 }
306
307 /// Returns a security warning message if the sandbox is disabled.
308 ///
309 /// Call this at startup to alert administrators about potential security risks.
310 /// Returns `Some(message)` if a warning should be displayed, `None` otherwise.
311 ///
312 /// # Security
313 ///
314 /// **CWE-22 (Path Traversal)**, **CWE-73 (External Control of File Name)**:
315 /// Disabled sandbox allows unrestricted filesystem access for BACKUP, COPY,
316 /// and EXPORT commands, which can lead to:
317 /// - Arbitrary file read/write in server deployments
318 /// - Data exfiltration to attacker-controlled paths
319 /// - Potential privilege escalation via file overwrites
320 ///
321 /// # Example
322 ///
323 /// ```ignore
324 /// if let Some(warning) = config.file_sandbox.security_warning() {
325 /// tracing::warn!(target: "uni_db::security", "{}", warning);
326 /// }
327 /// ```
328 pub fn security_warning(&self) -> Option<&'static str> {
329 if !self.enabled {
330 Some(
331 "File sandbox is DISABLED. This allows unrestricted filesystem access \
332 for BACKUP, COPY, and EXPORT commands. Enable sandbox for server \
333 deployments: file_sandbox.enabled = true",
334 )
335 } else {
336 None
337 }
338 }
339
340 /// Returns whether the sandbox is in a potentially insecure state.
341 ///
342 /// Returns `true` if the sandbox is disabled or enabled with no allowed paths.
343 pub fn is_potentially_insecure(&self) -> bool {
344 !self.enabled || self.allowed_paths.is_empty()
345 }
346
347 /// Validate that a path is within the allowed sandbox.
348 /// Returns Ok(canonical_path) if allowed, Err if not.
349 pub fn validate_path(&self, path: &str) -> Result<PathBuf, String> {
350 if !self.enabled {
351 // Sandbox disabled - allow all paths
352 return Ok(PathBuf::from(path));
353 }
354
355 if self.allowed_paths.is_empty() {
356 return Err("File sandbox is enabled but no allowed paths configured".to_string());
357 }
358
359 // Resolve the path to canonical form to prevent traversal attacks
360 let input_path = Path::new(path);
361
362 // For paths that don't exist yet (e.g., export destinations), we need to
363 // check their parent directory exists and is within allowed paths
364 let canonical = if input_path.exists() {
365 input_path
366 .canonicalize()
367 .map_err(|e| format!("Failed to canonicalize path: {}", e))?
368 } else {
369 // Path doesn't exist - check parent
370 let parent = input_path
371 .parent()
372 .ok_or_else(|| "Invalid path: no parent directory".to_string())?;
373 if !parent.exists() {
374 return Err(format!(
375 "Parent directory does not exist: {}",
376 parent.display()
377 ));
378 }
379 let canonical_parent = parent
380 .canonicalize()
381 .map_err(|e| format!("Failed to canonicalize parent: {}", e))?;
382 // Reconstruct with canonical parent + original filename
383 let filename = input_path
384 .file_name()
385 .ok_or_else(|| "Invalid path: no filename".to_string())?;
386 canonical_parent.join(filename)
387 };
388
389 // Check if the canonical path is within any allowed directory
390 for allowed in &self.allowed_paths {
391 // Ensure allowed path is canonical too
392 let canonical_allowed = if allowed.exists() {
393 allowed.canonicalize().unwrap_or_else(|_| allowed.clone())
394 } else {
395 allowed.clone()
396 };
397
398 if canonical.starts_with(&canonical_allowed) {
399 return Ok(canonical);
400 }
401 }
402
403 Err(format!(
404 "Path '{}' is outside allowed sandbox directories. Allowed: {:?}",
405 path, self.allowed_paths
406 ))
407 }
408}
409
410#[derive(Clone, Debug)]
411pub struct UniConfig {
412 /// Maximum adjacency cache size in bytes (default: 1GB)
413 pub cache_size: usize,
414
415 /// Number of worker threads for parallel execution
416 pub parallelism: usize,
417
418 /// Size of each data morsel/batch (number of rows)
419 pub batch_size: usize,
420
421 /// Maximum size of traversal frontier before pruning
422 pub max_frontier_size: usize,
423
424 /// Auto-flush threshold for L0 buffer (default: 10_000 mutations)
425 pub auto_flush_threshold: usize,
426
427 /// Auto-flush interval for L0 buffer (default: 5 seconds).
428 /// Flush triggers if time elapsed AND mutation count >= auto_flush_min_mutations.
429 /// Set to None to disable time-based flush.
430 pub auto_flush_interval: Option<Duration>,
431
432 /// Minimum mutations required before the time-based flush triggers
433 /// (default: 1).
434 ///
435 /// Prevents unnecessary flushes when activity is minimal. Raising this
436 /// (e.g., to 1000) lets small bursts coalesce into one flush — useful
437 /// for benchmark workloads — but for active databases with high write
438 /// rates, raising it reduces flush frequency and lets the active overlay
439 /// grow larger between flushes, which can hurt read latency. Tune with
440 /// `compaction.frozen_segments_compact_threshold` together. See issue
441 /// #55 for the trade-off discussion.
442 pub auto_flush_min_mutations: usize,
443
444 /// Enable write-ahead logging (default: true)
445 pub wal_enabled: bool,
446
447 /// Compaction configuration
448 pub compaction: CompactionConfig,
449
450 /// Write throttling configuration
451 pub throttle: WriteThrottleConfig,
452
453 /// File sandbox configuration for BACKUP/COPY/EXPORT commands.
454 /// MUST be enabled with allowed paths in server mode to prevent arbitrary file access.
455 pub file_sandbox: FileSandboxConfig,
456
457 /// Default query execution timeout (default: 30s)
458 pub query_timeout: Duration,
459
460 /// Maximum wall time a transaction commit may take before it is aborted with
461 /// `CommitTimeout` (default: 5s). This guards against a commit blocking on the
462 /// writer/flush lock, but it also bounds the commit's own compute time — so
463 /// workloads that commit very large transactions in a single shot (bulk-history
464 /// backfills, or unoptimized debug builds) may need to raise it.
465 pub commit_timeout: Duration,
466
467 /// Default maximum memory per query (default: 1GB)
468 pub max_query_memory: usize,
469
470 /// Maximum transaction buffer memory in bytes (default: 1GB).
471 /// Limits memory usage during transactions to prevent OOM.
472 pub max_transaction_memory: usize,
473
474 /// Maximum rows for in-memory compaction (default: 5M, ~725MB at 145 bytes/row).
475 /// Configurable OOM guard to prevent memory exhaustion during compaction.
476 pub max_compaction_rows: usize,
477
478 /// Maximum iterations for recursive CTE evaluation (default: 1000).
479 pub max_recursive_cte_iterations: usize,
480
481 /// Object store resilience configuration
482 pub object_store: ObjectStoreConfig,
483
484 /// Background index rebuild configuration
485 pub index_rebuild: IndexRebuildConfig,
486
487 /// When true, reject writes that reference labels or edge types not declared
488 /// in the schema. Default: false (schemaless mode — any label or edge type
489 /// is accepted and dynamically registered).
490 pub strict_schema: bool,
491
492 /// Enable Lance `MergeInsert` for SET-only flushes (default: false).
493 ///
494 /// When true, `Writer::insert_vertex_partial` records the touched
495 /// property keys into L0 and the flush emits a partial-column source
496 /// to Lance via `MergeInsertBuilder` — skipping the read of (and write
497 /// of) the unchanged columns. Wide-row schemas with vector indexes
498 /// benefit most (~17 ms/row → ~3 ms/row on the issue #72 ingest
499 /// workload). See the Round-11 plan section in
500 /// `plan-and-implement-a-valiant-flame.md`.
501 pub partial_lance_writes: bool,
502
503 /// When true, auto-embedding for vertex writes is deferred from the
504 /// per-row `insert_vertex_*` path to the next L1 flush, where the
505 /// existing `process_embeddings_for_batch` issues one model call for
506 /// the whole flush batch instead of N per-row calls.
507 ///
508 /// Trade-off: in-tx reads of the embedding column on a freshly
509 /// SET/inserted vertex see the OLD storage value (or no value, for
510 /// new vertices) until flush. Existing behavior is identical to
511 /// today's `process_embeddings_impl(target_prop present)` short-circuit
512 /// (writer.rs:2727) — updating only the source text never refreshes
513 /// the embedding mid-tx, deferred or not. Opt-in for workloads that
514 /// don't read embeddings between write and commit.
515 ///
516 /// Default: `false` (preserves bit-for-bit compatibility with
517 /// pre-Phase-B releases).
518 pub defer_embeddings: bool,
519
520 /// Per-fork L1 fragment-count threshold above which a `tracing::warn!`
521 /// fires once per crossing during fork flush. Long-lived heavy-write
522 /// forks accumulate fragments because fork compaction is deferred to
523 /// Phase 5; this surfaces the risk operationally. Default: 256.
524 pub fork_fragment_warn_threshold: usize,
525
526 /// Per-transaction VID/EID reservoir refill size. Each `Transaction`
527 /// pre-reserves this many IDs at a time from the global `IdAllocator`,
528 /// amortizing its `tokio::Mutex` over `N` allocations. Tradeoff:
529 /// larger = fewer global-mutex acquisitions but more wasted IDs on
530 /// short transactions (capped at `batch_size - 1` per tx). u64 ID space
531 /// makes the waste negligible. Default: 16.
532 pub tx_id_reservoir_batch: usize,
533
534 /// When `true`, `check_flush` on the commit path dispatches via the
535 /// async path (`flush_to_l1_async`): rotate L0 under `flush_lock`,
536 /// then spawn the streaming + finalize work on a background task.
537 /// Concurrent committers no longer queue on the flush's long I/O.
538 ///
539 /// When `false` (default for now), `check_flush` calls the original
540 /// synchronous `flush_to_l1` and holds `flush_lock` across the full
541 /// L1-streaming write. This is the kill-switch.
542 pub async_flush_enabled: bool,
543
544 /// Maximum number of L0→L1 flushes that may be in-flight simultaneously
545 /// when `async_flush_enabled` is true. The (N+1)th rotate blocks until
546 /// one of the in-flight flushes finalizes. Bounds WAL retention and
547 /// memory growth. Default: 2.
548 pub max_pending_flushes: usize,
549
550 /// Maximum time `drop_fork` will wait for pending async flushes on
551 /// that fork before failing with `PendingFlushTimeout`. Only meaningful
552 /// when `async_flush_enabled` is true. Default: 10s.
553 pub drop_fork_drain_timeout: Duration,
554
555 /// Phase 4a: cap on total fork count (Active + Pending + Tombstoned).
556 /// `None` = unbounded. When set, `Session::fork(name).await` errors
557 /// with `UniError::ForkBudgetExceeded` once the cap is reached.
558 /// Tombstoned forks count because they still hold branch state on
559 /// disk until recovery completes; counting them prevents churn-thrash.
560 ///
561 /// **Production guidance (L11):** the default is `None` (unbounded) to
562 /// avoid surprising existing embedders, but each fork's branches scale
563 /// with schema size and persist until dropped, so unbounded fork churn
564 /// is an on-disk growth risk. Production deployments that create forks
565 /// from untrusted/automated callers SHOULD set an explicit `max_forks`
566 /// (and ideally `fork_default_ttl`).
567 pub max_forks: Option<usize>,
568
569 /// Phase 4a: default TTL applied to forks when the user does not
570 /// supply one via `session.fork(name).ttl(...)`. `None` = no TTL.
571 /// The background sweeper drops forks whose `ttl_expires_at` is in
572 /// the past via `drop_fork_cascade`.
573 pub fork_default_ttl: Option<Duration>,
574
575 /// Phase 4a: how often the background TTL sweeper polls the
576 /// registry for expired forks. Default: 60 seconds.
577 pub fork_sweeper_interval: Duration,
578
579 /// Phase 4a: skip spawning the TTL sweeper. Tests should set this
580 /// to `true` when they want deterministic control over fork
581 /// lifetimes; production should leave it `false`.
582 pub disable_fork_sweeper: bool,
583
584 /// Phase 5a: minimum per-fork row count (per label) before the
585 /// background `IndexRebuildManager` schedules a fork-local index
586 /// build. Below this threshold, fork reads inherit primary's
587 /// indexes through Lance `base_paths`; above it, the planner
588 /// switches to `FusedIndexScan` once the build completes. Default
589 /// 10,000 rows per spec §8.
590 pub fork_index_build_threshold: u64,
591
592 /// Phase 5a-impl Step 7: how often the background fork index
593 /// builder polls active forks for build candidates. Default
594 /// 30 seconds.
595 pub fork_index_builder_interval: Duration,
596
597 /// Phase 5a-impl Step 7: skip spawning the background fork index
598 /// builder. Tests that exercise the manual `Session::build_fork_local_index`
599 /// trigger should set this to `true` so timing isn't dependent on
600 /// the polling cadence.
601 pub disable_fork_index_builder: bool,
602
603 /// Enable Serializable Snapshot Isolation and optimistic concurrency
604 /// control (default: `true`).
605 ///
606 /// When `true`, read-write transactions read from a pinned L0 snapshot,
607 /// track an item-level read/write-set, and validate at commit under
608 /// `flush_lock`: a write-write or read-write conflict against a commit
609 /// landed since the transaction's snapshot aborts with
610 /// `UniError::SerializationConflict`, a duplicate concurrent `MERGE` on a
611 /// unique key aborts with `UniError::ConstraintConflict`, and `FOR UPDATE`
612 /// acquires per-key row locks. Callers should wrap contended writes in
613 /// `Session::transact_with_retry`, which re-runs retriable conflicts.
614 ///
615 /// When `false`, the engine reverts to last-writer-wins: concurrent
616 /// read-modify-write transactions can silently lose updates, concurrent
617 /// `MERGE` can create duplicate unique keys, and `FOR UPDATE` is a no-op
618 /// (a `tracing::warn!` is emitted when a query requests it). Reads run
619 /// against the live L0 with no snapshot pinning. This reproduces the
620 /// pre-SSI behavior bit-for-bit and skips the (near-zero, but non-nil)
621 /// read-set/validation overhead — appropriate only for single-writer
622 /// workloads or callers that guard read-modify-write externally.
623 ///
624 /// Defaults to `true` because silent lost updates are a correctness hazard
625 /// for any concurrent-writer workload.
626 pub ssi_enabled: bool,
627}
628
629impl Default for UniConfig {
630 fn default() -> Self {
631 let parallelism = thread::available_parallelism()
632 .map(|n| n.get())
633 .unwrap_or(4);
634
635 Self {
636 cache_size: 1024 * 1024 * 1024, // 1GB
637 parallelism,
638 batch_size: 1024, // Default morsel size
639 max_frontier_size: 1_000_000,
640 auto_flush_threshold: 10_000,
641 auto_flush_interval: Some(Duration::from_secs(5)),
642 auto_flush_min_mutations: 1,
643 wal_enabled: true,
644 compaction: CompactionConfig::default(),
645 throttle: WriteThrottleConfig::default(),
646 file_sandbox: FileSandboxConfig::default(),
647 query_timeout: Duration::from_secs(30),
648 commit_timeout: Duration::from_secs(5),
649 max_query_memory: 1024 * 1024 * 1024, // 1GB
650 max_transaction_memory: 1024 * 1024 * 1024, // 1GB
651 max_compaction_rows: 5_000_000, // 5M rows
652 max_recursive_cte_iterations: 1000,
653 object_store: ObjectStoreConfig::default(),
654 index_rebuild: IndexRebuildConfig::default(),
655 strict_schema: false,
656 partial_lance_writes: false,
657 defer_embeddings: false,
658 fork_fragment_warn_threshold: 256,
659 tx_id_reservoir_batch: 16,
660 // Default ON as of the Item-B-deep-fix landing (per-table
661 // write serialization + Lance Table cache removed + drain
662 // in flush_to_l1). Validated by full UNI_ASYNC_FLUSH=1
663 // cross-crate nextest: 1754/1754 pass.
664 //
665 // `UNI_ASYNC_FLUSH=0` / `=false` / `=no` (case-insensitive)
666 // explicitly DISABLES async flush — useful for bisecting
667 // suspected async-flush regressions and for the sync-only
668 // benchmarks in `flush_pressure.rs`. Unset = default
669 // behavior (true).
670 async_flush_enabled: std::env::var("UNI_ASYNC_FLUSH")
671 .ok()
672 .map(|v| {
673 let v = v.to_ascii_lowercase();
674 !(v == "0" || v == "false" || v == "no")
675 })
676 .unwrap_or(true),
677 max_pending_flushes: 2,
678 drop_fork_drain_timeout: Duration::from_secs(10),
679 max_forks: None,
680 fork_default_ttl: None,
681 fork_sweeper_interval: Duration::from_secs(60),
682 disable_fork_sweeper: false,
683 fork_index_build_threshold: 10_000,
684 fork_index_builder_interval: Duration::from_secs(30),
685 disable_fork_index_builder: false,
686 // Correctness-first default: SSI/OCC on. See the field docs for
687 // the behavioral contract and the migration note (concurrent
688 // writers now observe aborts instead of silent lost updates;
689 // wrap them in `Session::transact_with_retry`).
690 ssi_enabled: true,
691 }
692 }
693}
694
/// Cloud storage backend configuration.
///
/// Supports Amazon S3, Google Cloud Storage, and Azure Blob Storage.
/// Each variant contains the credentials and connection parameters for
/// its respective cloud provider.
///
/// # Security
///
/// [`Debug`](std::fmt::Debug) is implemented by hand: secret values (secret
/// keys, session/SAS tokens, inline service-account keys) are rendered as
/// `Some("<redacted>")` so that logging a config does not leak credentials
/// (CWE-532). Non-secret fields are printed as the derived impl would.
///
/// # Examples
///
/// ```ignore
/// // Create S3 configuration from environment variables
/// let config = CloudStorageConfig::s3_from_env("my-bucket");
///
/// // Create explicit S3 configuration for LocalStack testing
/// let config = CloudStorageConfig::S3 {
///     bucket: "test-bucket".to_string(),
///     region: Some("us-east-1".to_string()),
///     endpoint: Some("http://localhost:4566".to_string()),
///     access_key_id: Some("test".to_string()),
///     secret_access_key: Some("test".to_string()),
///     session_token: None,
///     virtual_hosted_style: false,
/// };
/// ```
#[derive(Clone)]
pub enum CloudStorageConfig {
    /// Amazon S3 storage configuration.
    S3 {
        /// S3 bucket name.
        bucket: String,
        /// AWS region (e.g., "us-east-1"). Uses AWS_REGION env var if None.
        region: Option<String>,
        /// Custom endpoint URL for S3-compatible services (MinIO, LocalStack).
        endpoint: Option<String>,
        /// AWS access key ID. Uses AWS_ACCESS_KEY_ID env var if None.
        access_key_id: Option<String>,
        /// AWS secret access key. Uses AWS_SECRET_ACCESS_KEY env var if None.
        secret_access_key: Option<String>,
        /// AWS session token for temporary credentials.
        session_token: Option<String>,
        /// Use virtual-hosted-style requests (bucket.s3.region.amazonaws.com).
        virtual_hosted_style: bool,
    },
    /// Google Cloud Storage configuration.
    Gcs {
        /// GCS bucket name.
        bucket: String,
        /// Path to service account JSON key file.
        service_account_path: Option<String>,
        /// Service account JSON key content (alternative to path).
        service_account_key: Option<String>,
    },
    /// Azure Blob Storage configuration.
    Azure {
        /// Azure container name.
        container: String,
        /// Azure storage account name.
        account: String,
        /// Azure storage account access key.
        access_key: Option<String>,
        /// Azure SAS token for limited access.
        sas_token: Option<String>,
    },
}

impl std::fmt::Debug for CloudStorageConfig {
    /// Formats the variant and its fields, masking every secret value.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Keeps the Some/None distinction visible without printing the value.
        fn redacted(secret: &Option<String>) -> Option<&'static str> {
            secret.as_ref().map(|_| "<redacted>")
        }

        match self {
            Self::S3 {
                bucket,
                region,
                endpoint,
                access_key_id,
                secret_access_key,
                session_token,
                virtual_hosted_style,
            } => f
                .debug_struct("S3")
                .field("bucket", bucket)
                .field("region", region)
                .field("endpoint", endpoint)
                // The access key ID identifies the credential but is not
                // itself the secret, so it stays visible for diagnostics.
                .field("access_key_id", access_key_id)
                .field("secret_access_key", &redacted(secret_access_key))
                .field("session_token", &redacted(session_token))
                .field("virtual_hosted_style", virtual_hosted_style)
                .finish(),
            Self::Gcs {
                bucket,
                service_account_path,
                service_account_key,
            } => f
                .debug_struct("Gcs")
                .field("bucket", bucket)
                .field("service_account_path", service_account_path)
                .field("service_account_key", &redacted(service_account_key))
                .finish(),
            Self::Azure {
                container,
                account,
                access_key,
                sas_token,
            } => f
                .debug_struct("Azure")
                .field("container", container)
                .field("account", account)
                .field("access_key", &redacted(access_key))
                .field("sas_token", &redacted(sas_token))
                .finish(),
        }
    }
}
758
759impl CloudStorageConfig {
760 /// Creates an S3 configuration using environment variables.
761 ///
762 /// Reads credentials from standard AWS environment variables:
763 /// - `AWS_ACCESS_KEY_ID`
764 /// - `AWS_SECRET_ACCESS_KEY`
765 /// - `AWS_SESSION_TOKEN` (optional)
766 /// - `AWS_REGION` or `AWS_DEFAULT_REGION`
767 /// - `AWS_ENDPOINT_URL` (optional, for S3-compatible services)
768 #[must_use]
769 pub fn s3_from_env(bucket: &str) -> Self {
770 Self::S3 {
771 bucket: bucket.to_string(),
772 region: std::env::var("AWS_REGION")
773 .or_else(|_| std::env::var("AWS_DEFAULT_REGION"))
774 .ok(),
775 endpoint: std::env::var("AWS_ENDPOINT_URL").ok(),
776 access_key_id: std::env::var("AWS_ACCESS_KEY_ID").ok(),
777 secret_access_key: std::env::var("AWS_SECRET_ACCESS_KEY").ok(),
778 session_token: std::env::var("AWS_SESSION_TOKEN").ok(),
779 virtual_hosted_style: false,
780 }
781 }
782
783 /// Creates a GCS configuration using environment variables.
784 ///
785 /// Reads service account path from `GOOGLE_APPLICATION_CREDENTIALS`.
786 #[must_use]
787 pub fn gcs_from_env(bucket: &str) -> Self {
788 Self::Gcs {
789 bucket: bucket.to_string(),
790 service_account_path: std::env::var("GOOGLE_APPLICATION_CREDENTIALS").ok(),
791 service_account_key: None,
792 }
793 }
794
795 /// Creates an Azure configuration using environment variables.
796 ///
797 /// Reads credentials from Azure environment variables:
798 /// - `AZURE_STORAGE_ACCOUNT`
799 /// - `AZURE_STORAGE_ACCESS_KEY` (optional)
800 /// - `AZURE_STORAGE_SAS_TOKEN` (optional)
801 ///
802 /// # Panics
803 ///
804 /// Panics if `AZURE_STORAGE_ACCOUNT` is not set.
805 #[must_use]
806 pub fn azure_from_env(container: &str) -> Self {
807 Self::Azure {
808 container: container.to_string(),
809 account: std::env::var("AZURE_STORAGE_ACCOUNT")
810 .expect("AZURE_STORAGE_ACCOUNT environment variable required"),
811 access_key: std::env::var("AZURE_STORAGE_ACCESS_KEY").ok(),
812 sas_token: std::env::var("AZURE_STORAGE_SAS_TOKEN").ok(),
813 }
814 }
815
816 /// Returns the bucket/container name for this configuration.
817 #[must_use]
818 pub fn bucket_name(&self) -> &str {
819 match self {
820 Self::S3 { bucket, .. } => bucket,
821 Self::Gcs { bucket, .. } => bucket,
822 Self::Azure { container, .. } => container,
823 }
824 }
825
826 /// Returns a URL-style identifier for this storage location.
827 #[must_use]
828 pub fn to_url(&self) -> String {
829 match self {
830 Self::S3 { bucket, .. } => format!("s3://{bucket}"),
831 Self::Gcs { bucket, .. } => format!("gs://{bucket}"),
832 Self::Azure {
833 container, account, ..
834 } => format!("az://{account}/{container}"),
835 }
836 }
837}
838
#[cfg(test)]
mod security_tests {
    use super::*;

    /// Tests for CWE-22 (Path Traversal) prevention in file sandbox.
    mod file_sandbox {
        use super::*;

        /// Builds an enabled sandbox restricted to `roots`.
        fn enabled_with(roots: &[&str]) -> FileSandboxConfig {
            FileSandboxConfig {
                enabled: true,
                allowed_paths: roots.iter().map(|root| PathBuf::from(*root)).collect(),
            }
        }

        #[test]
        fn test_sandbox_disabled_allows_all_paths() {
            let sandbox = FileSandboxConfig::default();
            assert!(!sandbox.enabled);
            // A disabled sandbox accepts any path.
            let outcome = sandbox.validate_path("/tmp/test");
            assert!(outcome.is_ok());
        }

        #[test]
        fn test_sandbox_enabled_with_no_paths_rejects() {
            let sandbox = enabled_with(&[]);
            let message = sandbox
                .validate_path("/tmp/test")
                .expect_err("an enabled sandbox without roots must reject every path");
            assert!(message.contains("no allowed paths configured"));
        }

        #[test]
        fn test_sandbox_rejects_outside_path() {
            let sandbox = enabled_with(&["/var/lib/uni"]);
            let message = sandbox
                .validate_path("/etc/passwd")
                .expect_err("a path outside the allowed roots must be rejected");
            assert!(message.contains("outside allowed sandbox"));
        }

        #[test]
        fn test_is_potentially_insecure() {
            // Disabled counts as insecure.
            assert!(FileSandboxConfig::default().is_potentially_insecure());

            // Enabled without any root counts as insecure too.
            assert!(enabled_with(&[]).is_potentially_insecure());

            // Enabled with at least one root is considered secure.
            let with_root = FileSandboxConfig::sandboxed(vec![PathBuf::from("/data")]);
            assert!(!with_root.is_potentially_insecure());
        }

        #[test]
        fn test_security_warning_when_disabled() {
            let off = FileSandboxConfig::default();
            assert!(off.security_warning().is_some());

            let on = FileSandboxConfig::sandboxed(vec![PathBuf::from("/data")]);
            assert!(on.security_warning().is_none());
        }

        #[test]
        fn test_deployment_mode_defaults() {
            let for_embedded = FileSandboxConfig::default_for_mode(DeploymentMode::Embedded);
            assert!(!for_embedded.enabled);

            let for_server = FileSandboxConfig::default_for_mode(DeploymentMode::Server);
            assert!(for_server.enabled);
            assert!(!for_server.allowed_paths.is_empty());
        }
    }
}