// uni_common/config.rs

// SPDX-License-Identifier: Apache-2.0
// Copyright 2024-2026 Dragonscale Team

use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;

/// Settings for background compaction.
///
/// The documented defaults are the values produced by the `Default`
/// implementation of this type.
#[derive(Clone, Debug)]
pub struct CompactionConfig {
    /// Enable background compaction (default: true).
    pub enabled: bool,

    /// Max uncompacted flush generations before triggering compaction (default: 8).
    pub max_l1_runs: usize,

    /// Max L1 size in bytes before compaction (default: 256MB).
    pub max_l1_size_bytes: u64,

    /// Max age of the oldest L1 run before compaction (default: 1 hour).
    pub max_l1_age: Duration,

    /// Background check interval (default: 10s).
    pub check_interval: Duration,

    /// Number of compaction worker threads (default: 1).
    pub worker_threads: usize,

    /// Number of frozen L0-csr overlay segments that must accumulate before
    /// `AdjacencyManager::compact` is spawned post-flush (default: 2).
    ///
    /// Each frozen segment adds per-read overhead until merged back into the
    /// Main CSR. Lowering this triggers compaction sooner; higher values
    /// batch more segments per compaction at the cost of slower reads while
    /// they accumulate. The default of 2 keeps the read-side overhead
    /// bounded across a wide range of write rates. See issue #55.
    pub frozen_segments_compact_threshold: usize,
}

39impl Default for CompactionConfig {
40    fn default() -> Self {
41        Self {
42            enabled: true,
43            max_l1_runs: 8,
44            max_l1_size_bytes: 256 * 1024 * 1024,
45            max_l1_age: Duration::from_secs(3600),
46            check_interval: Duration::from_secs(10),
47            worker_threads: 1,
48            frozen_segments_compact_threshold: 2,
49        }
50    }
51}
52
/// Configuration for background index rebuilding.
#[derive(Clone, Debug)]
pub struct IndexRebuildConfig {
    /// Maximum number of retry attempts for failed index builds (default: 3).
    pub max_retries: u32,

    /// Delay between retry attempts (default: 60s).
    pub retry_delay: Duration,

    /// How often to check for pending index rebuild tasks (default: 5s).
    pub worker_check_interval: Duration,

    /// Row growth ratio to trigger rebuild (default: 0.5 = 50%). Set 0.0 to disable.
    pub growth_trigger_ratio: f64,

    /// Max index age before rebuild (default: `None`).
    /// `None` disables the time-based trigger.
    pub max_index_age: Option<Duration>,

    /// Enable post-flush automatic rebuild scheduling (default: false).
    pub auto_rebuild_enabled: bool,
}

75impl Default for IndexRebuildConfig {
76    fn default() -> Self {
77        Self {
78            max_retries: 3,
79            retry_delay: Duration::from_secs(60),
80            worker_check_interval: Duration::from_secs(5),
81            growth_trigger_ratio: 0.5,
82            max_index_age: None,
83            auto_rebuild_enabled: false,
84        }
85    }
86}
87
/// Write throttling thresholds, expressed in uncompacted flush generations.
#[derive(Clone, Copy, Debug)]
pub struct WriteThrottleConfig {
    /// Uncompacted flush generations to start throttling (default: 16).
    pub soft_limit: usize,

    /// Uncompacted flush generations to stop writes entirely (default: 32).
    pub hard_limit: usize,

    /// Base delay when throttling (default: 10ms).
    pub base_delay: Duration,
}

100impl Default for WriteThrottleConfig {
101    fn default() -> Self {
102        Self {
103            soft_limit: 16,
104            hard_limit: 32,
105            base_delay: Duration::from_millis(10),
106        }
107    }
108}
109
/// Timeout and retry settings for object store access.
#[derive(Clone, Debug)]
pub struct ObjectStoreConfig {
    /// Connect timeout (default: 10s).
    pub connect_timeout: Duration,
    /// Read timeout (default: 30s).
    pub read_timeout: Duration,
    /// Write timeout (default: 60s).
    pub write_timeout: Duration,
    /// Maximum number of retries (default: 3).
    pub max_retries: u32,
    /// Base retry backoff (default: 100ms).
    pub retry_backoff_base: Duration,
    /// Maximum retry backoff (default: 10s).
    pub retry_backoff_max: Duration,
}

120impl Default for ObjectStoreConfig {
121    fn default() -> Self {
122        Self {
123            connect_timeout: Duration::from_secs(10),
124            read_timeout: Duration::from_secs(30),
125            write_timeout: Duration::from_secs(60),
126            max_retries: 3,
127            retry_backoff_base: Duration::from_millis(100),
128            retry_backoff_max: Duration::from_secs(10),
129        }
130    }
131}
132
/// Security configuration for file system operations.
/// Controls which paths can be accessed by BACKUP, COPY, and EXPORT commands.
///
/// Disabled by default for backward compatibility in embedded mode.
/// MUST be enabled for server mode with untrusted clients.
#[derive(Clone, Debug, Default)]
pub struct FileSandboxConfig {
    /// If true, file operations are restricted to `allowed_paths`.
    /// If false, all paths are allowed (NOT RECOMMENDED for server mode).
    pub enabled: bool,

    /// List of allowed base directories for file operations.
    /// Paths must be absolute and canonical.
    /// File operations are only allowed within these directories.
    pub allowed_paths: Vec<PathBuf>,
}

/// Deployment mode for the database.
///
/// Used to determine appropriate security defaults.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum DeploymentMode {
    /// Embedded/library mode where the host application controls access.
    /// File sandbox is disabled by default for backward compatibility.
    #[default]
    Embedded,
    /// Server mode with untrusted clients.
    /// File sandbox is enabled by default with restricted paths.
    Server,
}

/// HTTP server configuration.
///
/// Controls CORS, authentication, and other HTTP-related security settings.
///
/// # Security
///
/// **CWE-942 (Overly Permissive CORS)**, **CWE-306 (Missing Authentication)**:
/// Production deployments should configure explicit `allowed_origins` and
/// enable API key authentication.
///
/// The `Debug` implementation prints `api_key` as `<redacted>` so the key
/// does not end up in logs or panic messages (CWE-532).
#[derive(Clone)]
pub struct ServerConfig {
    /// Allowed CORS origins.
    ///
    /// - Empty vector: No CORS headers (most restrictive)
    /// - `["*"]`: Allow all origins (NOT RECOMMENDED for production)
    /// - Explicit list: Only allow specified origins (RECOMMENDED)
    ///
    /// # Security
    ///
    /// **CWE-942**: Using `["*"]` allows any website to make requests to
    /// your server, potentially exposing sensitive data.
    pub allowed_origins: Vec<String>,

    /// Optional API key for request authentication.
    ///
    /// When set, all API requests must include the header:
    /// `X-API-Key: <key>`
    ///
    /// # Security
    ///
    /// **CWE-306**: Without authentication, any client can execute queries.
    /// Enable this for any deployment accessible beyond localhost.
    pub api_key: Option<String>,

    /// Whether to require API key for metrics endpoint.
    ///
    /// Default: false (metrics are public for observability tooling)
    pub require_auth_for_metrics: bool,
}

impl std::fmt::Debug for ServerConfig {
    /// Formats the config like the derived `Debug` would, except that a
    /// configured API key is shown as `Some("<redacted>")`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ServerConfig")
            .field("allowed_origins", &self.allowed_origins)
            .field("api_key", &self.api_key.as_ref().map(|_| "<redacted>"))
            .field("require_auth_for_metrics", &self.require_auth_for_metrics)
            .finish()
    }
}

204impl Default for ServerConfig {
205    fn default() -> Self {
206        Self {
207            // Default to localhost-only origin for development safety
208            allowed_origins: vec!["http://localhost:3000".to_string()],
209            api_key: None,
210            require_auth_for_metrics: false,
211        }
212    }
213}
214
215impl ServerConfig {
216    /// Create a permissive config for local development only.
217    ///
218    /// # Security
219    ///
220    /// **WARNING**: Do not use in production. This config allows all CORS origins
221    /// and has no authentication.
222    #[must_use]
223    pub fn development() -> Self {
224        Self {
225            allowed_origins: vec!["*".to_string()],
226            api_key: None,
227            require_auth_for_metrics: false,
228        }
229    }
230
231    /// Create a production config with explicit origins and required API key.
232    ///
233    /// # Panics
234    ///
235    /// Panics if `api_key` is empty.
236    #[must_use]
237    pub fn production(allowed_origins: Vec<String>, api_key: String) -> Self {
238        assert!(
239            !api_key.is_empty(),
240            "API key must not be empty for production"
241        );
242        Self {
243            allowed_origins,
244            api_key: Some(api_key),
245            require_auth_for_metrics: true,
246        }
247    }
248
249    /// Returns a security warning if the config is insecure.
250    pub fn security_warning(&self) -> Option<&'static str> {
251        let allows_all_origins = self.allowed_origins.iter().any(|o| o == "*");
252        if allows_all_origins && self.api_key.is_none() {
253            Some(
254                "Server config has permissive CORS (allow all origins) and no API key. \
255                 This is insecure for production deployments.",
256            )
257        } else if allows_all_origins {
258            Some(
259                "Server config has permissive CORS (allow all origins). \
260                 Consider restricting to specific origins for production.",
261            )
262        } else if self.api_key.is_none() {
263            Some(
264                "Server config has no API key authentication. \
265                 Enable api_key for production deployments.",
266            )
267        } else {
268            None
269        }
270    }
271}
272
273impl FileSandboxConfig {
274    /// Creates a sandboxed config that only allows operations in the specified directories.
275    pub fn sandboxed(paths: Vec<PathBuf>) -> Self {
276        Self {
277            enabled: true,
278            allowed_paths: paths,
279        }
280    }
281
282    /// Creates a config with appropriate defaults for the deployment mode.
283    ///
284    /// # Security
285    ///
286    /// - **Embedded mode**: Sandbox disabled (host application controls access)
287    /// - **Server mode**: Sandbox enabled with default paths `/var/lib/uni/data` and
288    ///   `/var/lib/uni/backups`
289    ///
290    /// **CWE-22 (Path Traversal)**: Server deployments MUST enable the sandbox to
291    /// prevent arbitrary file read/write via BACKUP, COPY, and EXPORT commands.
292    pub fn default_for_mode(mode: DeploymentMode) -> Self {
293        match mode {
294            DeploymentMode::Embedded => Self {
295                enabled: false,
296                allowed_paths: vec![],
297            },
298            DeploymentMode::Server => Self {
299                enabled: true,
300                allowed_paths: vec![
301                    PathBuf::from("/var/lib/uni/data"),
302                    PathBuf::from("/var/lib/uni/backups"),
303                ],
304            },
305        }
306    }
307
308    /// Returns a security warning message if the sandbox is disabled.
309    ///
310    /// Call this at startup to alert administrators about potential security risks.
311    /// Returns `Some(message)` if a warning should be displayed, `None` otherwise.
312    ///
313    /// # Security
314    ///
315    /// **CWE-22 (Path Traversal)**, **CWE-73 (External Control of File Name)**:
316    /// Disabled sandbox allows unrestricted filesystem access for BACKUP, COPY,
317    /// and EXPORT commands, which can lead to:
318    /// - Arbitrary file read/write in server deployments
319    /// - Data exfiltration to attacker-controlled paths
320    /// - Potential privilege escalation via file overwrites
321    ///
322    /// # Example
323    ///
324    /// ```ignore
325    /// if let Some(warning) = config.file_sandbox.security_warning() {
326    ///     tracing::warn!(target: "uni_db::security", "{}", warning);
327    /// }
328    /// ```
329    pub fn security_warning(&self) -> Option<&'static str> {
330        if !self.enabled {
331            Some(
332                "File sandbox is DISABLED. This allows unrestricted filesystem access \
333                 for BACKUP, COPY, and EXPORT commands. Enable sandbox for server \
334                 deployments: file_sandbox.enabled = true",
335            )
336        } else {
337            None
338        }
339    }
340
341    /// Returns whether the sandbox is in a potentially insecure state.
342    ///
343    /// Returns `true` if the sandbox is disabled or enabled with no allowed paths.
344    pub fn is_potentially_insecure(&self) -> bool {
345        !self.enabled || self.allowed_paths.is_empty()
346    }
347
348    /// Validate that a path is within the allowed sandbox.
349    /// Returns Ok(canonical_path) if allowed, Err if not.
350    pub fn validate_path(&self, path: &str) -> Result<PathBuf, String> {
351        if !self.enabled {
352            // Sandbox disabled - allow all paths
353            return Ok(PathBuf::from(path));
354        }
355
356        if self.allowed_paths.is_empty() {
357            return Err("File sandbox is enabled but no allowed paths configured".to_string());
358        }
359
360        // Resolve the path to canonical form to prevent traversal attacks
361        let input_path = Path::new(path);
362
363        // For paths that don't exist yet (e.g., export destinations), we need to
364        // check their parent directory exists and is within allowed paths
365        let canonical = if input_path.exists() {
366            input_path
367                .canonicalize()
368                .map_err(|e| format!("Failed to canonicalize path: {}", e))?
369        } else {
370            // Path doesn't exist - check parent
371            let parent = input_path
372                .parent()
373                .ok_or_else(|| "Invalid path: no parent directory".to_string())?;
374            if !parent.exists() {
375                return Err(format!(
376                    "Parent directory does not exist: {}",
377                    parent.display()
378                ));
379            }
380            let canonical_parent = parent
381                .canonicalize()
382                .map_err(|e| format!("Failed to canonicalize parent: {}", e))?;
383            // Reconstruct with canonical parent + original filename
384            let filename = input_path
385                .file_name()
386                .ok_or_else(|| "Invalid path: no filename".to_string())?;
387            canonical_parent.join(filename)
388        };
389
390        // Check if the canonical path is within any allowed directory
391        for allowed in &self.allowed_paths {
392            // Ensure allowed path is canonical too
393            let canonical_allowed = if allowed.exists() {
394                allowed.canonicalize().unwrap_or_else(|_| allowed.clone())
395            } else {
396                allowed.clone()
397            };
398
399            if canonical.starts_with(&canonical_allowed) {
400                return Ok(canonical);
401            }
402        }
403
404        Err(format!(
405            "Path '{}' is outside allowed sandbox directories. Allowed: {:?}",
406            path, self.allowed_paths
407        ))
408    }
409}
410
411#[derive(Clone, Debug)]
412pub struct UniConfig {
413    /// Maximum adjacency cache size in bytes (default: 1GB)
414    pub cache_size: usize,
415
416    /// Number of worker threads for parallel execution
417    pub parallelism: usize,
418
419    /// Size of each data morsel/batch (number of rows)
420    pub batch_size: usize,
421
422    /// Maximum size of traversal frontier before pruning
423    pub max_frontier_size: usize,
424
425    /// Auto-flush threshold for L0 buffer (default: 10_000 mutations)
426    pub auto_flush_threshold: usize,
427
428    /// Auto-flush interval for L0 buffer (default: 5 seconds).
429    /// Flush triggers if time elapsed AND mutation count >= auto_flush_min_mutations.
430    /// Set to None to disable time-based flush.
431    pub auto_flush_interval: Option<Duration>,
432
433    /// Minimum mutations required before the time-based flush triggers
434    /// (default: 1).
435    ///
436    /// Prevents unnecessary flushes when activity is minimal. Raising this
437    /// (e.g., to 1000) lets small bursts coalesce into one flush — useful
438    /// for benchmark workloads — but for active databases with high write
439    /// rates, raising it reduces flush frequency and lets the active overlay
440    /// grow larger between flushes, which can hurt read latency. Tune with
441    /// `compaction.frozen_segments_compact_threshold` together. See issue
442    /// #55 for the trade-off discussion.
443    pub auto_flush_min_mutations: usize,
444
445    /// Enable write-ahead logging (default: true)
446    pub wal_enabled: bool,
447
448    /// Compaction configuration
449    pub compaction: CompactionConfig,
450
451    /// Write throttling configuration
452    pub throttle: WriteThrottleConfig,
453
454    /// File sandbox configuration for BACKUP/COPY/EXPORT commands.
455    /// MUST be enabled with allowed paths in server mode to prevent arbitrary file access.
456    pub file_sandbox: FileSandboxConfig,
457
458    /// Default query execution timeout (default: 30s)
459    pub query_timeout: Duration,
460
461    /// Maximum wall time a transaction commit may take before it is aborted with
462    /// `CommitTimeout` (default: 5s). This guards against a commit blocking on the
463    /// writer/flush lock, but it also bounds the commit's own compute time — so
464    /// workloads that commit very large transactions in a single shot (bulk-history
465    /// backfills, or unoptimized debug builds) may need to raise it.
466    pub commit_timeout: Duration,
467
468    /// Default maximum memory per query (default: 1GB)
469    pub max_query_memory: usize,
470
471    /// Maximum transaction buffer memory in bytes (default: 1GB).
472    /// Limits memory usage during transactions to prevent OOM.
473    pub max_transaction_memory: usize,
474
475    /// Maximum rows for in-memory compaction (default: 5M, ~725MB at 145 bytes/row).
476    /// Configurable OOM guard to prevent memory exhaustion during compaction.
477    pub max_compaction_rows: usize,
478
479    /// Maximum iterations for recursive CTE evaluation (default: 1000).
480    pub max_recursive_cte_iterations: usize,
481
482    /// Object store resilience configuration
483    pub object_store: ObjectStoreConfig,
484
485    /// Background index rebuild configuration
486    pub index_rebuild: IndexRebuildConfig,
487
488    /// When true, reject writes that reference labels or edge types not declared
489    /// in the schema. Default: false (schemaless mode — any label or edge type
490    /// is accepted and dynamically registered).
491    pub strict_schema: bool,
492
493    /// Enable Lance `MergeInsert` for SET-only flushes (default: false).
494    ///
495    /// When true, `Writer::insert_vertex_partial` records the touched
496    /// property keys into L0 and the flush emits a partial-column source
497    /// to Lance via `MergeInsertBuilder` — skipping the read of (and write
498    /// of) the unchanged columns. Wide-row schemas with vector indexes
499    /// benefit most (~17 ms/row → ~3 ms/row on the issue #72 ingest
500    /// workload). See the Round-11 plan section in
501    /// `plan-and-implement-a-valiant-flame.md`.
502    pub partial_lance_writes: bool,
503
504    /// When true, auto-embedding for vertex writes is deferred from the
505    /// per-row `insert_vertex_*` path to the next L1 flush, where the
506    /// existing `process_embeddings_for_batch` issues one model call for
507    /// the whole flush batch instead of N per-row calls.
508    ///
509    /// Trade-off: in-tx reads of the embedding column on a freshly
510    /// SET/inserted vertex see the OLD storage value (or no value, for
511    /// new vertices) until flush. Existing behavior is identical to
512    /// today's `process_embeddings_impl(target_prop present)` short-circuit
513    /// (writer.rs:2727) — updating only the source text never refreshes
514    /// the embedding mid-tx, deferred or not. Opt-in for workloads that
515    /// don't read embeddings between write and commit.
516    ///
517    /// Default: `false` (preserves bit-for-bit compatibility with
518    /// pre-Phase-B releases).
519    pub defer_embeddings: bool,
520
521    /// Per-fork L1 fragment-count threshold above which a `tracing::warn!`
522    /// fires once per crossing during fork flush. Long-lived heavy-write
523    /// forks accumulate fragments because fork compaction is deferred to
524    /// Phase 5; this surfaces the risk operationally. Default: 256.
525    pub fork_fragment_warn_threshold: usize,
526
527    /// Per-transaction VID/EID reservoir refill size. Each `Transaction`
528    /// pre-reserves this many IDs at a time from the global `IdAllocator`,
529    /// amortizing its `tokio::Mutex` over `N` allocations. Tradeoff:
530    /// larger = fewer global-mutex acquisitions but more wasted IDs on
531    /// short transactions (capped at `batch_size - 1` per tx). u64 ID space
532    /// makes the waste negligible. Default: 16.
533    pub tx_id_reservoir_batch: usize,
534
535    /// When `true`, `check_flush` on the commit path dispatches via the
536    /// async path (`flush_to_l1_async`): rotate L0 under `flush_lock`,
537    /// then spawn the streaming + finalize work on a background task.
538    /// Concurrent committers no longer queue on the flush's long I/O.
539    ///
540    /// When `false` (default for now), `check_flush` calls the original
541    /// synchronous `flush_to_l1` and holds `flush_lock` across the full
542    /// L1-streaming write. This is the kill-switch.
543    pub async_flush_enabled: bool,
544
545    /// Maximum number of L0→L1 flushes that may be in-flight simultaneously
546    /// when `async_flush_enabled` is true. The (N+1)th rotate blocks until
547    /// one of the in-flight flushes finalizes. Bounds WAL retention and
548    /// memory growth. Default: 2.
549    pub max_pending_flushes: usize,
550
551    /// Maximum wall-clock time an async L0→L1 stream phase may run before the
552    /// flush coordinator converts it into a data-safe flush *failure* (issue
553    /// #132). A stalled sparse/multi-vector Lance read-modify-write would
554    /// otherwise never submit its rotate-seq, wedging the finalizer's
555    /// consecutive-seq pipeline and — via back-pressure permit saturation —
556    /// parking every later commit forever on `flush_lock`. On timeout the old
557    /// L0 is retained in `pending_flush`, WAL data is NOT truncated, the
558    /// permit is released, and `expected` advances; recovery is via WAL replay
559    /// / a later retry. Only meaningful when `async_flush_enabled` is true.
560    /// Default: 60s. Override with `UNI_FLUSH_STREAM_TIMEOUT` (seconds).
561    pub flush_stream_timeout: Duration,
562
563    /// Maximum time `drop_fork` will wait for pending async flushes on
564    /// that fork before failing with `PendingFlushTimeout`. Only meaningful
565    /// when `async_flush_enabled` is true. Default: 10s.
566    pub drop_fork_drain_timeout: Duration,
567
568    /// Phase 4a: cap on total fork count (Active + Pending + Tombstoned).
569    /// `None` = unbounded. When set, `Session::fork(name).await` errors
570    /// with `UniError::ForkBudgetExceeded` once the cap is reached.
571    /// Tombstoned forks count because they still hold branch state on
572    /// disk until recovery completes; counting them prevents churn-thrash.
573    ///
574    /// **Production guidance (L11):** the default is `None` (unbounded) to
575    /// avoid surprising existing embedders, but each fork's branches scale
576    /// with schema size and persist until dropped, so unbounded fork churn
577    /// is an on-disk growth risk. Production deployments that create forks
578    /// from untrusted/automated callers SHOULD set an explicit `max_forks`
579    /// (and ideally `fork_default_ttl`).
580    pub max_forks: Option<usize>,
581
582    /// Phase 4a: default TTL applied to forks when the user does not
583    /// supply one via `session.fork(name).ttl(...)`. `None` = no TTL.
584    /// The background sweeper drops forks whose `ttl_expires_at` is in
585    /// the past via `drop_fork_cascade`.
586    pub fork_default_ttl: Option<Duration>,
587
588    /// Phase 4a: how often the background TTL sweeper polls the
589    /// registry for expired forks. Default: 60 seconds.
590    pub fork_sweeper_interval: Duration,
591
592    /// Phase 4a: skip spawning the TTL sweeper. Tests should set this
593    /// to `true` when they want deterministic control over fork
594    /// lifetimes; production should leave it `false`.
595    pub disable_fork_sweeper: bool,
596
597    /// Phase 5a: minimum per-fork row count (per label) before the
598    /// background `IndexRebuildManager` schedules a fork-local index
599    /// build. Below this threshold, fork reads inherit primary's
600    /// indexes through Lance `base_paths`; above it, the planner
601    /// switches to `FusedIndexScan` once the build completes. Default
602    /// 10,000 rows per spec §8.
603    pub fork_index_build_threshold: u64,
604
605    /// Phase 5a-impl Step 7: how often the background fork index
606    /// builder polls active forks for build candidates. Default
607    /// 30 seconds.
608    pub fork_index_builder_interval: Duration,
609
610    /// Phase 5a-impl Step 7: skip spawning the background fork index
611    /// builder. Tests that exercise the manual `Session::build_fork_local_index`
612    /// trigger should set this to `true` so timing isn't dependent on
613    /// the polling cadence.
614    pub disable_fork_index_builder: bool,
615
616    /// Enable Serializable Snapshot Isolation and optimistic concurrency
617    /// control (default: `true`).
618    ///
619    /// When `true`, read-write transactions read from a pinned L0 snapshot,
620    /// track an item-level read/write-set, and validate at commit under
621    /// `flush_lock`: a write-write or read-write conflict against a commit
622    /// landed since the transaction's snapshot aborts with
623    /// `UniError::SerializationConflict`, a duplicate concurrent `MERGE` on a
624    /// unique key aborts with `UniError::ConstraintConflict`, and `FOR UPDATE`
625    /// acquires per-key row locks. Callers should wrap contended writes in
626    /// `Session::transact_with_retry`, which re-runs retriable conflicts.
627    ///
628    /// When `false`, the engine reverts to last-writer-wins: concurrent
629    /// read-modify-write transactions can silently lose updates, concurrent
630    /// `MERGE` can create duplicate unique keys, and `FOR UPDATE` is a no-op
631    /// (a `tracing::warn!` is emitted when a query requests it). Reads run
632    /// against the live L0 with no snapshot pinning. This reproduces the
633    /// pre-SSI behavior bit-for-bit and skips the (near-zero, but non-nil)
634    /// read-set/validation overhead — appropriate only for single-writer
635    /// workloads or callers that guard read-modify-write externally.
636    ///
637    /// Defaults to `true` because silent lost updates are a correctness hazard
638    /// for any concurrent-writer workload.
639    pub ssi_enabled: bool,
640}
641
642impl Default for UniConfig {
643    fn default() -> Self {
644        let parallelism = thread::available_parallelism().map_or(4, |n| n.get());
645
646        Self {
647            cache_size: 1024 * 1024 * 1024, // 1GB
648            parallelism,
649            batch_size: 1024, // Default morsel size
650            max_frontier_size: 1_000_000,
651            auto_flush_threshold: 10_000,
652            auto_flush_interval: Some(Duration::from_secs(5)),
653            auto_flush_min_mutations: 1,
654            wal_enabled: true,
655            compaction: CompactionConfig::default(),
656            throttle: WriteThrottleConfig::default(),
657            file_sandbox: FileSandboxConfig::default(),
658            query_timeout: Duration::from_secs(30),
659            commit_timeout: Duration::from_secs(5),
660            max_query_memory: 1024 * 1024 * 1024,       // 1GB
661            max_transaction_memory: 1024 * 1024 * 1024, // 1GB
662            max_compaction_rows: 5_000_000,             // 5M rows
663            max_recursive_cte_iterations: 1000,
664            object_store: ObjectStoreConfig::default(),
665            index_rebuild: IndexRebuildConfig::default(),
666            strict_schema: false,
667            partial_lance_writes: false,
668            defer_embeddings: false,
669            fork_fragment_warn_threshold: 256,
670            tx_id_reservoir_batch: 16,
671            // Default ON as of the Item-B-deep-fix landing (per-table
672            // write serialization + Lance Table cache removed + drain
673            // in flush_to_l1). Validated by full UNI_ASYNC_FLUSH=1
674            // cross-crate nextest: 1754/1754 pass.
675            //
676            // `UNI_ASYNC_FLUSH=0` / `=false` / `=no` (case-insensitive)
677            // explicitly DISABLES async flush — useful for bisecting
678            // suspected async-flush regressions and for the sync-only
679            // benchmarks in `flush_pressure.rs`. Unset = default
680            // behavior (true).
681            async_flush_enabled: std::env::var("UNI_ASYNC_FLUSH").map_or(true, |v| {
682                let v = v.to_ascii_lowercase();
683                !(v == "0" || v == "false" || v == "no")
684            }),
685            max_pending_flushes: 2,
686            // Bound a stalled async flush stream so a lost-wakeup in the
687            // sparse/multivec Lance RMW can't permanently wedge the pipeline
688            // (issue #132). 60s ≈ 200× a healthy flush, so it never kills a
689            // legitimately-slow flush, yet recovers a true stall in a minute.
690            flush_stream_timeout: std::env::var("UNI_FLUSH_STREAM_TIMEOUT")
691                .ok()
692                .and_then(|v| v.parse::<u64>().ok())
693                .map(Duration::from_secs)
694                .unwrap_or(Duration::from_secs(60)),
695            drop_fork_drain_timeout: Duration::from_secs(10),
696            max_forks: None,
697            fork_default_ttl: None,
698            fork_sweeper_interval: Duration::from_secs(60),
699            disable_fork_sweeper: false,
700            fork_index_build_threshold: 10_000,
701            fork_index_builder_interval: Duration::from_secs(30),
702            disable_fork_index_builder: false,
703            // Correctness-first default: SSI/OCC on. See the field docs for
704            // the behavioral contract and the migration note (concurrent
705            // writers now observe aborts instead of silent lost updates;
706            // wrap them in `Session::transact_with_retry`).
707            ssi_enabled: true,
708        }
709    }
710}
711
/// Cloud storage backend configuration.
///
/// Supports Amazon S3, Google Cloud Storage, and Azure Blob Storage.
/// Each variant contains the credentials and connection parameters for
/// its respective cloud provider.
///
/// The `Debug` implementation prints secret values (`secret_access_key`,
/// `session_token`, `service_account_key`, `access_key`, `sas_token`) as
/// `<redacted>` so credentials do not end up in logs or panic messages
/// (CWE-532).
///
/// # Examples
///
/// ```ignore
/// // Create S3 configuration from environment variables
/// let config = CloudStorageConfig::s3_from_env("my-bucket");
///
/// // Create explicit S3 configuration for LocalStack testing
/// let config = CloudStorageConfig::S3 {
///     bucket: "test-bucket".to_string(),
///     region: Some("us-east-1".to_string()),
///     endpoint: Some("http://localhost:4566".to_string()),
///     access_key_id: Some("test".to_string()),
///     secret_access_key: Some("test".to_string()),
///     session_token: None,
///     virtual_hosted_style: false,
/// };
/// ```
#[derive(Clone)]
pub enum CloudStorageConfig {
    /// Amazon S3 storage configuration.
    S3 {
        /// S3 bucket name.
        bucket: String,
        /// AWS region (e.g., "us-east-1"). Uses AWS_REGION env var if None.
        region: Option<String>,
        /// Custom endpoint URL for S3-compatible services (MinIO, LocalStack).
        endpoint: Option<String>,
        /// AWS access key ID. Uses AWS_ACCESS_KEY_ID env var if None.
        access_key_id: Option<String>,
        /// AWS secret access key. Uses AWS_SECRET_ACCESS_KEY env var if None.
        secret_access_key: Option<String>,
        /// AWS session token for temporary credentials.
        session_token: Option<String>,
        /// Use virtual-hosted-style requests (bucket.s3.region.amazonaws.com).
        virtual_hosted_style: bool,
    },
    /// Google Cloud Storage configuration.
    Gcs {
        /// GCS bucket name.
        bucket: String,
        /// Path to service account JSON key file.
        service_account_path: Option<String>,
        /// Service account JSON key content (alternative to path).
        service_account_key: Option<String>,
    },
    /// Azure Blob Storage configuration.
    Azure {
        /// Azure container name.
        container: String,
        /// Azure storage account name.
        account: String,
        /// Azure storage account access key.
        access_key: Option<String>,
        /// Azure SAS token for limited access.
        sas_token: Option<String>,
    },
}

impl std::fmt::Debug for CloudStorageConfig {
    /// Formats the variant like the derived `Debug` would, except that
    /// secret values are shown as `Some("<redacted>")` when set.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        /// Keeps the `Some`/`None` shape of a secret but hides its value.
        fn redact(secret: &Option<String>) -> Option<&'static str> {
            secret.as_ref().map(|_| "<redacted>")
        }

        match self {
            Self::S3 {
                bucket,
                region,
                endpoint,
                access_key_id,
                secret_access_key,
                session_token,
                virtual_hosted_style,
            } => f
                .debug_struct("S3")
                .field("bucket", bucket)
                .field("region", region)
                .field("endpoint", endpoint)
                .field("access_key_id", access_key_id)
                .field("secret_access_key", &redact(secret_access_key))
                .field("session_token", &redact(session_token))
                .field("virtual_hosted_style", virtual_hosted_style)
                .finish(),
            Self::Gcs {
                bucket,
                service_account_path,
                service_account_key,
            } => f
                .debug_struct("Gcs")
                .field("bucket", bucket)
                .field("service_account_path", service_account_path)
                .field("service_account_key", &redact(service_account_key))
                .finish(),
            Self::Azure {
                container,
                account,
                access_key,
                sas_token,
            } => f
                .debug_struct("Azure")
                .field("container", container)
                .field("account", account)
                .field("access_key", &redact(access_key))
                .field("sas_token", &redact(sas_token))
                .finish(),
        }
    }
}

776impl CloudStorageConfig {
777    /// Creates an S3 configuration using environment variables.
778    ///
779    /// Reads credentials from standard AWS environment variables:
780    /// - `AWS_ACCESS_KEY_ID`
781    /// - `AWS_SECRET_ACCESS_KEY`
782    /// - `AWS_SESSION_TOKEN` (optional)
783    /// - `AWS_REGION` or `AWS_DEFAULT_REGION`
784    /// - `AWS_ENDPOINT_URL` (optional, for S3-compatible services)
785    #[must_use]
786    pub fn s3_from_env(bucket: &str) -> Self {
787        Self::S3 {
788            bucket: bucket.to_string(),
789            region: std::env::var("AWS_REGION")
790                .or_else(|_| std::env::var("AWS_DEFAULT_REGION"))
791                .ok(),
792            endpoint: std::env::var("AWS_ENDPOINT_URL").ok(),
793            access_key_id: std::env::var("AWS_ACCESS_KEY_ID").ok(),
794            secret_access_key: std::env::var("AWS_SECRET_ACCESS_KEY").ok(),
795            session_token: std::env::var("AWS_SESSION_TOKEN").ok(),
796            virtual_hosted_style: false,
797        }
798    }
799
800    /// Creates a GCS configuration using environment variables.
801    ///
802    /// Reads service account path from `GOOGLE_APPLICATION_CREDENTIALS`.
803    #[must_use]
804    pub fn gcs_from_env(bucket: &str) -> Self {
805        Self::Gcs {
806            bucket: bucket.to_string(),
807            service_account_path: std::env::var("GOOGLE_APPLICATION_CREDENTIALS").ok(),
808            service_account_key: None,
809        }
810    }
811
812    /// Creates an Azure configuration using environment variables.
813    ///
814    /// Reads credentials from Azure environment variables:
815    /// - `AZURE_STORAGE_ACCOUNT`
816    /// - `AZURE_STORAGE_ACCESS_KEY` (optional)
817    /// - `AZURE_STORAGE_SAS_TOKEN` (optional)
818    ///
819    /// # Panics
820    ///
821    /// Panics if `AZURE_STORAGE_ACCOUNT` is not set.
822    #[must_use]
823    pub fn azure_from_env(container: &str) -> Self {
824        Self::Azure {
825            container: container.to_string(),
826            account: std::env::var("AZURE_STORAGE_ACCOUNT")
827                .expect("AZURE_STORAGE_ACCOUNT environment variable required"),
828            access_key: std::env::var("AZURE_STORAGE_ACCESS_KEY").ok(),
829            sas_token: std::env::var("AZURE_STORAGE_SAS_TOKEN").ok(),
830        }
831    }
832
833    /// Returns the bucket/container name for this configuration.
834    #[must_use]
835    pub fn bucket_name(&self) -> &str {
836        match self {
837            Self::S3 { bucket, .. } => bucket,
838            Self::Gcs { bucket, .. } => bucket,
839            Self::Azure { container, .. } => container,
840        }
841    }
842
843    /// Returns a URL-style identifier for this storage location.
844    #[must_use]
845    pub fn to_url(&self) -> String {
846        match self {
847            Self::S3 { bucket, .. } => format!("s3://{bucket}"),
848            Self::Gcs { bucket, .. } => format!("gs://{bucket}"),
849            Self::Azure {
850                container, account, ..
851            } => format!("az://{account}/{container}"),
852        }
853    }
854}
855
#[cfg(test)]
mod security_tests {
    use super::*;

    /// Checks for the file sandbox, the guard against CWE-22 (Path Traversal).
    mod file_sandbox {
        use super::*;

        /// Builds a sandbox that is switched on and limited to `roots`,
        /// using a plain struct literal.
        fn enabled_sandbox(roots: Vec<PathBuf>) -> FileSandboxConfig {
            FileSandboxConfig {
                enabled: true,
                allowed_paths: roots,
            }
        }

        #[test]
        fn test_sandbox_disabled_allows_all_paths() {
            let sandbox = FileSandboxConfig::default();
            // The default configuration has the sandbox switched off ...
            assert!(!sandbox.enabled);
            // ... and in that state validation accepts the path.
            let verdict = sandbox.validate_path("/tmp/test");
            assert!(verdict.is_ok());
        }

        #[test]
        fn test_sandbox_enabled_with_no_paths_rejects() {
            let sandbox = enabled_sandbox(Vec::new());
            let verdict = sandbox.validate_path("/tmp/test");
            // Must be an error, and the error must name the missing allow-list.
            assert!(matches!(
                verdict,
                Err(ref reason) if reason.contains("no allowed paths configured")
            ));
        }

        #[test]
        fn test_sandbox_rejects_outside_path() {
            let sandbox = enabled_sandbox(vec![PathBuf::from("/var/lib/uni")]);
            let verdict = sandbox.validate_path("/etc/passwd");
            // Must be an error, and the error must say the path is outside.
            assert!(matches!(
                verdict,
                Err(ref reason) if reason.contains("outside allowed sandbox")
            ));
        }

        #[test]
        fn test_is_potentially_insecure() {
            // Switched off: flagged as insecure.
            let switched_off = FileSandboxConfig::default();
            assert!(switched_off.is_potentially_insecure());

            // Switched on but with an empty allow-list: also flagged.
            let empty_allow_list = enabled_sandbox(Vec::new());
            assert!(empty_allow_list.is_potentially_insecure());

            // Switched on with at least one allowed path: not flagged.
            let restricted = FileSandboxConfig::sandboxed(vec![PathBuf::from("/data")]);
            assert!(!restricted.is_potentially_insecure());
        }

        #[test]
        fn test_security_warning_when_disabled() {
            // The default (disabled) sandbox yields a warning.
            let switched_off = FileSandboxConfig::default();
            let warning = switched_off.security_warning();
            assert!(warning.is_some());

            // A sandbox created through `sandboxed` yields none.
            let restricted = FileSandboxConfig::sandboxed(vec![PathBuf::from("/data")]);
            let warning = restricted.security_warning();
            assert!(warning.is_none());
        }

        #[test]
        fn test_deployment_mode_defaults() {
            // Embedded deployments start with the sandbox off.
            let for_embedded = FileSandboxConfig::default_for_mode(DeploymentMode::Embedded);
            assert!(!for_embedded.enabled);

            // Server deployments start with it on and a non-empty allow-list.
            let for_server = FileSandboxConfig::default_for_mode(DeploymentMode::Server);
            assert!(for_server.enabled);
            let has_allowed_paths = !for_server.allowed_paths.is_empty();
            assert!(has_allowed_paths);
        }
    }
}
927            assert!(server.enabled);
928            assert!(!server.allowed_paths.is_empty());
929        }
930    }
931}