Skip to main content

magic_bird/
config.rs

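//! Configuration for BIRD.
//!
//! BIRD_ROOT resolution order:
//! 1. Explicit path passed to `Config::with_root()` or `Config::load_from()`
//! 2. `BIRD_ROOT` environment variable
//! 3. Default: the per-user data directory reported by the `directories` crate
//!    (`~/.local/share/bird` on Linux), falling back to `$HOME/.local/share/bird`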
7
8use std::path::{Path, PathBuf};
9use std::str::FromStr;
10
11use directories::ProjectDirs;
12use serde::{Deserialize, Serialize};
13
14use crate::{Error, Result};
15
16/// Storage mode for BIRD data.
17///
18/// - **Parquet**: Multi-writer safe using atomic file creation. Suitable for
19///   concurrent shell hooks (shq). Requires periodic compaction.
20/// - **DuckDB**: Single-writer using direct table inserts. Simpler but requires
21///   serialized writes. Suitable for sequential CLI tools (blq).
22#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
23#[serde(rename_all = "lowercase")]
24pub enum StorageMode {
25    /// Write to Parquet files (multi-writer safe, requires compaction)
26    #[default]
27    Parquet,
28    /// Write directly to DuckDB tables (single-writer, no compaction needed)
29    DuckDB,
30}
31
32impl std::fmt::Display for StorageMode {
33    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
34        match self {
35            StorageMode::Parquet => write!(f, "parquet"),
36            StorageMode::DuckDB => write!(f, "duckdb"),
37        }
38    }
39}
40
41impl FromStr for StorageMode {
42    type Err = Error;
43
44    fn from_str(s: &str) -> Result<Self> {
45        match s.to_lowercase().as_str() {
46            "parquet" => Ok(StorageMode::Parquet),
47            "duckdb" => Ok(StorageMode::DuckDB),
48            _ => Err(Error::Config(format!(
49                "Invalid storage mode '{}': expected 'parquet' or 'duckdb'",
50                s
51            ))),
52        }
53    }
54}
55
56/// Type of remote storage.
57#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
58#[serde(rename_all = "lowercase")]
59pub enum RemoteType {
60    /// S3-compatible object storage (s3://, gs://)
61    S3,
62    /// MotherDuck cloud database (md:)
63    MotherDuck,
64    /// PostgreSQL database
65    Postgres,
66    /// Local or network file path
67    File,
68}
69
70impl std::fmt::Display for RemoteType {
71    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72        match self {
73            RemoteType::S3 => write!(f, "s3"),
74            RemoteType::MotherDuck => write!(f, "motherduck"),
75            RemoteType::Postgres => write!(f, "postgres"),
76            RemoteType::File => write!(f, "file"),
77        }
78    }
79}
80
81impl FromStr for RemoteType {
82    type Err = Error;
83
84    fn from_str(s: &str) -> Result<Self> {
85        match s.to_lowercase().as_str() {
86            "s3" | "gcs" => Ok(RemoteType::S3),
87            "motherduck" | "md" => Ok(RemoteType::MotherDuck),
88            "postgres" | "postgresql" | "pg" => Ok(RemoteType::Postgres),
89            "file" | "local" => Ok(RemoteType::File),
90            _ => Err(Error::Config(format!(
91                "Invalid remote type '{}': expected 's3', 'motherduck', 'postgres', or 'file'",
92                s
93            ))),
94        }
95    }
96}
97
98/// Access mode for remote storage.
99#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
100#[serde(rename_all = "snake_case")]
101pub enum RemoteMode {
102    /// Read and write access
103    #[default]
104    ReadWrite,
105    /// Read-only access
106    ReadOnly,
107}
108
109impl std::fmt::Display for RemoteMode {
110    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
111        match self {
112            RemoteMode::ReadWrite => write!(f, "read_write"),
113            RemoteMode::ReadOnly => write!(f, "read_only"),
114        }
115    }
116}
117
118/// Configuration for a remote storage location.
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct RemoteConfig {
121    /// Remote name (used as schema name: remote_{name})
122    pub name: String,
123
124    /// Type of remote storage
125    #[serde(rename = "type")]
126    pub remote_type: RemoteType,
127
128    /// URI for the remote (e.g., s3://bucket/path/bird.duckdb, md:database_name)
129    pub uri: String,
130
131    /// Access mode (read_write or read_only)
132    #[serde(default)]
133    pub mode: RemoteMode,
134
135    /// Credential provider for S3 (e.g., "credential_chain", "config")
136    #[serde(default)]
137    pub credential_provider: Option<String>,
138
139    /// Whether to auto-attach on connection open
140    #[serde(default = "default_true")]
141    pub auto_attach: bool,
142}
143
/// Serde default helper for boolean fields that are enabled unless configured otherwise.
fn default_true() -> bool {
    true
}
147
148impl RemoteConfig {
149    /// Create a new remote config.
150    pub fn new(name: impl Into<String>, remote_type: RemoteType, uri: impl Into<String>) -> Self {
151        Self {
152            name: name.into(),
153            remote_type,
154            uri: uri.into(),
155            mode: RemoteMode::default(),
156            credential_provider: None,
157            auto_attach: true,
158        }
159    }
160
161    /// Set read-only mode.
162    pub fn read_only(mut self) -> Self {
163        self.mode = RemoteMode::ReadOnly;
164        self
165    }
166
167    /// Get the DuckDB schema name for this remote.
168    pub fn schema_name(&self) -> String {
169        format!("remote_{}", self.name)
170    }
171
172    /// Get the quoted DuckDB schema name for this remote (for use in SQL).
173    pub fn quoted_schema_name(&self) -> String {
174        format!("\"remote_{}\"", self.name)
175    }
176
177    /// Generate the ATTACH SQL statement for this remote.
178    pub fn attach_sql(&self) -> String {
179        let mode_clause = match self.mode {
180            RemoteMode::ReadOnly => " (READ_ONLY)",
181            RemoteMode::ReadWrite => "",
182        };
183
184        let type_clause = match self.remote_type {
185            RemoteType::Postgres => " (TYPE postgres)",
186            _ => "",
187        };
188
189        format!(
190            "ATTACH '{}' AS {}{}{}",
191            self.uri,
192            self.quoted_schema_name(),
193            type_clause,
194            mode_clause
195        )
196    }
197
198    /// Get the base URL for blob storage (for S3/GCS remotes).
199    pub fn blob_base_url(&self) -> Option<String> {
200        match self.remote_type {
201            RemoteType::S3 => {
202                // Extract bucket/prefix from URI, append /blobs
203                // e.g., s3://bucket/path/bird.duckdb -> s3://bucket/path/blobs
204                if let Some(stripped) = self.uri.strip_suffix(".duckdb") {
205                    Some(format!("{}/blobs", stripped))
206                } else {
207                    Some(format!("{}/blobs", self.uri.trim_end_matches('/')))
208                }
209            }
210            _ => None,
211        }
212    }
213
214    /// Get the data directory for file remotes.
215    ///
216    /// For a remote URI like `file:///path/to/db/bird.duckdb`, returns `/path/to/db/data`.
217    /// This is needed so parquet-mode remotes can resolve their relative file paths.
218    pub fn data_dir(&self) -> Option<std::path::PathBuf> {
219        if self.remote_type != RemoteType::File {
220            return None;
221        }
222
223        // Parse file:// URI to get the database path
224        let db_path = self.uri.strip_prefix("file://")?;
225        let db_path = std::path::Path::new(db_path);
226
227        // Data directory is sibling to the .duckdb file: /path/to/db/bird.duckdb -> /path/to/db/data
228        db_path.parent().map(|p| p.join("data"))
229    }
230}
231
232/// Sync configuration for push/pull operations.
233#[derive(Debug, Clone, Default, Serialize, Deserialize)]
234pub struct SyncConfig {
235    /// Default remote for push/pull operations.
236    #[serde(default)]
237    pub default_remote: Option<String>,
238
239    /// Push data after compact operations.
240    #[serde(default)]
241    pub push_on_compact: bool,
242
243    /// Push data before archive operations.
244    #[serde(default)]
245    pub push_on_archive: bool,
246
247    /// Sync invocations table.
248    #[serde(default = "default_true")]
249    pub sync_invocations: bool,
250
251    /// Sync outputs table.
252    #[serde(default = "default_true")]
253    pub sync_outputs: bool,
254
255    /// Sync events table.
256    #[serde(default = "default_true")]
257    pub sync_events: bool,
258
259    /// Sync blob content files.
260    #[serde(default)]
261    pub sync_blobs: bool,
262
263    /// Minimum blob size to sync (bytes). Smaller blobs stay inline.
264    #[serde(default = "default_blob_sync_min")]
265    pub blob_sync_min_bytes: usize,
266}
267
/// Serde default for `blob_sync_min_bytes`: 1 KB.
fn default_blob_sync_min() -> usize {
    1024
}
271
272/// Shell hook configuration.
273#[derive(Debug, Clone, Serialize, Deserialize, Default)]
274pub struct HooksConfig {
275    /// Command patterns to ignore (not record).
276    /// Uses glob-style matching. Defaults include shq/blq commands and job control.
277    #[serde(default = "default_ignore_patterns")]
278    pub ignore_patterns: Vec<String>,
279}
280
/// Built-in list of command patterns that the shell hooks do not record.
fn default_ignore_patterns() -> Vec<String> {
    const PATTERNS: &[&str] = &[
        // shq/blq commands (they handle their own recording or are queries)
        "shq *",
        "shqr *",
        "blq *",
        // % aliases (expand to shq commands)
        "%*",
        // Job control (noise, can cause issues)
        "fg",
        "fg *",
        "bg",
        "bg *",
        "jobs",
        "jobs *",
        // Shell session commands
        "exit",
        "logout",
        // Utility commands (noise)
        "clear",
        "history",
        "history *",
    ];

    PATTERNS.iter().map(|pattern| pattern.to_string()).collect()
}
305
306/// BIRD configuration.
307#[derive(Debug, Clone, Serialize, Deserialize)]
308pub struct Config {
309    /// Root directory for all BIRD data.
310    pub bird_root: PathBuf,
311
312    /// Client identifier for this machine.
313    #[serde(default = "default_client_id")]
314    pub client_id: String,
315
316    /// Days to keep data in hot tier before archiving.
317    #[serde(default = "default_hot_days")]
318    pub hot_days: u32,
319
320    /// Threshold in bytes for inline vs blob storage.
321    #[serde(default = "default_inline_threshold")]
322    pub inline_threshold: usize,
323
324    /// Automatically extract events after `shq run` commands.
325    #[serde(default)]
326    pub auto_extract: bool,
327
328    /// Storage mode for writing data.
329    /// - parquet: Multi-writer safe, requires compaction (default)
330    /// - duckdb: Single-writer, no compaction needed
331    #[serde(default)]
332    pub storage_mode: StorageMode,
333
334    /// Remote storage configurations.
335    #[serde(default)]
336    pub remotes: Vec<RemoteConfig>,
337
338    /// Sync configuration for push/pull operations.
339    #[serde(default)]
340    pub sync: SyncConfig,
341
342    /// Shell hook configuration.
343    #[serde(default)]
344    pub hooks: HooksConfig,
345}
346
347fn default_client_id() -> String {
348    // Deterministic: username@hostname
349    let username = std::env::var("USER")
350        .or_else(|_| std::env::var("USERNAME"))
351        .unwrap_or_else(|_| "unknown".to_string());
352    let hostname = gethostname::gethostname()
353        .to_string_lossy()
354        .to_string();
355    format!("{}@{}", username, hostname)
356}
357
/// Serde default for `hot_days`: two weeks.
fn default_hot_days() -> u32 {
    14
}
361
/// Serde default for `inline_threshold`: 4 KB, kept small so blob storage is
/// easy to exercise in tests.
fn default_inline_threshold() -> usize {
    4 * 1024
}
365
366impl Config {
367    /// Create a new config with the given BIRD_ROOT.
368    pub fn with_root(bird_root: impl Into<PathBuf>) -> Self {
369        Self {
370            bird_root: bird_root.into(),
371            client_id: default_client_id(),
372            hot_days: default_hot_days(),
373            inline_threshold: default_inline_threshold(),
374            auto_extract: false,
375            storage_mode: StorageMode::default(),
376            remotes: Vec::new(),
377            sync: SyncConfig::default(),
378            hooks: HooksConfig::default(),
379        }
380    }
381
382    /// Create a new config with DuckDB storage mode.
383    pub fn with_duckdb_mode(bird_root: impl Into<PathBuf>) -> Self {
384        Self {
385            bird_root: bird_root.into(),
386            client_id: default_client_id(),
387            hot_days: default_hot_days(),
388            inline_threshold: default_inline_threshold(),
389            auto_extract: false,
390            storage_mode: StorageMode::DuckDB,
391            remotes: Vec::new(),
392            sync: SyncConfig::default(),
393            hooks: HooksConfig::default(),
394        }
395    }
396
397    /// Create a config using default BIRD_ROOT resolution.
398    pub fn default_location() -> Result<Self> {
399        let bird_root = resolve_bird_root()?;
400        Ok(Self::with_root(bird_root))
401    }
402
403    /// Load config from BIRD_ROOT/config.toml, or create default.
404    pub fn load() -> Result<Self> {
405        let bird_root = resolve_bird_root()?;
406        Self::load_from(&bird_root)
407    }
408
409    /// Load config from a specific BIRD_ROOT.
410    pub fn load_from(bird_root: &Path) -> Result<Self> {
411        let config_path = bird_root.join("config.toml");
412
413        if config_path.exists() {
414            let contents = std::fs::read_to_string(&config_path)?;
415            let mut config: Config = toml::from_str(&contents)
416                .map_err(|e| Error::Config(format!("Failed to parse config: {}", e)))?;
417            // Ensure bird_root matches the actual location
418            config.bird_root = bird_root.to_path_buf();
419            Ok(config)
420        } else {
421            Ok(Self::with_root(bird_root))
422        }
423    }
424
425    /// Save config to BIRD_ROOT/config.toml.
426    pub fn save(&self) -> Result<()> {
427        let config_path = self.bird_root.join("config.toml");
428        let contents = toml::to_string_pretty(self)
429            .map_err(|e| Error::Config(format!("Failed to serialize config: {}", e)))?;
430        std::fs::write(config_path, contents)?;
431        Ok(())
432    }
433
434    // Path helpers
435
436    /// Path to the DuckDB database file.
437    pub fn db_path(&self) -> PathBuf {
438        self.bird_root.join("db/bird.duckdb")
439    }
440
441    /// Path to the data directory.
442    pub fn data_dir(&self) -> PathBuf {
443        self.bird_root.join("db/data")
444    }
445
446    /// Path to the recent (hot) data directory.
447    pub fn recent_dir(&self) -> PathBuf {
448        self.data_dir().join("recent")
449    }
450
451    /// Path to the archive (cold) data directory.
452    pub fn archive_dir(&self) -> PathBuf {
453        self.data_dir().join("archive")
454    }
455
456    /// Path to invocations parquet files for a given date and status.
457    ///
458    /// Status partitioning: `recent/invocations/status=<status>/date=YYYY-MM-DD/`
459    pub fn invocations_dir_with_status(&self, status: &str, date: &chrono::NaiveDate) -> PathBuf {
460        self.recent_dir()
461            .join("invocations")
462            .join(format!("status={}", status))
463            .join(format!("date={}", date))
464    }
465
466    /// Path to invocations parquet files for a given date (defaults to "completed" status).
467    ///
468    /// For backwards compatibility - use `invocations_dir_with_status` for explicit status.
469    pub fn invocations_dir(&self, date: &chrono::NaiveDate) -> PathBuf {
470        self.invocations_dir_with_status("completed", date)
471    }
472
473    /// Path to the pending invocations directory (JSON files for crash recovery).
474    pub fn pending_dir(&self) -> PathBuf {
475        self.bird_root.join("db/pending")
476    }
477
478    /// Path to outputs parquet files for a given date.
479    pub fn outputs_dir(&self, date: &chrono::NaiveDate) -> PathBuf {
480        self.recent_dir()
481            .join("outputs")
482            .join(format!("date={}", date))
483    }
484
485    /// Path to sessions parquet files for a given date.
486    pub fn sessions_dir(&self, date: &chrono::NaiveDate) -> PathBuf {
487        self.recent_dir()
488            .join("sessions")
489            .join(format!("date={}", date))
490    }
491
492    /// Path to the SQL files directory.
493    pub fn sql_dir(&self) -> PathBuf {
494        self.bird_root.join("db/sql")
495    }
496
497    /// Path to the DuckDB extensions directory.
498    pub fn extensions_dir(&self) -> PathBuf {
499        self.bird_root.join("db/extensions")
500    }
501
502    /// Path to the blobs content directory.
503    pub fn blobs_dir(&self) -> PathBuf {
504        self.recent_dir().join("blobs/content")
505    }
506
507    /// Path to a specific blob file by hash and command.
508    pub fn blob_path(&self, hash: &str, cmd_hint: &str) -> PathBuf {
509        let prefix = &hash[..2.min(hash.len())];
510        let sanitized_cmd = sanitize_for_filename(cmd_hint);
511        self.blobs_dir()
512            .join(prefix)
513            .join(format!("{}--{}.bin", hash, sanitized_cmd))
514    }
515
516    /// Path to the event-formats.toml config file (legacy).
517    pub fn event_formats_path(&self) -> PathBuf {
518        self.bird_root.join("event-formats.toml")
519    }
520
521    /// Path to the format-hints.toml config file.
522    pub fn format_hints_path(&self) -> PathBuf {
523        self.bird_root.join("format-hints.toml")
524    }
525
526    /// Path to events parquet files for a given date.
527    pub fn events_dir(&self, date: &chrono::NaiveDate) -> PathBuf {
528        self.recent_dir()
529            .join("events")
530            .join(format!("date={}", date))
531    }
532
533    // Remote management helpers
534
535    /// Get a remote by name.
536    pub fn get_remote(&self, name: &str) -> Option<&RemoteConfig> {
537        self.remotes.iter().find(|r| r.name == name)
538    }
539
540    /// Add a remote configuration.
541    pub fn add_remote(&mut self, remote: RemoteConfig) {
542        // Remove existing remote with same name
543        self.remotes.retain(|r| r.name != remote.name);
544        self.remotes.push(remote);
545    }
546
547    /// Remove a remote by name. Returns true if removed.
548    pub fn remove_remote(&mut self, name: &str) -> bool {
549        let len_before = self.remotes.len();
550        self.remotes.retain(|r| r.name != name);
551        self.remotes.len() < len_before
552    }
553
554    /// Get all blob roots for multi-location resolution.
555    /// Returns local blobs dir first, then remote blob URLs.
556    pub fn blob_roots(&self) -> Vec<String> {
557        let mut roots = vec![self.blobs_dir().to_string_lossy().to_string()];
558
559        for remote in &self.remotes {
560            if let Some(blob_url) = remote.blob_base_url() {
561                roots.push(blob_url);
562            }
563        }
564
565        roots
566    }
567
568    /// Get remotes that should be auto-attached.
569    pub fn auto_attach_remotes(&self) -> Vec<&RemoteConfig> {
570        self.remotes.iter().filter(|r| r.auto_attach).collect()
571    }
572}
573
/// Turn an arbitrary string into a short, filename-safe fragment (used for blob naming).
///
/// Only the first 32 characters are considered. Spaces become `-`;
/// alphanumerics, `-`, `_` and `.` are kept; every other character becomes `_`.
fn sanitize_for_filename(s: &str) -> String {
    let mut cleaned = String::new();
    // Shorter for blob filenames
    for ch in s.chars().take(32) {
        let safe = if ch == ' ' {
            '-'
        } else if ch.is_alphanumeric() || matches!(ch, '-' | '_' | '.') {
            ch
        } else {
            '_'
        };
        cleaned.push(safe);
    }
    cleaned
}
586
587/// Resolve BIRD_ROOT using the standard resolution order.
588fn resolve_bird_root() -> Result<PathBuf> {
589    // 1. Environment variable
590    if let Ok(path) = std::env::var("BIRD_ROOT") {
591        return Ok(PathBuf::from(path));
592    }
593
594    // 2. XDG data directory (via directories crate)
595    if let Some(proj_dirs) = ProjectDirs::from("", "", "bird") {
596        return Ok(proj_dirs.data_dir().to_path_buf());
597    }
598
599    // 3. Fallback to ~/.local/share/bird
600    let home = std::env::var("HOME")
601        .map_err(|_| Error::Config("Could not determine home directory".to_string()))?;
602    Ok(PathBuf::from(home).join(".local/share/bird"))
603}
604
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Root used by the tests that only compute values and never touch the disk.
    const TEST_ROOT: &str = "/tmp/test-bird";

    /// `with_root` keeps the given root and fills in the documented defaults.
    #[test]
    fn test_config_with_root() {
        let cfg = Config::with_root(TEST_ROOT);
        assert_eq!(cfg.bird_root, PathBuf::from(TEST_ROOT));
        assert_eq!(cfg.hot_days, 14);
        assert_eq!(cfg.inline_threshold, 4_096);
    }

    /// Blob paths are sharded by the first two hash characters and carry the
    /// sanitized command hint.
    #[test]
    fn test_blob_path() {
        let cfg = Config::with_root(TEST_ROOT);
        let expected = PathBuf::from(
            "/tmp/test-bird/db/data/recent/blobs/content/ab/abcdef123456--make-test.bin",
        );
        assert_eq!(cfg.blob_path("abcdef123456", "make test"), expected);
    }

    /// The database file and the hot data directory live under `db/`.
    #[test]
    fn test_config_paths() {
        let cfg = Config::with_root(TEST_ROOT);
        assert_eq!(cfg.db_path(), PathBuf::from("/tmp/test-bird/db/bird.duckdb"));
        assert_eq!(cfg.recent_dir(), PathBuf::from("/tmp/test-bird/db/data/recent"));
    }

    /// A saved config can be loaded back from the same root.
    #[test]
    fn test_config_save_load() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path().to_path_buf();

        // Create the directory structure
        std::fs::create_dir_all(&root).unwrap();

        let original = Config::with_root(&root);
        original.save().unwrap();

        let reloaded = Config::load_from(&root).unwrap();
        assert_eq!(reloaded.hot_days, original.hot_days);
        assert_eq!(reloaded.inline_threshold, original.inline_threshold);
    }
}