Skip to main content

innate_core/
backup.rs

1//! Cloudflare R2 backup for the Innate knowledge database.
2//!
3//! Uses S3-compatible API with AWS Signature V4.
4//! R2 endpoint: https://{account_id}.r2.cloudflarestorage.com
5//! Region for signing: "auto"
6
7use std::path::Path;
8
9use hmac::{Hmac, Mac};
10use sha2::{Digest, Sha256};
11
12use crate::settings::{BackupConfig, R2Config};
13
14type HmacSha256 = Hmac<Sha256>;
15
16// ── Backup state (local cache of last backup time) ───────────────────────────
17
18fn backup_state_path() -> std::path::PathBuf {
19    dirs_next::home_dir()
20        .unwrap_or_else(|| std::path::PathBuf::from("."))
21        .join(".innate")
22        .join("backup_state.json")
23}
24
25#[derive(Debug, Default, serde::Serialize, serde::Deserialize)]
26pub struct BackupState {
27    pub last_backup_at: Option<String>,
28    pub last_backup_key: Option<String>,
29}
30
31fn load_state() -> BackupState {
32    let Ok(text) = std::fs::read_to_string(backup_state_path()) else {
33        return BackupState::default();
34    };
35    serde_json::from_str(&text).unwrap_or_default()
36}
37
38fn save_state(state: &BackupState) -> anyhow::Result<()> {
39    let path = backup_state_path();
40    if let Some(parent) = path.parent() {
41        std::fs::create_dir_all(parent)?;
42    }
43    std::fs::write(path, serde_json::to_string_pretty(state)?)?;
44    Ok(())
45}
46
47// ── Public result types ───────────────────────────────────────────────────────
48
49#[derive(Debug, Clone, serde::Serialize)]
50pub struct BackupInfo {
51    pub key: String,
52    pub last_modified: String,
53    pub size_bytes: u64,
54}
55
56#[derive(Debug, serde::Serialize)]
57pub struct PruneResult {
58    pub deleted: Vec<String>,
59    pub kept: usize,
60    /// Backups older than retention_days that were NOT deleted to honour min_backups.
61    pub protected_by_min: usize,
62}
63
64#[derive(Debug, serde::Serialize)]
65pub struct BackupResult {
66    pub key: String,
67    pub size_bytes: u64,
68    pub prune: PruneResult,
69}
70
71// ── R2 backup service ─────────────────────────────────────────────────────────
72
73pub struct R2BackupService {
74    account_id: String,
75    bucket: String,
76    access_key_id: String,
77    secret_access_key: String,
78    prefix: String,
79}
80
81impl R2BackupService {
82    pub fn from_config(cfg: &R2Config) -> anyhow::Result<Self> {
83        let access_key_id = cfg
84            .resolved_access_key_id()
85            .ok_or_else(|| anyhow::anyhow!(
86                "R2 access_key_id not set; configure backup.r2.access_key_id \
87                 or INNATE_R2_ACCESS_KEY_ID env var"
88            ))?;
89        let secret_access_key = cfg
90            .resolved_secret_access_key()
91            .ok_or_else(|| anyhow::anyhow!(
92                "R2 secret_access_key not set; configure backup.r2.secret_access_key \
93                 or INNATE_R2_SECRET_ACCESS_KEY env var"
94            ))?;
95        Ok(Self {
96            account_id: cfg.account_id.clone(),
97            bucket: cfg.bucket.clone(),
98            access_key_id,
99            secret_access_key,
100            prefix: cfg.prefix.clone(),
101        })
102    }
103
104    fn endpoint_base(&self) -> String {
105        format!("https://{}.r2.cloudflarestorage.com", self.account_id)
106    }
107
108    fn host(&self) -> String {
109        format!("{}.r2.cloudflarestorage.com", self.account_id)
110    }
111
112    // ── AWS Sig V4 ────────────────────────────────────────────────────────────
113
114    fn sha256_hex(data: &[u8]) -> String {
115        let mut h = Sha256::new();
116        h.update(data);
117        hex_bytes(&h.finalize())
118    }
119
120    fn hmac_sha256(key: &[u8], data: &[u8]) -> Vec<u8> {
121        let mut mac = HmacSha256::new_from_slice(key).expect("HMAC accepts any key size");
122        mac.update(data);
123        mac.finalize().into_bytes().to_vec()
124    }
125
126    /// Returns `(Authorization header value, x-amz-content-sha256 value)`.
127    fn sign(
128        &self,
129        method: &str,
130        path: &str,  // URI path, e.g. "/bucket/key" (already URI-encoded)
131        query: &str, // canonical query string (keys=values sorted, values URI-encoded)
132        body: &[u8],
133        datetime: &str, // "20240101T120000Z"
134    ) -> (String, String) {
135        let date = &datetime[..8];
136        let payload_hash = Self::sha256_hex(body);
137        let host = self.host();
138
139        // Canonical headers — must be sorted by lowercase name.
140        let mut hdrs: Vec<(String, String)> = vec![
141            ("host".into(), host.clone()),
142            ("x-amz-content-sha256".into(), payload_hash.clone()),
143            ("x-amz-date".into(), datetime.into()),
144        ];
145        hdrs.sort_by(|a, b| a.0.cmp(&b.0));
146
147        let canonical_headers: String = hdrs.iter().map(|(k, v)| format!("{}:{}\n", k, v)).collect();
148        let signed_headers: String = hdrs.iter().map(|(k, _)| k.as_str()).collect::<Vec<_>>().join(";");
149
150        let canonical_request = format!(
151            "{}\n{}\n{}\n{}\n{}\n{}",
152            method, path, query, canonical_headers, signed_headers, payload_hash
153        );
154
155        let credential_scope = format!("{}/auto/s3/aws4_request", date);
156        let string_to_sign = format!(
157            "AWS4-HMAC-SHA256\n{}\n{}\n{}",
158            datetime,
159            credential_scope,
160            Self::sha256_hex(canonical_request.as_bytes())
161        );
162
163        let k_date = Self::hmac_sha256(
164            format!("AWS4{}", self.secret_access_key).as_bytes(),
165            date.as_bytes(),
166        );
167        let k_region = Self::hmac_sha256(&k_date, b"auto");
168        let k_service = Self::hmac_sha256(&k_region, b"s3");
169        let k_signing = Self::hmac_sha256(&k_service, b"aws4_request");
170        let signature = hex_bytes(&Self::hmac_sha256(&k_signing, string_to_sign.as_bytes()));
171
172        let auth = format!(
173            "AWS4-HMAC-SHA256 Credential={}/{},SignedHeaders={},Signature={}",
174            self.access_key_id, credential_scope, signed_headers, signature
175        );
176        (auth, payload_hash)
177    }
178
179    fn now_datetime() -> String {
180        chrono::Utc::now().format("%Y%m%dT%H%M%SZ").to_string()
181    }
182
183    // ── HTTP operations ───────────────────────────────────────────────────────
184
185    fn put_object(&self, key: &str, body: &[u8]) -> anyhow::Result<()> {
186        let path = format!("/{}/{}", self.bucket, uri_encode_path(key));
187        let datetime = Self::now_datetime();
188        let (auth, payload_hash) = self.sign("PUT", &path, "", body, &datetime);
189        let url = format!("{}{}", self.endpoint_base(), path);
190        ureq::put(&url)
191            .set("x-amz-date", &datetime)
192            .set("x-amz-content-sha256", &payload_hash)
193            .set("Authorization", &auth)
194            .send_bytes(body)
195            .map_err(|e| anyhow::anyhow!("R2 PUT {key}: {e}"))?;
196        Ok(())
197    }
198
199    fn delete_object(&self, key: &str) -> anyhow::Result<()> {
200        let path = format!("/{}/{}", self.bucket, uri_encode_path(key));
201        let datetime = Self::now_datetime();
202        let (auth, payload_hash) = self.sign("DELETE", &path, "", &[], &datetime);
203        let url = format!("{}{}", self.endpoint_base(), path);
204        match ureq::delete(&url)
205            .set("x-amz-date", &datetime)
206            .set("x-amz-content-sha256", &payload_hash)
207            .set("Authorization", &auth)
208            .call()
209        {
210            Ok(_) => Ok(()),
211            Err(ureq::Error::Status(404, _)) => Ok(()), // already gone
212            Err(e) => Err(anyhow::anyhow!("R2 DELETE {key}: {e}")),
213        }
214    }
215
216    fn list_objects(&self) -> anyhow::Result<Vec<BackupInfo>> {
217        // Build canonical query: list-type=2 (and prefix if set).
218        // Values must be URI-encoded in the canonical query string.
219        let mut query_params: Vec<(&str, String)> = vec![("list-type", "2".into())];
220        if !self.prefix.is_empty() {
221            query_params.push(("prefix", uri_encode_value(&self.prefix)));
222        }
223        query_params.sort_by_key(|(k, _)| *k);
224        let canonical_query: String = query_params
225            .iter()
226            .map(|(k, v)| format!("{}={}", k, v))
227            .collect::<Vec<_>>()
228            .join("&");
229
230        let path = format!("/{}", self.bucket);
231        let datetime = Self::now_datetime();
232        let (auth, payload_hash) = self.sign("GET", &path, &canonical_query, &[], &datetime);
233        let url = format!("{}{}?{}", self.endpoint_base(), path, canonical_query);
234
235        let body = ureq::get(&url)
236            .set("x-amz-date", &datetime)
237            .set("x-amz-content-sha256", &payload_hash)
238            .set("Authorization", &auth)
239            .call()
240            .map_err(|e| anyhow::anyhow!("R2 LIST: {e}"))?
241            .into_string()
242            .map_err(|e| anyhow::anyhow!("R2 LIST read: {e}"))?;
243
244        Ok(parse_list_xml(&body))
245    }
246
247    // ── Public API ────────────────────────────────────────────────────────────
248
249    /// Create a consistent SQLite copy via VACUUM INTO, upload to R2, then prune old backups.
250    pub fn backup_now(
251        &self,
252        db_path: &Path,
253        retention_days: u64,
254        min_backups: usize,
255    ) -> anyhow::Result<BackupResult> {
256        // Temporary file for the VACUUM copy.
257        let tmp_dir = dirs_next::home_dir()
258            .unwrap_or_else(|| std::path::PathBuf::from("."))
259            .join(".innate")
260            .join("tmp");
261        std::fs::create_dir_all(&tmp_dir)?;
262        let ts_str = chrono::Utc::now().format("%Y%m%d-%H%M%S").to_string();
263        let tmp_path = tmp_dir.join(format!("innate-backup-{ts_str}.db"));
264
265        let vacuum_result = (|| -> anyhow::Result<Vec<u8>> {
266            let conn = rusqlite::Connection::open(db_path)
267                .map_err(|e| anyhow::anyhow!("Cannot open DB for backup: {e}"))?;
268            conn.execute(
269                "VACUUM INTO ?1",
270                rusqlite::params![tmp_path.to_string_lossy().as_ref()],
271            )
272            .map_err(|e| anyhow::anyhow!("VACUUM INTO failed: {e}"))?;
273            let bytes = std::fs::read(&tmp_path)
274                .map_err(|e| anyhow::anyhow!("Cannot read backup temp file: {e}"))?;
275            Ok(bytes)
276        })();
277        let _ = std::fs::remove_file(&tmp_path); // clean up regardless
278        let body = vacuum_result?;
279
280        let size_bytes = body.len() as u64;
281        let key = format!("{}innate-backup-{ts_str}.db", self.prefix);
282        self.put_object(&key, &body)?;
283
284        let now = crate::utils::utc_now_iso();
285        let _ = save_state(&BackupState {
286            last_backup_at: Some(now),
287            last_backup_key: Some(key.clone()),
288        });
289
290        let prune = self.prune_old_backups(retention_days, min_backups)?;
291
292        Ok(BackupResult { key, size_bytes, prune })
293    }
294
295    /// List all backup objects in R2.
296    pub fn list_backups(&self) -> anyhow::Result<Vec<BackupInfo>> {
297        let mut backups = self.list_objects()?;
298        backups.sort_by(|a, b| a.last_modified.cmp(&b.last_modified));
299        Ok(backups)
300    }
301
302    /// Delete backups older than `retention_days`, but always keep at least `min_backups`.
303    pub fn prune_old_backups(
304        &self,
305        retention_days: u64,
306        min_backups: usize,
307    ) -> anyhow::Result<PruneResult> {
308        let mut backups = self.list_objects()?;
309        // Oldest first.
310        backups.sort_by(|a, b| a.last_modified.cmp(&b.last_modified));
311
312        let cutoff = (chrono::Utc::now() - chrono::Duration::days(retention_days as i64))
313            .format("%Y-%m-%dT%H:%M:%S")
314            .to_string();
315
316        let total = backups.len();
317        let old_indices: Vec<usize> = backups
318            .iter()
319            .enumerate()
320            .filter(|(_, b)| b.last_modified < cutoff)
321            .map(|(i, _)| i)
322            .collect();
323
324        // We must keep at least min_backups total.
325        let max_deletable = total.saturating_sub(min_backups);
326        let deletable = old_indices.len().min(max_deletable);
327        let protected = old_indices.len().saturating_sub(deletable);
328
329        let to_delete: Vec<String> = old_indices
330            .into_iter()
331            .take(deletable)
332            .map(|i| backups[i].key.clone())
333            .collect();
334
335        for key in &to_delete {
336            self.delete_object(key)?;
337        }
338
339        Ok(PruneResult {
340            kept: total - to_delete.len(),
341            deleted: to_delete,
342            protected_by_min: protected,
343        })
344    }
345
346    // ── State helpers (no network) ────────────────────────────────────────────
347
348    /// Returns true if a backup is due based on local state (no network call).
349    pub fn needs_backup(interval_hours: u64) -> bool {
350        let state = load_state();
351        let Some(last_at) = state.last_backup_at else {
352            return true;
353        };
354        // utc_now_iso() format: "2024-01-01T12:00:00.000Z"
355        let Ok(last_dt) = chrono::DateTime::parse_from_rfc3339(&last_at) else {
356            return true;
357        };
358        let elapsed = chrono::Utc::now().signed_duration_since(last_dt.with_timezone(&chrono::Utc));
359        elapsed.num_hours() >= interval_hours as i64
360    }
361
362    pub fn last_backup_state() -> BackupState {
363        load_state()
364    }
365}
366
367// ── Auto-backup entry point (called by daemon and MCP/CLI) ───────────────────
368
369/// Run a backup if the configured interval has elapsed since the last backup.
370/// Returns true if a backup was performed.
371pub fn maybe_auto_backup(db_path: &Path, cfg: &BackupConfig) -> anyhow::Result<bool> {
372    if !cfg.enable {
373        return Ok(false);
374    }
375    let r2_cfg = match &cfg.r2 {
376        Some(c) => c,
377        None => return Ok(false),
378    };
379    if !R2BackupService::needs_backup(cfg.auto_backup_interval_hours) {
380        return Ok(false);
381    }
382    let svc = R2BackupService::from_config(r2_cfg)?;
383    svc.backup_now(db_path, cfg.retention_days, cfg.min_backups)?;
384    Ok(true)
385}
386
387// ── Helpers ───────────────────────────────────────────────────────────────────
388
/// Render `bytes` as a lowercase hexadecimal string, two digits per byte.
fn hex_bytes(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut encoded = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        encoded.push(char::from(DIGITS[usize::from(byte >> 4)]));
        encoded.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    encoded
}
392
/// Percent-encode an object key for use in a URI path.
///
/// ASCII letters, digits, `-`, `_`, `.`, `~` and `/` pass through unchanged;
/// every other byte becomes `%XX` with uppercase hex digits.
fn uri_encode_path(s: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";
    let mut encoded = String::with_capacity(s.len());
    for &byte in s.as_bytes() {
        let passthrough =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~' | b'/');
        if passthrough {
            encoded.push(char::from(byte));
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX[usize::from(byte & 0x0f)]));
        }
    }
    encoded
}
406
/// Percent-encode a query-parameter value.
///
/// Only ASCII letters, digits, `-`, `_`, `.` and `~` are left as-is; all
/// other bytes, `/` included, are emitted as `%XX` with uppercase hex digits.
fn uri_encode_value(s: &str) -> String {
    s.bytes()
        .map(|byte| match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => {
                char::from(byte).to_string()
            }
            _ => format!("%{byte:02X}"),
        })
        .collect()
}
422
423/// Parse S3 XML list-objects-v2 response into BackupInfo list.
424fn parse_list_xml(body: &str) -> Vec<BackupInfo> {
425    let mut results = Vec::new();
426    for block in body.split("<Contents>").skip(1) {
427        let key = xml_text(block, "Key");
428        let lm = xml_text(block, "LastModified");
429        let size = xml_text(block, "Size")
430            .and_then(|s| s.parse().ok())
431            .unwrap_or(0);
432        if let (Some(key), Some(last_modified)) = (key, lm) {
433            results.push(BackupInfo { key, last_modified, size_bytes: size });
434        }
435    }
436    results
437}
438
/// Return the text content of the first `<tag>…</tag>` element in `haystack`.
///
/// The opening tag must appear exactly as `<tag>` (no attributes). XML
/// character and entity references in the content are decoded, so an object
/// key that the server escaped (e.g. `a&amp;b`) comes back as the real key
/// (`a&b`). Returns `None` when the opening or closing tag is missing.
fn xml_text(haystack: &str, tag: &str) -> Option<String> {
    let open = format!("<{tag}>");
    let close = format!("</{tag}>");
    let start = haystack.find(&open)? + open.len();
    let len = haystack[start..].find(&close)?;
    Some(xml_unescape(&haystack[start..start + len]))
}

/// Decode the five predefined XML entities (`&amp;`, `&lt;`, `&gt;`,
/// `&quot;`, `&apos;`) and numeric references (`&#NN;`, `&#xHH;`).
///
/// Decoding is single-pass, so `&amp;lt;` yields `&lt;` rather than `<`.
/// Anything that is not a recognised reference is copied through unchanged.
fn xml_unescape(raw: &str) -> String {
    if !raw.contains('&') {
        return raw.to_owned();
    }
    let mut out = String::with_capacity(raw.len());
    let mut rest = raw;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        // `tail` starts with '&'; a reference runs up to the next ';'.
        let decoded = tail.find(';').and_then(|semi| {
            let name = &tail[1..semi];
            let ch = match name {
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "quot" => Some('"'),
                "apos" => Some('\''),
                _ => name.strip_prefix('#').and_then(|num| {
                    let code = match num.strip_prefix('x').or_else(|| num.strip_prefix('X')) {
                        Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                        None => num.parse::<u32>().ok()?,
                    };
                    char::from_u32(code)
                }),
            };
            ch.map(|c| (c, semi + 1))
        });
        match decoded {
            Some((c, consumed)) => {
                out.push(c);
                rest = &tail[consumed..];
            }
            None => {
                // Not a reference: keep the ampersand literally.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}