Skip to main content

bones_core/
recovery.rs

1//! Recovery procedures for corrupt shards, partial writes, and missing DB.
2//!
3//! This module implements the runtime recovery procedures that restore a bones
4//! project to a consistent state after:
5//! - Partial/torn writes (process crash mid-append)
6//! - Corrupt shard data (bit flips, truncation, invalid content)
7//! - Missing or corrupt `SQLite` projection database
8//! - Missing or corrupt binary cache files
9//! - Locked database (retry with timeout)
10//!
11//! # Recovery Philosophy
12//!
13//! - **Deterministic**: same input → same recovery action, every time.
14//! - **No silent data loss**: corrupt data is quarantined, never deleted outright.
15//! - **Fast common path**: torn-write repair is the typical case (truncate last
16//!   incomplete line). Complex cases (quarantine, rebuild) are rarer.
17//! - **User-facing messages**: every action emits a diagnostic so operators know
18//!   exactly what happened and why.
19
20use std::fs;
21use std::io;
22use std::path::{Path, PathBuf};
23use std::time::{Duration, Instant};
24
25use crate::event::parser;
26
27// ---------------------------------------------------------------------------
28// Types
29// ---------------------------------------------------------------------------
30
/// Report from recovering a corrupt or partially-written shard file.
///
/// Produced by [`recover_corrupt_shard`]. [`recover_missing_db`] returns the
/// same type to describe a projection rebuild.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RecoveryReport {
    /// Path to the shard that was recovered.
    ///
    /// [`recover_missing_db`] stores the database path here instead.
    pub shard_path: PathBuf,
    /// Number of valid events preserved.
    pub events_preserved: usize,
    /// Number of corrupt/invalid events discarded.
    ///
    /// For shard recovery this counts every non-blank, non-comment line from
    /// the first unparseable line to the end of the file.
    pub events_discarded: usize,
    /// Byte offset where corruption was detected (if applicable).
    ///
    /// `None` when no corruption was found.
    pub corruption_offset: Option<u64>,
    /// What action was taken.
    pub action_taken: RecoveryAction,
}
45
/// The action taken during recovery.
///
/// Carried in [`RecoveryReport::action_taken`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecoveryAction {
    /// Truncated file at the last valid event boundary.
    Truncated {
        /// Number of bytes removed from the end.
        bytes_removed: u64,
    },
    /// Quarantined corrupt data to a `.corrupt` backup file.
    ///
    /// Used both for corrupt shard tails and for a pre-existing projection
    /// database that was rebuilt.
    Quarantined {
        /// Path to the backup file containing the corrupt data.
        backup_path: PathBuf,
    },
    /// No action needed — file was valid.
    ///
    /// Also reported when a missing projection database was rebuilt, since
    /// nothing had to be quarantined in that case.
    NoActionNeeded,
}
62
/// Errors that can occur during recovery operations.
///
/// `Display` text comes from the `#[error(...)]` attributes via `thiserror`.
#[derive(Debug, thiserror::Error)]
pub enum RecoveryError {
    /// I/O error during recovery.
    ///
    /// Any [`io::Error`] converts into this variant through `?`.
    #[error("recovery I/O error: {0}")]
    Io(#[from] io::Error),

    /// The shard file does not exist.
    #[error("shard file not found: {}", .0.display())]
    ShardNotFound(PathBuf),

    /// The events directory does not exist.
    #[error("events directory not found: {}", .0.display())]
    EventsDirNotFound(PathBuf),

    /// The database path is invalid.
    #[error("invalid database path: {}", .0.display())]
    InvalidDbPath(PathBuf),

    /// Rebuild failed.
    ///
    /// Carries the stringified error returned by the rebuild routine.
    #[error("rebuild failed: {0}")]
    RebuildFailed(String),

    /// Lock timeout exceeded.
    ///
    /// Carries the timeout that was requested by the caller.
    #[error("database locked after {0:?} — another process may hold the lock")]
    LockTimeout(Duration),
}
90
91// ---------------------------------------------------------------------------
92// Partial write recovery (torn writes)
93// ---------------------------------------------------------------------------
94
95/// Recover from a partial write (e.g., crash mid-append).
96///
97/// Detects incomplete last line (no trailing newline) and truncates
98/// to the last complete event line. This is the fast path — runs on
99/// startup before replay.
100///
101/// # Algorithm
102///
103/// 1. Read the file contents.
104/// 2. If empty or ends with `\n`, nothing to do.
105/// 3. Otherwise, find the last `\n` and truncate there.
106///
107/// # Returns
108///
109/// The number of bytes removed (0 if file was already clean).
110///
111/// # Errors
112///
113/// Returns an error if the file cannot be read or truncated.
114pub fn recover_partial_write(path: &Path) -> Result<u64, RecoveryError> {
115    if !path.exists() {
116        return Err(RecoveryError::ShardNotFound(path.to_path_buf()));
117    }
118
119    let content = fs::read(path)?;
120    if content.is_empty() || content.last() == Some(&b'\n') {
121        return Ok(0);
122    }
123
124    // Find last newline
125    let last_newline = content.iter().rposition(|&b| b == b'\n');
126    let truncate_to = last_newline.map_or(0, |pos| pos + 1);
127
128    let bytes_removed = content.len() - truncate_to;
129
130    // Truncate the file
131    let file = fs::OpenOptions::new().write(true).open(path)?;
132    file.set_len(truncate_to as u64)?;
133
134    tracing::warn!(
135        path = %path.display(),
136        bytes_removed,
137        "torn write repaired: truncated incomplete trailing line"
138    );
139
140    Ok(bytes_removed as u64)
141}
142
143// ---------------------------------------------------------------------------
144// Corrupt shard recovery
145// ---------------------------------------------------------------------------
146
147/// Recover a corrupt shard file by scanning for the last valid event line
148/// and quarantining corrupt data to a backup file.
149///
150/// # Algorithm
151///
152/// 1. Read the entire shard file.
153/// 2. Split into lines; validate each line:
154///    - Comment lines (`#`...) and blank lines are always valid.
155///    - Data lines must parse successfully via the TSJSON parser.
156/// 3. Find the last contiguous block of valid lines from the start.
157/// 4. If all lines are valid, return `NoActionNeeded`.
158/// 5. Otherwise:
159///    a. Write the corrupt tail to `<path>.corrupt` for manual inspection.
160///    b. Truncate the original file to the last valid line.
161///
162/// # Returns
163///
164/// A [`RecoveryReport`] describing what was found and what action was taken.
165///
166/// # Panics
167///
168/// Panics if the internal `first_bad_line` index is unexpectedly `None` after
169/// a prior `is_some()` check — this should never happen.
170///
171/// # Errors
172///
173/// Returns [`RecoveryError::ShardNotFound`] if the file does not exist, or
174/// [`RecoveryError::Io`] if the file cannot be read or written.
175pub fn recover_corrupt_shard(path: &Path) -> Result<RecoveryReport, RecoveryError> {
176    if !path.exists() {
177        return Err(RecoveryError::ShardNotFound(path.to_path_buf()));
178    }
179
180    let content = fs::read_to_string(path).map_err(|e| {
181        // If we can't even read as UTF-8, the whole file might be binary-corrupt
182        tracing::error!(path = %path.display(), error = %e, "shard is not valid UTF-8");
183        RecoveryError::Io(e)
184    })?;
185
186    if content.is_empty() {
187        return Ok(RecoveryReport {
188            shard_path: path.to_path_buf(),
189            events_preserved: 0,
190            events_discarded: 0,
191            corruption_offset: None,
192            action_taken: RecoveryAction::NoActionNeeded,
193        });
194    }
195
196    let lines: Vec<&str> = content.lines().collect();
197    let mut events_preserved = 0;
198    let mut first_bad_line = None;
199
200    for (i, line) in lines.iter().enumerate() {
201        let trimmed = line.trim();
202        if trimmed.is_empty() || trimmed.starts_with('#') {
203            // Comment or blank — always valid
204            continue;
205        }
206
207        // Try parsing as a TSJSON event line
208        if parser::parse_line(line).is_ok() {
209            events_preserved += 1;
210        } else {
211            first_bad_line = Some(i);
212            break;
213        }
214    }
215
216    // If we broke out on a bad line, count events from the valid prefix
217    // Otherwise, all lines valid
218    if first_bad_line.is_none() {
219        return Ok(RecoveryReport {
220            shard_path: path.to_path_buf(),
221            events_preserved,
222            events_discarded: 0,
223            corruption_offset: None,
224            action_taken: RecoveryAction::NoActionNeeded,
225        });
226    }
227
228    // SAFETY: we checked `first_bad_line.is_none()` above and returned early.
229    let bad_idx = first_bad_line.expect("checked is_some above");
230    let events_discarded = lines[bad_idx..]
231        .iter()
232        .filter(|l| {
233            let t = l.trim();
234            !t.is_empty() && !t.starts_with('#')
235        })
236        .count();
237
238    // Calculate byte offset of corruption
239    let corruption_offset: u64 = content
240        .lines()
241        .take(bad_idx)
242        .map(|l| l.len() as u64 + 1) // +1 for the newline
243        .sum();
244
245    // Quarantine: write corrupt tail to backup
246    let backup_path = path.with_extension("corrupt");
247    let corrupt_content: String = lines[bad_idx..].iter().fold(String::new(), |mut acc, l| {
248        use std::fmt::Write;
249        let _ = writeln!(acc, "{l}");
250        acc
251    });
252    fs::write(&backup_path, &corrupt_content)?;
253
254    // Truncate original to valid prefix
255    let valid_content: String = lines[..bad_idx].iter().fold(String::new(), |mut acc, l| {
256        use std::fmt::Write;
257        let _ = writeln!(acc, "{l}");
258        acc
259    });
260    fs::write(path, &valid_content)?;
261
262    tracing::warn!(
263        path = %path.display(),
264        events_preserved,
265        events_discarded,
266        corruption_offset,
267        backup = %backup_path.display(),
268        "corrupt shard recovered: quarantined bad data to backup file"
269    );
270
271    Ok(RecoveryReport {
272        shard_path: path.to_path_buf(),
273        events_preserved,
274        events_discarded,
275        corruption_offset: Some(corruption_offset),
276        action_taken: RecoveryAction::Quarantined { backup_path },
277    })
278}
279
280// ---------------------------------------------------------------------------
281// Missing DB recovery
282// ---------------------------------------------------------------------------
283
284/// Recover from a missing or corrupt `SQLite` projection by triggering a full
285/// rebuild from the event log.
286///
287/// This is the "auto-heal" path when `bones.db` is absent, corrupt, or
288/// fails integrity checks. Delegates to [`crate::db::rebuild::rebuild`].
289///
290/// # Arguments
291///
292/// * `events_dir` — Path to `.bones/events/` directory.
293/// * `db_path` — Path to `.bones/bones.db`.
294///
295/// # Errors
296///
297/// Returns an error if the events directory doesn't exist or rebuild fails.
298pub fn recover_missing_db(
299    events_dir: &Path,
300    db_path: &Path,
301) -> Result<RecoveryReport, RecoveryError> {
302    if !events_dir.exists() {
303        return Err(RecoveryError::EventsDirNotFound(events_dir.to_path_buf()));
304    }
305
306    // Delete corrupt DB if it exists (rebuild will create fresh)
307    let db_existed = db_path.exists();
308    if db_existed {
309        // Back up corrupt DB before deleting
310        let backup_path = db_path.with_extension("db.corrupt");
311        if let Err(e) = fs::copy(db_path, &backup_path) {
312            tracing::warn!(
313                error = %e,
314                "could not back up corrupt DB before rebuild"
315            );
316        }
317    }
318
319    let rebuild_result = crate::db::rebuild::rebuild(events_dir, db_path)
320        .map_err(|e| RecoveryError::RebuildFailed(e.to_string()))?;
321
322    let action = if db_existed {
323        let backup_path = db_path.with_extension("db.corrupt");
324        tracing::info!(
325            events = rebuild_result.event_count,
326            items = rebuild_result.item_count,
327            elapsed_ms = rebuild_result.elapsed.as_millis(),
328            "rebuilt corrupt projection from event log"
329        );
330        RecoveryAction::Quarantined { backup_path }
331    } else {
332        tracing::info!(
333            events = rebuild_result.event_count,
334            items = rebuild_result.item_count,
335            elapsed_ms = rebuild_result.elapsed.as_millis(),
336            "rebuilt missing projection from event log"
337        );
338        RecoveryAction::NoActionNeeded
339    };
340
341    Ok(RecoveryReport {
342        shard_path: db_path.to_path_buf(),
343        events_preserved: rebuild_result.event_count,
344        events_discarded: 0,
345        corruption_offset: None,
346        action_taken: action,
347    })
348}
349
350// ---------------------------------------------------------------------------
351// Corrupt cache recovery
352// ---------------------------------------------------------------------------
353
354/// Recover from a corrupt or missing binary cache by deleting it.
355///
356/// The cache will be rebuilt lazily on next access (it's a pure
357/// performance optimization derived from the event log).
358///
359/// # Returns
360///
361/// `true` if a cache file was deleted, `false` if it didn't exist.
362///
363/// # Errors
364///
365/// Returns [`RecoveryError::Io`] if the cache file cannot be deleted.
366pub fn recover_corrupt_cache(cache_path: &Path) -> Result<bool, RecoveryError> {
367    if !cache_path.exists() {
368        return Ok(false);
369    }
370
371    fs::remove_file(cache_path)?;
372
373    tracing::info!(
374        path = %cache_path.display(),
375        "deleted corrupt binary cache — will be rebuilt on next access"
376    );
377
378    Ok(true)
379}
380
381// ---------------------------------------------------------------------------
382// Locked DB retry
383// ---------------------------------------------------------------------------
384
385/// Attempt to open a `SQLite` database with retry and timeout for lock contention.
386///
387/// If the database is locked by another process, retries with exponential
388/// backoff up to `timeout`. Returns the connection on success or a
389/// [`RecoveryError::LockTimeout`] on failure.
390///
391/// # Arguments
392///
393/// * `db_path` — Path to the `SQLite` database.
394/// * `timeout` — Maximum time to wait for the lock.
395///
396/// # Errors
397///
398/// Returns `LockTimeout` if the lock is not released within the timeout.
399/// Returns `Io` for other I/O errors.
400pub fn open_db_with_retry(
401    db_path: &Path,
402    timeout: Duration,
403) -> Result<rusqlite::Connection, RecoveryError> {
404    let start = Instant::now();
405    let mut delay = Duration::from_millis(50);
406    let max_delay = Duration::from_secs(2);
407
408    loop {
409        match crate::db::open_projection(db_path) {
410            Ok(conn) => {
411                // Test that we can actually query (not just open the file)
412                match conn.execute_batch("SELECT 1") {
413                    Ok(()) => return Ok(conn),
414                    Err(e) if is_locked_error(&e) => {
415                        // Fall through to retry
416                        tracing::debug!(
417                            elapsed_ms = start.elapsed().as_millis(),
418                            "database locked, retrying..."
419                        );
420                    }
421                    Err(e) => {
422                        return Err(RecoveryError::Io(io::Error::other(e.to_string())));
423                    }
424                }
425            }
426            Err(e) => {
427                let err_str = e.to_string();
428                if err_str.contains("locked") || err_str.contains("busy") {
429                    tracing::debug!(
430                        elapsed_ms = start.elapsed().as_millis(),
431                        "database locked on open, retrying..."
432                    );
433                } else {
434                    return Err(RecoveryError::Io(io::Error::other(err_str)));
435                }
436            }
437        }
438
439        if start.elapsed() >= timeout {
440            return Err(RecoveryError::LockTimeout(timeout));
441        }
442
443        std::thread::sleep(delay);
444        delay = (delay * 2).min(max_delay);
445    }
446}
447
448/// Check if a rusqlite error is a lock/busy error.
449fn is_locked_error(e: &rusqlite::Error) -> bool {
450    if let rusqlite::Error::SqliteFailure(err, _) = e {
451        matches!(
452            err.code,
453            rusqlite::ffi::ErrorCode::DatabaseBusy | rusqlite::ffi::ErrorCode::DatabaseLocked
454        )
455    } else {
456        let s = e.to_string();
457        s.contains("locked") || s.contains("busy")
458    }
459}
460
461// ---------------------------------------------------------------------------
462// Full project health check and auto-recovery
463// ---------------------------------------------------------------------------
464
/// Result of a full project health check.
///
/// Returned by [`auto_recover`]; every field starts out empty/false and is
/// filled in as the individual recovery steps run.
#[derive(Debug, Clone)]
pub struct HealthCheckResult {
    /// Whether the project directory exists and looks valid.
    ///
    /// When this is `false`, no recovery step was attempted.
    pub project_valid: bool,
    /// Torn-write recovery results (one per shard).
    ///
    /// Each entry is `(shard path, bytes removed)`; only shards that actually
    /// had bytes removed are listed.
    pub torn_write_repairs: Vec<(PathBuf, u64)>,
    /// Whether the DB was rebuilt.
    ///
    /// Stays `false` if a rebuild was attempted but failed (the failure is
    /// recorded in `warnings`).
    pub db_rebuilt: bool,
    /// Number of cache files cleaned.
    pub caches_cleaned: usize,
    /// Errors encountered (non-fatal).
    ///
    /// Human-readable messages, one per failed step.
    pub warnings: Vec<String>,
}
479
480/// Run a full health check and auto-recovery on a bones project directory.
481///
482/// This is called on startup to ensure the project is in a consistent state.
483///
484/// # Steps
485///
486/// 1. Verify `.bones/` directory exists.
487/// 2. Recover torn writes on all shard files.
488/// 3. Check if `SQLite` DB exists and is valid; rebuild if not.
489/// 4. Clean corrupt cache files.
490///
491/// # Arguments
492///
493/// * `bones_dir` — Path to the `.bones/` directory.
494///
495/// # Errors
496///
497/// Returns a [`RecoveryError`] if a critical recovery step fails.
498/// Non-fatal issues are recorded in `HealthCheckResult::warnings`.
499pub fn auto_recover(bones_dir: &Path) -> Result<HealthCheckResult, RecoveryError> {
500    let mut result = HealthCheckResult {
501        project_valid: false,
502        torn_write_repairs: Vec::new(),
503        db_rebuilt: false,
504        caches_cleaned: 0,
505        warnings: Vec::new(),
506    };
507
508    // 1. Verify project directory
509    if !bones_dir.exists() || !bones_dir.is_dir() {
510        return Ok(result); // project_valid = false signals "not a bones project"
511    }
512    result.project_valid = true;
513
514    let events_dir = bones_dir.join("events");
515    let db_path = bones_dir.join("bones.db");
516    let cache_dir = bones_dir.join("cache");
517
518    // 2. Recover torn writes on all shard files
519    if events_dir.exists() {
520        match fs::read_dir(&events_dir) {
521            Ok(entries) => {
522                for entry in entries.flatten() {
523                    let path = entry.path();
524                    if path.extension().and_then(|e| e.to_str()) == Some("events") {
525                        match recover_partial_write(&path) {
526                            Ok(bytes) if bytes > 0 => {
527                                result.torn_write_repairs.push((path, bytes));
528                            }
529                            Ok(_) => {} // clean file
530                            Err(e) => {
531                                result.warnings.push(format!(
532                                    "torn-write check failed for {}: {e}",
533                                    path.display()
534                                ));
535                            }
536                        }
537                    }
538                }
539            }
540            Err(e) => {
541                result.warnings.push(format!("cannot read events dir: {e}"));
542            }
543        }
544    }
545
546    // 3. Check/rebuild SQLite DB
547    if events_dir.exists() {
548        let need_rebuild = !db_path.exists()
549            || crate::db::open_projection(&db_path).map_or(true, |conn| {
550                // Try a simple query to verify DB isn't corrupt
551                conn.execute_batch("SELECT COUNT(*) FROM items").is_err()
552            });
553
554        if need_rebuild {
555            match recover_missing_db(&events_dir, &db_path) {
556                Ok(_report) => {
557                    result.db_rebuilt = true;
558                }
559                Err(e) => {
560                    result.warnings.push(format!("DB rebuild failed: {e}"));
561                }
562            }
563        }
564    }
565
566    // 4. Clean corrupt cache files
567    if cache_dir.exists() {
568        let cache_events_bin = cache_dir.join("events.bin");
569        if cache_events_bin.exists() {
570            // Validate cache header (first 4 bytes should be magic)
571            let is_valid = fs::read(&cache_events_bin)
572                .is_ok_and(|data| data.len() >= 4 && &data[..4] == b"BCEV");
573
574            if !is_valid {
575                match recover_corrupt_cache(&cache_events_bin) {
576                    Ok(true) => result.caches_cleaned += 1,
577                    Ok(false) => {}
578                    Err(e) => {
579                        result.warnings.push(format!("cache cleanup failed: {e}"));
580                    }
581                }
582            }
583        }
584    }
585
586    tracing::info!(
587        torn_writes = result.torn_write_repairs.len(),
588        db_rebuilt = result.db_rebuilt,
589        caches_cleaned = result.caches_cleaned,
590        warnings = result.warnings.len(),
591        "auto-recovery complete"
592    );
593
594    Ok(result)
595}
596
597// ---------------------------------------------------------------------------
598// Tests
599// ---------------------------------------------------------------------------
600
601#[cfg(test)]
602mod tests {
603    use super::*;
604    use std::io::Write;
605    use tempfile::TempDir;
606
607    // ---- Partial write tests ----
608
609    #[test]
610    fn partial_write_clean_file() {
611        let dir = TempDir::new().unwrap();
612        let path = dir.path().join("test.events");
613        fs::write(&path, "line1\nline2\n").unwrap();
614
615        let bytes = recover_partial_write(&path).unwrap();
616        assert_eq!(bytes, 0);
617
618        let content = fs::read_to_string(&path).unwrap();
619        assert_eq!(content, "line1\nline2\n");
620    }
621
622    #[test]
623    fn partial_write_truncates_incomplete_line() {
624        let dir = TempDir::new().unwrap();
625        let path = dir.path().join("test.events");
626        fs::write(&path, "line1\nline2\npartial").unwrap();
627
628        let bytes = recover_partial_write(&path).unwrap();
629        assert_eq!(bytes, 7); // "partial" = 7 bytes
630
631        let content = fs::read_to_string(&path).unwrap();
632        assert_eq!(content, "line1\nline2\n");
633    }
634
635    #[test]
636    fn partial_write_no_complete_lines() {
637        let dir = TempDir::new().unwrap();
638        let path = dir.path().join("test.events");
639        fs::write(&path, "no newline at all").unwrap();
640
641        let bytes = recover_partial_write(&path).unwrap();
642        assert_eq!(bytes, 17);
643
644        let content = fs::read_to_string(&path).unwrap();
645        assert_eq!(content, "");
646    }
647
648    #[test]
649    fn partial_write_empty_file() {
650        let dir = TempDir::new().unwrap();
651        let path = dir.path().join("test.events");
652        fs::write(&path, "").unwrap();
653
654        let bytes = recover_partial_write(&path).unwrap();
655        assert_eq!(bytes, 0);
656    }
657
658    #[test]
659    fn partial_write_nonexistent_file() {
660        let dir = TempDir::new().unwrap();
661        let path = dir.path().join("nope.events");
662
663        let result = recover_partial_write(&path);
664        assert!(result.is_err());
665        assert!(matches!(
666            result.unwrap_err(),
667            RecoveryError::ShardNotFound(_)
668        ));
669    }
670
671    // ---- Corrupt shard tests ----
672
673    #[test]
674    fn corrupt_shard_clean_file() {
675        let dir = TempDir::new().unwrap();
676        let path = dir.path().join("test.events");
677        // Only comments and blank lines → valid
678        fs::write(&path, "# bones event log v1\n# comment\n\n").unwrap();
679
680        let report = recover_corrupt_shard(&path).unwrap();
681        assert_eq!(report.events_preserved, 0);
682        assert_eq!(report.events_discarded, 0);
683        assert_eq!(report.action_taken, RecoveryAction::NoActionNeeded);
684    }
685
686    #[test]
687    fn corrupt_shard_empty_file() {
688        let dir = TempDir::new().unwrap();
689        let path = dir.path().join("test.events");
690        fs::write(&path, "").unwrap();
691
692        let report = recover_corrupt_shard(&path).unwrap();
693        assert_eq!(report.events_preserved, 0);
694        assert_eq!(report.action_taken, RecoveryAction::NoActionNeeded);
695    }
696
697    #[test]
698    fn corrupt_shard_with_bad_data() {
699        let dir = TempDir::new().unwrap();
700        let path = dir.path().join("test.events");
701        // Header + a line that won't parse as TSJSON
702        fs::write(&path, "# header\nthis is garbage data\nmore garbage\n").unwrap();
703
704        let report = recover_corrupt_shard(&path).unwrap();
705        assert_eq!(report.events_preserved, 0);
706        assert_eq!(report.events_discarded, 2);
707        assert!(report.corruption_offset.is_some());
708
709        match &report.action_taken {
710            RecoveryAction::Quarantined { backup_path } => {
711                assert!(backup_path.exists());
712                let backup = fs::read_to_string(backup_path).unwrap();
713                assert!(backup.contains("garbage data"));
714            }
715            _ => panic!("expected Quarantined"),
716        }
717
718        // Original should only have the header
719        let content = fs::read_to_string(&path).unwrap();
720        assert_eq!(content, "# header\n");
721    }
722
723    #[test]
724    fn corrupt_shard_nonexistent_file() {
725        let dir = TempDir::new().unwrap();
726        let path = dir.path().join("nope.events");
727
728        let result = recover_corrupt_shard(&path);
729        assert!(result.is_err());
730    }
731
732    // ---- Cache recovery tests ----
733
734    #[test]
735    fn cache_recovery_deletes_file() {
736        let dir = TempDir::new().unwrap();
737        let path = dir.path().join("events.bin");
738        fs::write(&path, "corrupt data").unwrap();
739
740        let deleted = recover_corrupt_cache(&path).unwrap();
741        assert!(deleted);
742        assert!(!path.exists());
743    }
744
745    #[test]
746    fn cache_recovery_nonexistent_file() {
747        let dir = TempDir::new().unwrap();
748        let path = dir.path().join("events.bin");
749
750        let deleted = recover_corrupt_cache(&path).unwrap();
751        assert!(!deleted);
752    }
753
754    // ---- Missing DB recovery tests ----
755
756    #[test]
757    fn missing_db_no_events_dir() {
758        let dir = TempDir::new().unwrap();
759        let events_dir = dir.path().join("events");
760        let db_path = dir.path().join("bones.db");
761
762        let result = recover_missing_db(&events_dir, &db_path);
763        assert!(result.is_err());
764        assert!(matches!(
765            result.unwrap_err(),
766            RecoveryError::EventsDirNotFound(_)
767        ));
768    }
769
770    #[test]
771    fn missing_db_empty_events() {
772        let dir = TempDir::new().unwrap();
773        let bones_dir = dir.path();
774
775        // Set up minimal bones structure
776        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
777        shard_mgr.ensure_dirs().expect("ensure dirs");
778        shard_mgr.init().expect("init");
779
780        let events_dir = bones_dir.join("events");
781        let db_path = bones_dir.join("bones.db");
782
783        let report = recover_missing_db(&events_dir, &db_path).unwrap();
784        assert_eq!(report.events_preserved, 0);
785        assert!(db_path.exists());
786    }
787
788    #[test]
789    fn missing_db_with_events_rebuilds() {
790        let dir = TempDir::new().unwrap();
791        let bones_dir = dir.path();
792
793        // Set up bones with some events
794        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
795        shard_mgr.ensure_dirs().expect("ensure dirs");
796        shard_mgr.init().expect("init");
797
798        // Write a create event
799        use crate::event::Event;
800        use crate::event::data::*;
801        use crate::event::types::EventType;
802        use crate::event::writer;
803        use crate::model::item::{Kind, Size, Urgency};
804        use crate::model::item_id::ItemId;
805        use std::collections::BTreeMap;
806
807        let mut event = Event {
808            wall_ts_us: 1000,
809            agent: "test".into(),
810            itc: "itc:AQ".into(),
811            parents: vec![],
812            event_type: EventType::Create,
813            item_id: ItemId::new_unchecked("bn-001"),
814            data: EventData::Create(CreateData {
815                title: "Test item".into(),
816                kind: Kind::Task,
817                size: Some(Size::M),
818                urgency: Urgency::Default,
819                labels: vec![],
820                parent: None,
821                causation: None,
822                description: None,
823                extra: BTreeMap::new(),
824            }),
825            event_hash: String::new(),
826        };
827        writer::write_event(&mut event).expect("hash");
828        let line = writer::write_line(&event).expect("serialize");
829        let (year, month) = shard_mgr.active_shard().unwrap().unwrap();
830        shard_mgr.append_raw(year, month, &line).expect("append");
831
832        let events_dir = bones_dir.join("events");
833        let db_path = bones_dir.join("bones.db");
834
835        let report = recover_missing_db(&events_dir, &db_path).unwrap();
836        assert_eq!(report.events_preserved, 1);
837        assert!(db_path.exists());
838
839        // Verify the item is in the rebuilt DB
840        let conn = crate::db::open_projection(&db_path).unwrap();
841        let title: String = conn
842            .query_row(
843                "SELECT title FROM items WHERE item_id = 'bn-001'",
844                [],
845                |row| row.get(0),
846            )
847            .unwrap();
848        assert_eq!(title, "Test item");
849    }
850
/// A non-SQLite file sitting at the DB path must be reported as quarantined
/// to `bones.db.corrupt`, and that backup must hold the original bytes.
#[test]
fn corrupt_db_is_backed_up_before_rebuild() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let manager = crate::shard::ShardManager::new(root);
    manager.ensure_dirs().expect("ensure dirs");
    manager.init().expect("init");

    // Plant garbage where the projection DB is expected to live.
    let db_path = root.join("bones.db");
    fs::write(&db_path, "this is not sqlite").unwrap();

    let report = recover_missing_db(&root.join("events"), &db_path).unwrap();

    // The report has to name the quarantine file, and that file has to
    // contain exactly what was at the DB path before recovery ran.
    let expected_backup = db_path.with_extension("db.corrupt");
    if let RecoveryAction::Quarantined { backup_path } = &report.action_taken {
        assert_eq!(backup_path, &expected_backup);
        assert!(expected_backup.exists());
        assert_eq!(
            fs::read_to_string(&expected_backup).unwrap(),
            "this is not sqlite"
        );
    } else {
        panic!("expected Quarantined action");
    }
}
880
881    // ---- Auto-recovery tests ----
882
/// Running auto-recovery against a `.bones` path that was never created
/// returns `Ok` with the project flagged as not valid.
#[test]
fn auto_recover_nonexistent_project() {
    let tmp = TempDir::new().unwrap();
    // Only the parent temp directory exists; `.bones` itself does not.
    let missing = tmp.path().join(".bones");

    let outcome = auto_recover(&missing).unwrap();
    assert!(!outcome.project_valid);
}
891
/// On a freshly initialised project with an up-to-date projection DB,
/// auto-recovery reports a valid project and performs no repairs at all.
#[test]
fn auto_recover_healthy_project() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    // Minimal healthy layout: initialised shard directories...
    let manager = crate::shard::ShardManager::new(root);
    manager.ensure_dirs().expect("ensure dirs");
    manager.init().expect("init");

    // ...plus a projection DB built from those shards.
    crate::db::rebuild::rebuild(&root.join("events"), &root.join("bones.db")).unwrap();

    let outcome = auto_recover(root).unwrap();

    // Valid project, and every repair counter/flag stays at its "nothing done" value.
    assert!(outcome.project_valid);
    assert!(outcome.torn_write_repairs.is_empty());
    assert!(!outcome.db_rebuilt);
    assert_eq!(outcome.caches_cleaned, 0);
    assert!(outcome.warnings.is_empty());
}
914
/// Appending a fragment with no trailing newline to the active shard must be
/// reported by auto-recovery as exactly one torn-write repair that removed a
/// non-zero number of bytes.
#[test]
fn auto_recover_repairs_torn_write() {
    let dir = TempDir::new().unwrap();
    let bones_dir = dir.path();

    let shard_mgr = crate::shard::ShardManager::new(bones_dir);
    shard_mgr.ensure_dirs().expect("ensure dirs");
    shard_mgr.init().expect("init");

    // Create the DB first so the only thing left to repair is the shard.
    let events_dir = bones_dir.join("events");
    let db_path = bones_dir.join("bones.db");
    crate::db::rebuild::rebuild(&events_dir, &db_path).unwrap();

    // Simulate a torn write: append incomplete data to the active shard.
    let (year, month) = shard_mgr.active_shard().unwrap().unwrap();
    let shard_path = events_dir.join(format!("{year:04}-{month:02}.events"));
    {
        let mut file = fs::OpenOptions::new()
            .append(true)
            .open(&shard_path)
            .unwrap();
        file.write_all(b"incomplete line without newline").unwrap();
        // `file` is dropped (closed) at the end of this scope, so recovery
        // operates on a shard that this test no longer holds open.
    }

    let result = auto_recover(bones_dir).unwrap();
    assert!(result.project_valid);
    assert_eq!(result.torn_write_repairs.len(), 1);
    // The appended fragment is 31 bytes, but the shard header may affect the
    // exact count. Just verify that some bytes were repaired.
    assert!(result.torn_write_repairs[0].1 > 0);
}
945
/// When shards are initialised but no projection DB was ever created,
/// auto-recovery reports a valid project and flags the DB as rebuilt.
#[test]
fn auto_recover_rebuilds_missing_db() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let manager = crate::shard::ShardManager::new(root);
    manager.ensure_dirs().expect("ensure dirs");
    manager.init().expect("init");

    // Deliberately skip creating the DB; recovery is expected to do it.
    let outcome = auto_recover(root).unwrap();
    assert!(outcome.project_valid);
    assert!(outcome.db_rebuilt);
}
960
/// A `cache/events.bin` file with invalid contents must be counted as one
/// cleaned cache and must no longer exist after auto-recovery.
#[test]
fn auto_recover_cleans_corrupt_cache() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let manager = crate::shard::ShardManager::new(root);
    manager.ensure_dirs().expect("ensure dirs");
    manager.init().expect("init");

    // Build the projection DB so the cache is the only thing out of order.
    crate::db::rebuild::rebuild(&root.join("events"), &root.join("bones.db")).unwrap();

    // Drop a file with bogus contents where the binary cache is expected.
    let cache_dir = root.join("cache");
    let cache_file = cache_dir.join("events.bin");
    fs::create_dir_all(&cache_dir).unwrap();
    fs::write(&cache_file, "not a valid cache").unwrap();

    let outcome = auto_recover(root).unwrap();
    assert!(outcome.project_valid);
    assert_eq!(outcome.caches_cleaned, 1);
    assert!(!cache_file.exists());
}
985
986    // ---- Locked DB retry tests ----
987
/// Opening an existing SQLite file that no other connection holds must
/// succeed when given a one-second timeout.
#[test]
fn open_db_with_retry_succeeds_immediately() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("test.db");

    // Seed a real SQLite file. The setup connection is closed when this scope
    // ends, before the function under test is called.
    {
        let setup = rusqlite::Connection::open(&db_path).unwrap();
        setup
            .execute_batch("CREATE TABLE test (id INTEGER)")
            .unwrap();
    }

    let opened = open_db_with_retry(&db_path, Duration::from_secs(1));
    assert!(opened.is_ok());
}
1003
/// Opening a path where no database file exists yet must still return `Ok`
/// when given a one-second timeout.
#[test]
fn open_db_with_retry_handles_missing_db() {
    let tmp = TempDir::new().unwrap();
    // Nothing is ever written to this path before the call below.
    let db_path = tmp.path().join("test.db");

    // Expected to succeed because SQLite creates the file on open; only the
    // `Ok` outcome is asserted here.
    let opened = open_db_with_retry(&db_path, Duration::from_secs(1));
    assert!(opened.is_ok());
}
1013
1014    // ---- RecoveryAction Debug / RecoveryError Display ----
1015
/// The `Debug` rendering of each `RecoveryAction` variant built here must
/// include the variant's payload (byte count or backup file name).
#[test]
fn recovery_action_debug() {
    let truncated = format!("{:?}", RecoveryAction::Truncated { bytes_removed: 42 });
    assert!(truncated.contains("42"));

    let quarantined = format!(
        "{:?}",
        RecoveryAction::Quarantined {
            backup_path: PathBuf::from("/tmp/test.corrupt"),
        }
    );
    assert!(quarantined.contains("test.corrupt"));
}
1028
/// The `Display` text of `RecoveryError` must say "not found" for a missing
/// shard and must render a 30-second lock timeout as "30s".
#[test]
fn recovery_error_display() {
    let not_found = RecoveryError::ShardNotFound(PathBuf::from("/tmp/test.events")).to_string();
    assert!(not_found.contains("not found"));

    let timed_out = RecoveryError::LockTimeout(Duration::from_secs(30)).to_string();
    assert!(timed_out.contains("30s"));
}
1039}