Skip to main content

bones_core/
recovery.rs

1//! Recovery procedures for corrupt shards, partial writes, and missing DB.
2//!
3//! This module implements the runtime recovery procedures that restore a bones
4//! project to a consistent state after:
5//! - Partial/torn writes (process crash mid-append)
6//! - Corrupt shard data (bit flips, truncation, invalid content)
7//! - Missing or corrupt `SQLite` projection database
8//! - Missing or corrupt binary cache files
9//! - Locked database (retry with timeout)
10//!
11//! # Recovery Philosophy
12//!
13//! - **Deterministic**: same input → same recovery action, every time.
14//! - **No silent data loss**: corrupt data is quarantined, never deleted outright.
15//! - **Fast common path**: torn-write repair is the typical case (truncate last
16//!   incomplete line). Complex cases (quarantine, rebuild) are rarer.
17//! - **User-facing messages**: every action emits a diagnostic so operators know
18//!   exactly what happened and why.
19
20use std::fs;
21use std::io;
22use std::path::{Path, PathBuf};
23use std::time::{Duration, Instant};
24
25use crate::event::parser;
26
27// ---------------------------------------------------------------------------
28// Types
29// ---------------------------------------------------------------------------
30
31/// Report from recovering a corrupt or partially-written shard file.
32#[derive(Debug, Clone, PartialEq, Eq)]
33pub struct RecoveryReport {
34    /// Path to the shard that was recovered.
35    pub shard_path: PathBuf,
36    /// Number of valid events preserved.
37    pub events_preserved: usize,
38    /// Number of corrupt/invalid events discarded.
39    pub events_discarded: usize,
40    /// Byte offset where corruption was detected (if applicable).
41    pub corruption_offset: Option<u64>,
42    /// What action was taken.
43    pub action_taken: RecoveryAction,
44}
45
/// What a recovery procedure actually did to the file it was given.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecoveryAction {
    /// The file was cut back to the last valid event boundary.
    Truncated {
        /// How many bytes were dropped from the end of the file.
        bytes_removed: u64,
    },
    /// Corrupt data was moved aside into a `.corrupt` backup file.
    Quarantined {
        /// Where the backup holding the corrupt data was written.
        backup_path: PathBuf,
    },
    /// The file was already valid, so nothing was changed.
    NoActionNeeded,
}
62
63/// Errors that can occur during recovery operations.
64#[derive(Debug, thiserror::Error)]
65pub enum RecoveryError {
66    /// I/O error during recovery.
67    #[error("recovery I/O error: {0}")]
68    Io(#[from] io::Error),
69
70    /// The shard file does not exist.
71    #[error("shard file not found: {}", .0.display())]
72    ShardNotFound(PathBuf),
73
74    /// The events directory does not exist.
75    #[error("events directory not found: {}", .0.display())]
76    EventsDirNotFound(PathBuf),
77
78    /// The database path is invalid.
79    #[error("invalid database path: {}", .0.display())]
80    InvalidDbPath(PathBuf),
81
82    /// Rebuild failed.
83    #[error("rebuild failed: {0}")]
84    RebuildFailed(String),
85
86    /// Lock timeout exceeded.
87    #[error("database locked after {0:?} — another process may hold the lock")]
88    LockTimeout(Duration),
89}
90
91// ---------------------------------------------------------------------------
92// Partial write recovery (torn writes)
93// ---------------------------------------------------------------------------
94
95/// Recover from a partial write (e.g., crash mid-append).
96///
97/// Detects incomplete last line (no trailing newline) and truncates
98/// to the last complete event line. This is the fast path — runs on
99/// startup before replay.
100///
101/// # Algorithm
102///
103/// 1. Read the file contents.
104/// 2. If empty or ends with `\n`, nothing to do.
105/// 3. Otherwise, find the last `\n` and truncate there.
106///
107/// # Returns
108///
109/// The number of bytes removed (0 if file was already clean).
110///
111/// # Errors
112///
113/// Returns an error if the file cannot be read or truncated.
114pub fn recover_partial_write(path: &Path) -> Result<u64, RecoveryError> {
115    if !path.exists() {
116        return Err(RecoveryError::ShardNotFound(path.to_path_buf()));
117    }
118
119    let content = fs::read(path)?;
120    if content.is_empty() || content.last() == Some(&b'\n') {
121        return Ok(0);
122    }
123
124    // Find last newline
125    let last_newline = content.iter().rposition(|&b| b == b'\n');
126    let truncate_to = last_newline.map_or(0, |pos| pos + 1);
127
128    let bytes_removed = content.len() - truncate_to;
129
130    // Truncate the file
131    let file = fs::OpenOptions::new().write(true).open(path)?;
132    file.set_len(truncate_to as u64)?;
133
134    tracing::warn!(
135        path = %path.display(),
136        bytes_removed,
137        "torn write repaired: truncated incomplete trailing line"
138    );
139
140    Ok(bytes_removed as u64)
141}
142
143// ---------------------------------------------------------------------------
144// Corrupt shard recovery
145// ---------------------------------------------------------------------------
146
147/// Recover a corrupt shard file by scanning for the last valid event line
148/// and quarantining corrupt data to a backup file.
149///
150/// # Algorithm
151///
152/// 1. Read the entire shard file.
153/// 2. Split into lines; validate each line:
154///    - Comment lines (`#`...) and blank lines are always valid.
155///    - Data lines must parse successfully via the TSJSON parser.
156/// 3. Find the last contiguous block of valid lines from the start.
157/// 4. If all lines are valid, return `NoActionNeeded`.
158/// 5. Otherwise:
159///    a. Write the corrupt tail to `<path>.corrupt` for manual inspection.
160///    b. Truncate the original file to the last valid line.
161///
162/// # Returns
163///
164/// A [`RecoveryReport`] describing what was found and what action was taken.
165///
166/// # Panics
167///
168/// Panics if the internal `first_bad_line` index is unexpectedly `None` after
169/// a prior `is_some()` check — this should never happen.
170///
171/// # Errors
172///
173/// Returns [`RecoveryError::ShardNotFound`] if the file does not exist, or
174/// [`RecoveryError::Io`] if the file cannot be read or written.
175pub fn recover_corrupt_shard(path: &Path) -> Result<RecoveryReport, RecoveryError> {
176    if !path.exists() {
177        return Err(RecoveryError::ShardNotFound(path.to_path_buf()));
178    }
179
180    let content = fs::read_to_string(path).map_err(|e| {
181        // If we can't even read as UTF-8, the whole file might be binary-corrupt
182        tracing::error!(path = %path.display(), error = %e, "shard is not valid UTF-8");
183        RecoveryError::Io(e)
184    })?;
185
186    if content.is_empty() {
187        return Ok(RecoveryReport {
188            shard_path: path.to_path_buf(),
189            events_preserved: 0,
190            events_discarded: 0,
191            corruption_offset: None,
192            action_taken: RecoveryAction::NoActionNeeded,
193        });
194    }
195
196    let lines: Vec<&str> = content.lines().collect();
197    let mut events_preserved = 0;
198    let mut first_bad_line = None;
199
200    for (i, line) in lines.iter().enumerate() {
201        let trimmed = line.trim();
202        if trimmed.is_empty() || trimmed.starts_with('#') {
203            // Comment or blank — always valid
204            continue;
205        }
206
207        // Try parsing as a TSJSON event line
208        if parser::parse_line(line).is_ok() {
209            events_preserved += 1;
210        } else {
211            first_bad_line = Some(i);
212            break;
213        }
214    }
215
216    // If we broke out on a bad line, count events from the valid prefix
217    // Otherwise, all lines valid
218    if first_bad_line.is_none() {
219        return Ok(RecoveryReport {
220            shard_path: path.to_path_buf(),
221            events_preserved,
222            events_discarded: 0,
223            corruption_offset: None,
224            action_taken: RecoveryAction::NoActionNeeded,
225        });
226    }
227
228    // SAFETY: we checked `first_bad_line.is_none()` above and returned early.
229    let bad_idx = first_bad_line.expect("checked is_some above");
230    let events_discarded = lines[bad_idx..]
231        .iter()
232        .filter(|l| {
233            let t = l.trim();
234            !t.is_empty() && !t.starts_with('#')
235        })
236        .count();
237
238    // Calculate byte offset of corruption
239    let corruption_offset: u64 = content
240        .lines()
241        .take(bad_idx)
242        .map(|l| l.len() as u64 + 1) // +1 for the newline
243        .sum();
244
245    // Quarantine: write corrupt tail to backup
246    let backup_path = path.with_extension("corrupt");
247    let corrupt_content: String = lines[bad_idx..].iter().fold(String::new(), |mut acc, l| {
248        use std::fmt::Write;
249        let _ = writeln!(acc, "{l}");
250        acc
251    });
252    fs::write(&backup_path, &corrupt_content)?;
253
254    // Truncate original to valid prefix
255    let valid_content: String = lines[..bad_idx].iter().fold(String::new(), |mut acc, l| {
256        use std::fmt::Write;
257        let _ = writeln!(acc, "{l}");
258        acc
259    });
260    fs::write(path, &valid_content)?;
261
262    tracing::warn!(
263        path = %path.display(),
264        events_preserved,
265        events_discarded,
266        corruption_offset,
267        backup = %backup_path.display(),
268        "corrupt shard recovered: quarantined bad data to backup file"
269    );
270
271    Ok(RecoveryReport {
272        shard_path: path.to_path_buf(),
273        events_preserved,
274        events_discarded,
275        corruption_offset: Some(corruption_offset),
276        action_taken: RecoveryAction::Quarantined { backup_path },
277    })
278}
279
280// ---------------------------------------------------------------------------
281// Missing DB recovery
282// ---------------------------------------------------------------------------
283
284/// Recover from a missing or corrupt `SQLite` projection by triggering a full
285/// rebuild from the event log.
286///
287/// This is the "auto-heal" path when `bones.db` is absent, corrupt, or
288/// fails integrity checks. Delegates to [`crate::db::rebuild::rebuild`].
289///
290/// # Arguments
291///
292/// * `events_dir` — Path to `.bones/events/` directory.
293/// * `db_path` — Path to `.bones/bones.db`.
294///
295/// # Errors
296///
297/// Returns an error if the events directory doesn't exist or rebuild fails.
298pub fn recover_missing_db(
299    events_dir: &Path,
300    db_path: &Path,
301) -> Result<RecoveryReport, RecoveryError> {
302    if !events_dir.exists() {
303        return Err(RecoveryError::EventsDirNotFound(events_dir.to_path_buf()));
304    }
305
306    // Delete corrupt DB if it exists (rebuild will create fresh)
307    let db_existed = db_path.exists();
308    if db_existed {
309        // Back up corrupt DB before deleting
310        let backup_path = db_path.with_extension("db.corrupt");
311        if let Err(e) = fs::copy(db_path, &backup_path) {
312            tracing::warn!(
313                error = %e,
314                "could not back up corrupt DB before rebuild"
315            );
316        }
317    }
318
319    let rebuild_result = crate::db::rebuild::rebuild(events_dir, db_path)
320        .map_err(|e| RecoveryError::RebuildFailed(e.to_string()))?;
321
322    let action = if db_existed {
323        let backup_path = db_path.with_extension("db.corrupt");
324        tracing::info!(
325            events = rebuild_result.event_count,
326            items = rebuild_result.item_count,
327            elapsed_ms = rebuild_result.elapsed.as_millis(),
328            "rebuilt corrupt projection from event log"
329        );
330        RecoveryAction::Quarantined { backup_path }
331    } else {
332        tracing::info!(
333            events = rebuild_result.event_count,
334            items = rebuild_result.item_count,
335            elapsed_ms = rebuild_result.elapsed.as_millis(),
336            "rebuilt missing projection from event log"
337        );
338        RecoveryAction::NoActionNeeded
339    };
340
341    Ok(RecoveryReport {
342        shard_path: db_path.to_path_buf(),
343        events_preserved: rebuild_result.event_count,
344        events_discarded: 0,
345        corruption_offset: None,
346        action_taken: action,
347    })
348}
349
350// ---------------------------------------------------------------------------
351// Corrupt cache recovery
352// ---------------------------------------------------------------------------
353
354/// Recover from a corrupt or missing binary cache by deleting it.
355///
356/// The cache will be rebuilt lazily on next access (it's a pure
357/// performance optimization derived from the event log).
358///
359/// # Returns
360///
361/// `true` if a cache file was deleted, `false` if it didn't exist.
362///
363/// # Errors
364///
365/// Returns [`RecoveryError::Io`] if the cache file cannot be deleted.
366pub fn recover_corrupt_cache(cache_path: &Path) -> Result<bool, RecoveryError> {
367    if !cache_path.exists() {
368        return Ok(false);
369    }
370
371    fs::remove_file(cache_path)?;
372
373    tracing::info!(
374        path = %cache_path.display(),
375        "deleted corrupt binary cache — will be rebuilt on next access"
376    );
377
378    Ok(true)
379}
380
381// ---------------------------------------------------------------------------
382// Locked DB retry
383// ---------------------------------------------------------------------------
384
385/// Attempt to open a `SQLite` database with retry and timeout for lock contention.
386///
387/// If the database is locked by another process, retries with exponential
388/// backoff up to `timeout`. Returns the connection on success or a
389/// [`RecoveryError::LockTimeout`] on failure.
390///
391/// # Arguments
392///
393/// * `db_path` — Path to the `SQLite` database.
394/// * `timeout` — Maximum time to wait for the lock.
395///
396/// # Errors
397///
398/// Returns `LockTimeout` if the lock is not released within the timeout.
399/// Returns `Io` for other I/O errors.
400pub fn open_db_with_retry(
401    db_path: &Path,
402    timeout: Duration,
403) -> Result<rusqlite::Connection, RecoveryError> {
404    let start = Instant::now();
405    let mut delay = Duration::from_millis(50);
406    let max_delay = Duration::from_secs(2);
407
408    loop {
409        match crate::db::open_projection(db_path) {
410            Ok(conn) => {
411                // Test that we can actually query (not just open the file)
412                match conn.execute_batch("SELECT 1") {
413                    Ok(()) => return Ok(conn),
414                    Err(e) if is_locked_error(&e) => {
415                        // Fall through to retry
416                        tracing::debug!(
417                            elapsed_ms = start.elapsed().as_millis(),
418                            "database locked, retrying..."
419                        );
420                    }
421                    Err(e) => {
422                        return Err(RecoveryError::Io(io::Error::other(e.to_string())));
423                    }
424                }
425            }
426            Err(e) => {
427                let err_str = e.to_string();
428                if err_str.contains("locked") || err_str.contains("busy") {
429                    tracing::debug!(
430                        elapsed_ms = start.elapsed().as_millis(),
431                        "database locked on open, retrying..."
432                    );
433                } else {
434                    return Err(RecoveryError::Io(io::Error::other(err_str)));
435                }
436            }
437        }
438
439        if start.elapsed() >= timeout {
440            return Err(RecoveryError::LockTimeout(timeout));
441        }
442
443        std::thread::sleep(delay);
444        delay = (delay * 2).min(max_delay);
445    }
446}
447
448/// Check if a rusqlite error is a lock/busy error.
449fn is_locked_error(e: &rusqlite::Error) -> bool {
450    if let rusqlite::Error::SqliteFailure(err, _) = e {
451        matches!(
452            err.code,
453            rusqlite::ffi::ErrorCode::DatabaseBusy | rusqlite::ffi::ErrorCode::DatabaseLocked
454        )
455    } else {
456        let s = e.to_string();
457        s.contains("locked") || s.contains("busy")
458    }
459}
460
461// ---------------------------------------------------------------------------
462// Full project health check and auto-recovery
463// ---------------------------------------------------------------------------
464
/// Outcome of a full project health check, as produced by `auto_recover`.
#[derive(Debug, Clone)]
pub struct HealthCheckResult {
    /// `true` when the project directory exists and is a directory.
    pub project_valid: bool,
    /// Shards that needed a torn-write repair, each paired with the number
    /// of bytes removed from it.
    pub torn_write_repairs: Vec<(PathBuf, u64)>,
    /// `true` when the projection database was rebuilt.
    pub db_rebuilt: bool,
    /// How many cache files were removed.
    pub caches_cleaned: usize,
    /// Human-readable descriptions of non-fatal problems encountered.
    pub warnings: Vec<String>,
}
479
480/// Run a full health check and auto-recovery on a bones project directory.
481///
482/// This is called on startup to ensure the project is in a consistent state.
483///
484/// # Steps
485///
486/// 1. Verify `.bones/` directory exists.
487/// 2. Recover torn writes on all shard files.
488/// 3. Check if `SQLite` DB exists and is valid; rebuild if not.
489/// 4. Clean corrupt cache files.
490///
491/// # Arguments
492///
493/// * `bones_dir` — Path to the `.bones/` directory.
494///
495/// # Errors
496///
497/// Returns a [`RecoveryError`] if a critical recovery step fails.
498/// Non-fatal issues are recorded in `HealthCheckResult::warnings`.
499pub fn auto_recover(bones_dir: &Path) -> Result<HealthCheckResult, RecoveryError> {
500    let mut result = HealthCheckResult {
501        project_valid: false,
502        torn_write_repairs: Vec::new(),
503        db_rebuilt: false,
504        caches_cleaned: 0,
505        warnings: Vec::new(),
506    };
507
508    // 1. Verify project directory
509    if !bones_dir.exists() || !bones_dir.is_dir() {
510        return Ok(result); // project_valid = false signals "not a bones project"
511    }
512    result.project_valid = true;
513
514    let events_dir = bones_dir.join("events");
515    let db_path = bones_dir.join("bones.db");
516    let cache_dir = bones_dir.join("cache");
517
518    // 2. Recover torn writes on all shard files
519    if events_dir.exists() {
520        match fs::read_dir(&events_dir) {
521            Ok(entries) => {
522                for entry in entries.flatten() {
523                    let path = entry.path();
524                    if path.extension().and_then(|e| e.to_str()) == Some("events") {
525                        match recover_partial_write(&path) {
526                            Ok(bytes) if bytes > 0 => {
527                                result.torn_write_repairs.push((path, bytes));
528                            }
529                            Ok(_) => {} // clean file
530                            Err(e) => {
531                                result.warnings.push(format!(
532                                    "torn-write check failed for {}: {e}",
533                                    path.display()
534                                ));
535                            }
536                        }
537                    }
538                }
539            }
540            Err(e) => {
541                result.warnings.push(format!("cannot read events dir: {e}"));
542            }
543        }
544    }
545
546    // 3. Check/rebuild SQLite DB
547    if events_dir.exists() {
548        let need_rebuild = !db_path.exists()
549            || crate::db::open_projection(&db_path).map_or(true, |conn| {
550                // Try a simple query to verify DB isn't corrupt
551                conn.execute_batch("SELECT COUNT(*) FROM items").is_err()
552            });
553
554        if need_rebuild {
555            match recover_missing_db(&events_dir, &db_path) {
556                Ok(_report) => {
557                    result.db_rebuilt = true;
558                }
559                Err(e) => {
560                    result.warnings.push(format!("DB rebuild failed: {e}"));
561                }
562            }
563        }
564    }
565
566    // 4. Clean corrupt cache files
567    if cache_dir.exists() {
568        let cache_events_bin = cache_dir.join("events.bin");
569        if cache_events_bin.exists() {
570            // Validate cache header (first 4 bytes should be magic)
571            let is_valid = fs::read(&cache_events_bin)
572                .map(|data| data.len() >= 4 && &data[..4] == b"BCEV")
573                .unwrap_or(false);
574
575            if !is_valid {
576                match recover_corrupt_cache(&cache_events_bin) {
577                    Ok(true) => result.caches_cleaned += 1,
578                    Ok(false) => {}
579                    Err(e) => {
580                        result.warnings.push(format!("cache cleanup failed: {e}"));
581                    }
582                }
583            }
584        }
585    }
586
587    tracing::info!(
588        torn_writes = result.torn_write_repairs.len(),
589        db_rebuilt = result.db_rebuilt,
590        caches_cleaned = result.caches_cleaned,
591        warnings = result.warnings.len(),
592        "auto-recovery complete"
593    );
594
595    Ok(result)
596}
597
598// ---------------------------------------------------------------------------
599// Tests
600// ---------------------------------------------------------------------------
601
602#[cfg(test)]
603mod tests {
604    use super::*;
605    use std::io::Write;
606    use tempfile::TempDir;
607
608    // ---- Partial write tests ----
609
610    #[test]
611    fn partial_write_clean_file() {
612        let dir = TempDir::new().unwrap();
613        let path = dir.path().join("test.events");
614        fs::write(&path, "line1\nline2\n").unwrap();
615
616        let bytes = recover_partial_write(&path).unwrap();
617        assert_eq!(bytes, 0);
618
619        let content = fs::read_to_string(&path).unwrap();
620        assert_eq!(content, "line1\nline2\n");
621    }
622
623    #[test]
624    fn partial_write_truncates_incomplete_line() {
625        let dir = TempDir::new().unwrap();
626        let path = dir.path().join("test.events");
627        fs::write(&path, "line1\nline2\npartial").unwrap();
628
629        let bytes = recover_partial_write(&path).unwrap();
630        assert_eq!(bytes, 7); // "partial" = 7 bytes
631
632        let content = fs::read_to_string(&path).unwrap();
633        assert_eq!(content, "line1\nline2\n");
634    }
635
636    #[test]
637    fn partial_write_no_complete_lines() {
638        let dir = TempDir::new().unwrap();
639        let path = dir.path().join("test.events");
640        fs::write(&path, "no newline at all").unwrap();
641
642        let bytes = recover_partial_write(&path).unwrap();
643        assert_eq!(bytes, 17);
644
645        let content = fs::read_to_string(&path).unwrap();
646        assert_eq!(content, "");
647    }
648
649    #[test]
650    fn partial_write_empty_file() {
651        let dir = TempDir::new().unwrap();
652        let path = dir.path().join("test.events");
653        fs::write(&path, "").unwrap();
654
655        let bytes = recover_partial_write(&path).unwrap();
656        assert_eq!(bytes, 0);
657    }
658
659    #[test]
660    fn partial_write_nonexistent_file() {
661        let dir = TempDir::new().unwrap();
662        let path = dir.path().join("nope.events");
663
664        let result = recover_partial_write(&path);
665        assert!(result.is_err());
666        assert!(matches!(
667            result.unwrap_err(),
668            RecoveryError::ShardNotFound(_)
669        ));
670    }
671
672    // ---- Corrupt shard tests ----
673
674    #[test]
675    fn corrupt_shard_clean_file() {
676        let dir = TempDir::new().unwrap();
677        let path = dir.path().join("test.events");
678        // Only comments and blank lines → valid
679        fs::write(&path, "# bones event log v1\n# comment\n\n").unwrap();
680
681        let report = recover_corrupt_shard(&path).unwrap();
682        assert_eq!(report.events_preserved, 0);
683        assert_eq!(report.events_discarded, 0);
684        assert_eq!(report.action_taken, RecoveryAction::NoActionNeeded);
685    }
686
687    #[test]
688    fn corrupt_shard_empty_file() {
689        let dir = TempDir::new().unwrap();
690        let path = dir.path().join("test.events");
691        fs::write(&path, "").unwrap();
692
693        let report = recover_corrupt_shard(&path).unwrap();
694        assert_eq!(report.events_preserved, 0);
695        assert_eq!(report.action_taken, RecoveryAction::NoActionNeeded);
696    }
697
698    #[test]
699    fn corrupt_shard_with_bad_data() {
700        let dir = TempDir::new().unwrap();
701        let path = dir.path().join("test.events");
702        // Header + a line that won't parse as TSJSON
703        fs::write(&path, "# header\nthis is garbage data\nmore garbage\n").unwrap();
704
705        let report = recover_corrupt_shard(&path).unwrap();
706        assert_eq!(report.events_preserved, 0);
707        assert_eq!(report.events_discarded, 2);
708        assert!(report.corruption_offset.is_some());
709
710        match &report.action_taken {
711            RecoveryAction::Quarantined { backup_path } => {
712                assert!(backup_path.exists());
713                let backup = fs::read_to_string(backup_path).unwrap();
714                assert!(backup.contains("garbage data"));
715            }
716            _ => panic!("expected Quarantined"),
717        }
718
719        // Original should only have the header
720        let content = fs::read_to_string(&path).unwrap();
721        assert_eq!(content, "# header\n");
722    }
723
724    #[test]
725    fn corrupt_shard_nonexistent_file() {
726        let dir = TempDir::new().unwrap();
727        let path = dir.path().join("nope.events");
728
729        let result = recover_corrupt_shard(&path);
730        assert!(result.is_err());
731    }
732
733    // ---- Cache recovery tests ----
734
735    #[test]
736    fn cache_recovery_deletes_file() {
737        let dir = TempDir::new().unwrap();
738        let path = dir.path().join("events.bin");
739        fs::write(&path, "corrupt data").unwrap();
740
741        let deleted = recover_corrupt_cache(&path).unwrap();
742        assert!(deleted);
743        assert!(!path.exists());
744    }
745
746    #[test]
747    fn cache_recovery_nonexistent_file() {
748        let dir = TempDir::new().unwrap();
749        let path = dir.path().join("events.bin");
750
751        let deleted = recover_corrupt_cache(&path).unwrap();
752        assert!(!deleted);
753    }
754
755    // ---- Missing DB recovery tests ----
756
757    #[test]
758    fn missing_db_no_events_dir() {
759        let dir = TempDir::new().unwrap();
760        let events_dir = dir.path().join("events");
761        let db_path = dir.path().join("bones.db");
762
763        let result = recover_missing_db(&events_dir, &db_path);
764        assert!(result.is_err());
765        assert!(matches!(
766            result.unwrap_err(),
767            RecoveryError::EventsDirNotFound(_)
768        ));
769    }
770
771    #[test]
772    fn missing_db_empty_events() {
773        let dir = TempDir::new().unwrap();
774        let bones_dir = dir.path();
775
776        // Set up minimal bones structure
777        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
778        shard_mgr.ensure_dirs().expect("ensure dirs");
779        shard_mgr.init().expect("init");
780
781        let events_dir = bones_dir.join("events");
782        let db_path = bones_dir.join("bones.db");
783
784        let report = recover_missing_db(&events_dir, &db_path).unwrap();
785        assert_eq!(report.events_preserved, 0);
786        assert!(db_path.exists());
787    }
788
789    #[test]
790    fn missing_db_with_events_rebuilds() {
791        let dir = TempDir::new().unwrap();
792        let bones_dir = dir.path();
793
794        // Set up bones with some events
795        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
796        shard_mgr.ensure_dirs().expect("ensure dirs");
797        shard_mgr.init().expect("init");
798
799        // Write a create event
800        use crate::event::Event;
801        use crate::event::data::*;
802        use crate::event::types::EventType;
803        use crate::event::writer;
804        use crate::model::item::{Kind, Size, Urgency};
805        use crate::model::item_id::ItemId;
806        use std::collections::BTreeMap;
807
808        let mut event = Event {
809            wall_ts_us: 1000,
810            agent: "test".into(),
811            itc: "itc:AQ".into(),
812            parents: vec![],
813            event_type: EventType::Create,
814            item_id: ItemId::new_unchecked("bn-001"),
815            data: EventData::Create(CreateData {
816                title: "Test item".into(),
817                kind: Kind::Task,
818                size: Some(Size::M),
819                urgency: Urgency::Default,
820                labels: vec![],
821                parent: None,
822                causation: None,
823                description: None,
824                extra: BTreeMap::new(),
825            }),
826            event_hash: String::new(),
827        };
828        writer::write_event(&mut event).expect("hash");
829        let line = writer::write_line(&event).expect("serialize");
830        let (year, month) = shard_mgr.active_shard().unwrap().unwrap();
831        shard_mgr.append_raw(year, month, &line).expect("append");
832
833        let events_dir = bones_dir.join("events");
834        let db_path = bones_dir.join("bones.db");
835
836        let report = recover_missing_db(&events_dir, &db_path).unwrap();
837        assert_eq!(report.events_preserved, 1);
838        assert!(db_path.exists());
839
840        // Verify the item is in the rebuilt DB
841        let conn = crate::db::open_projection(&db_path).unwrap();
842        let title: String = conn
843            .query_row(
844                "SELECT title FROM items WHERE item_id = 'bn-001'",
845                [],
846                |row| row.get(0),
847            )
848            .unwrap();
849        assert_eq!(title, "Test item");
850    }
851
852    #[test]
853    fn corrupt_db_is_backed_up_before_rebuild() {
854        let dir = TempDir::new().unwrap();
855        let bones_dir = dir.path();
856
857        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
858        shard_mgr.ensure_dirs().expect("ensure dirs");
859        shard_mgr.init().expect("init");
860
861        let events_dir = bones_dir.join("events");
862        let db_path = bones_dir.join("bones.db");
863
864        // Write something pretending to be a corrupt DB
865        fs::write(&db_path, "this is not sqlite").unwrap();
866
867        let report = recover_missing_db(&events_dir, &db_path).unwrap();
868
869        // Corrupt DB should be backed up
870        let backup_path = db_path.with_extension("db.corrupt");
871        match &report.action_taken {
872            RecoveryAction::Quarantined { backup_path: bp } => {
873                assert_eq!(bp, &backup_path);
874                assert!(backup_path.exists());
875                let backup_content = fs::read_to_string(&backup_path).unwrap();
876                assert_eq!(backup_content, "this is not sqlite");
877            }
878            _ => panic!("expected Quarantined action"),
879        }
880    }
881
882    // ---- Auto-recovery tests ----
883
884    #[test]
885    fn auto_recover_nonexistent_project() {
886        let dir = TempDir::new().unwrap();
887        let bones_dir = dir.path().join(".bones");
888
889        let result = auto_recover(&bones_dir).unwrap();
890        assert!(!result.project_valid);
891    }
892
893    #[test]
894    fn auto_recover_healthy_project() {
895        let dir = TempDir::new().unwrap();
896        let bones_dir = dir.path();
897
898        // Set up minimal healthy project
899        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
900        shard_mgr.ensure_dirs().expect("ensure dirs");
901        shard_mgr.init().expect("init");
902
903        // Create the DB with rebuild
904        let events_dir = bones_dir.join("events");
905        let db_path = bones_dir.join("bones.db");
906        crate::db::rebuild::rebuild(&events_dir, &db_path).unwrap();
907
908        let result = auto_recover(bones_dir).unwrap();
909        assert!(result.project_valid);
910        assert!(result.torn_write_repairs.is_empty());
911        assert!(!result.db_rebuilt);
912        assert_eq!(result.caches_cleaned, 0);
913        assert!(result.warnings.is_empty());
914    }
915
916    #[test]
917    fn auto_recover_repairs_torn_write() {
918        let dir = TempDir::new().unwrap();
919        let bones_dir = dir.path();
920
921        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
922        shard_mgr.ensure_dirs().expect("ensure dirs");
923        shard_mgr.init().expect("init");
924
925        // Create DB first
926        let events_dir = bones_dir.join("events");
927        let db_path = bones_dir.join("bones.db");
928        crate::db::rebuild::rebuild(&events_dir, &db_path).unwrap();
929
930        // Simulate torn write: append incomplete data to active shard
931        let (year, month) = shard_mgr.active_shard().unwrap().unwrap();
932        let shard_path = events_dir.join(format!("{year:04}-{month:02}.events"));
933        let mut file = fs::OpenOptions::new()
934            .append(true)
935            .open(&shard_path)
936            .unwrap();
937        file.write_all(b"incomplete line without newline").unwrap();
938
939        let result = auto_recover(bones_dir).unwrap();
940        assert!(result.project_valid);
941        assert_eq!(result.torn_write_repairs.len(), 1);
942        // "incomplete line without newline" = 30 bytes, but shard header may
943        // affect exact count. Just verify some bytes were repaired.
944        assert!(result.torn_write_repairs[0].1 > 0);
945    }
946
947    #[test]
948    fn auto_recover_rebuilds_missing_db() {
949        let dir = TempDir::new().unwrap();
950        let bones_dir = dir.path();
951
952        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
953        shard_mgr.ensure_dirs().expect("ensure dirs");
954        shard_mgr.init().expect("init");
955
956        // Don't create DB — auto_recover should rebuild it
957        let result = auto_recover(bones_dir).unwrap();
958        assert!(result.project_valid);
959        assert!(result.db_rebuilt);
960    }
961
962    #[test]
963    fn auto_recover_cleans_corrupt_cache() {
964        let dir = TempDir::new().unwrap();
965        let bones_dir = dir.path();
966
967        let shard_mgr = crate::shard::ShardManager::new(bones_dir);
968        shard_mgr.ensure_dirs().expect("ensure dirs");
969        shard_mgr.init().expect("init");
970
971        // Create DB
972        let events_dir = bones_dir.join("events");
973        let db_path = bones_dir.join("bones.db");
974        crate::db::rebuild::rebuild(&events_dir, &db_path).unwrap();
975
976        // Create corrupt cache
977        let cache_dir = bones_dir.join("cache");
978        fs::create_dir_all(&cache_dir).unwrap();
979        fs::write(cache_dir.join("events.bin"), "not a valid cache").unwrap();
980
981        let result = auto_recover(bones_dir).unwrap();
982        assert!(result.project_valid);
983        assert_eq!(result.caches_cleaned, 1);
984        assert!(!cache_dir.join("events.bin").exists());
985    }
986
987    // ---- Locked DB retry tests ----
988
989    #[test]
990    fn open_db_with_retry_succeeds_immediately() {
991        let dir = TempDir::new().unwrap();
992        let db_path = dir.path().join("test.db");
993
994        // Create a valid DB first
995        let conn = rusqlite::Connection::open(&db_path).unwrap();
996        conn.execute_batch("CREATE TABLE test (id INTEGER)")
997            .unwrap();
998        drop(conn);
999
1000        // Should open immediately
1001        let result = open_db_with_retry(&db_path, Duration::from_secs(1));
1002        assert!(result.is_ok());
1003    }
1004
1005    #[test]
1006    fn open_db_with_retry_handles_missing_db() {
1007        let dir = TempDir::new().unwrap();
1008        let db_path = dir.path().join("test.db");
1009
1010        // Should create the DB (SQLite creates on open)
1011        let result = open_db_with_retry(&db_path, Duration::from_secs(1));
1012        assert!(result.is_ok());
1013    }
1014
    // ---- RecoveryAction Debug / RecoveryError Display ----
1016
1017    #[test]
1018    fn recovery_action_debug() {
1019        let action = RecoveryAction::Truncated { bytes_removed: 42 };
1020        let debug = format!("{action:?}");
1021        assert!(debug.contains("42"));
1022
1023        let action = RecoveryAction::Quarantined {
1024            backup_path: PathBuf::from("/tmp/test.corrupt"),
1025        };
1026        let debug = format!("{action:?}");
1027        assert!(debug.contains("test.corrupt"));
1028    }
1029
1030    #[test]
1031    fn recovery_error_display() {
1032        let err = RecoveryError::ShardNotFound(PathBuf::from("/tmp/test.events"));
1033        let display = format!("{err}");
1034        assert!(display.contains("not found"));
1035
1036        let err = RecoveryError::LockTimeout(Duration::from_secs(30));
1037        let display = format!("{err}");
1038        assert!(display.contains("30s"));
1039    }
1040}