indexedlog/
repair.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8use std::fs;
9use std::io::Write;
10use std::ops::AddAssign;
11use std::path::Path;
12
13use crate::errors::ResultExt;
14use crate::lock::DirLockOptions;
15use crate::lock::ScopedDirLock;
16use crate::lock::READER_LOCK_OPTS;
17
18// Public interface -------------------------------------------------------
19
/// Repair a structure at the given path.
///
/// The type parameter `T` does not appear in the method signature; it only
/// distinguishes implementations. The blanket implementation in this module
/// instantiates it with a type implementing [`OpenOptionsRepair`].
pub trait Repair<T> {
    /// Repair a structure at the given path.
    ///
    /// Returns a `String` on success.
    /// NOTE(review): presumably a human-readable report of what the repair
    /// did -- confirm against implementors.
    ///
    /// Overload this method to repair recursively.
    fn repair(path: impl AsRef<Path>) -> crate::Result<String>;
}
27
/// Repair on open.
///
/// Implemented in this module for every type that implements both
/// [`OpenOptionsOutput`] and [`OpenOptionsRepair`].
pub trait OpenWithRepair {
    /// The value produced by a successful open.
    type Output;

    /// Call `open`. If it fails with data corruption errors, try `repair`
    /// once, then `open` again.
    ///
    /// This conveniently fixes a subset of corruptions usually caused by OS
    /// crashes or hard reboots. It does not fix corruptions that may occur
    /// during data reading after `open`.
    ///
    /// For performance reasons, this does not perform a full verification
    /// of all data, so corruption can still be encountered when reading
    /// data later.
    ///
    /// For safety, repair is skipped if there are other readers. This is
    /// because indexedlog relies on data being append-only for lock-free
    /// reads, and repair is not append-only. Repairing underneath other
    /// running processes that are reading the data, or that hold on to
    /// data they read previously, could make them observe wrong data
    /// silently, without any way to detect it.
    fn open_with_repair(&self, path: impl AsRef<Path>) -> crate::Result<Self::Output>
    where
        Self: Sized;
}
51
/// A structure with a static [`OpenOptions`].
///
/// Structures implementing this trait with `T` being `log::OpenOptions`
/// or `rotate::OpenOptions` get `Repair` implemented automatically.
/// More precisely, the blanket implementation in this module applies to
/// any `T` that implements [`OpenOptionsRepair`].
pub trait DefaultOpenOptions<T> {
    /// Returns the open options to use for this structure.
    ///
    /// This is an associated function (no `self`), so the options are
    /// determined by the type alone.
    fn default_open_options() -> T;
}
59
60// Private implementations ------------------------------------------------
61
/// Repair defined on an instance. For example, `OpenOptions`.
///
/// This is the instance-level counterpart of [`Repair`], whose `repair`
/// is an associated function without `self`.
pub trait OpenOptionsRepair {
    /// Repair the structure at `path`, using the settings held by `self`.
    ///
    /// Returns a `String` on success; `open_with_repair` in this module
    /// treats it as the repair message and logs it.
    fn open_options_repair(&self, path: impl AsRef<Path>) -> crate::Result<String>;
}
66
/// Defines the output of OpenOptions.
///
/// Together with [`OpenOptionsRepair`], implementing this trait is what
/// makes a type eligible for the blanket [`OpenWithRepair`] implementation.
pub trait OpenOptionsOutput {
    /// The value produced by a successful [`OpenOptionsOutput::open_path`].
    type Output;

    /// Open the structure at `path` using the settings held by `self`.
    ///
    /// Unlike the other path-taking methods in this module, this takes a
    /// plain `&Path` rather than `impl AsRef<Path>`.
    fn open_path(&self, path: &Path) -> crate::Result<Self::Output>;
}
73
/// Text describing a repair, accumulated in memory as a string.
///
/// Every piece appended with `+=` is also mirrored, best-effort, to the
/// additional outputs (ex. a `repair.log` file). Errors writing to those
/// outputs are ignored.
pub(crate) struct RepairMessage {
    output: String,
    additional_outputs: Vec<Box<dyn Write>>,
}

impl RepairMessage {
    /// Builds a `RepairMessage` that mirrors its content to `repair.log`
    /// inside `dir`.
    ///
    /// Being unable to open or write that file is not fatal: the message
    /// is then only kept in memory.
    pub(crate) fn new(dir: &Path) -> Self {
        /// Size above which an existing `repair.log` is started over.
        const REPAIR_LOG_SIZE_LIMIT: u64 = 1 << 20;

        let log_path = dir.join("repair.log");

        // Start the log over once it has grown too large (ex. when repair
        // runs in a loop). The size check and the truncation are not
        // atomic, but the file is diagnostics only; a race here cannot
        // affect the correctness of the main data.
        let oversized = match fs::metadata(&log_path) {
            Ok(meta) => meta.len() > REPAIR_LOG_SIZE_LIMIT,
            Err(_) => false,
        };

        let mut open_opts = fs::OpenOptions::new();
        open_opts
            .write(true)
            .create(true)
            .truncate(oversized)
            .append(!oversized);

        let additional_outputs = match open_opts.open(log_path) {
            Ok(mut log) => {
                if oversized {
                    let _ = log.write_all(b"# This file was truncated\n\n");
                }
                // Header line for this session; written to the file only,
                // not to the in-memory message.
                if let Ok(since_epoch) = std::time::UNIX_EPOCH.elapsed() {
                    let stamp = format!("date -d @{}\n", since_epoch.as_secs());
                    let _ = log.write_all(stamp.as_bytes());
                }
                vec![Box::new(log) as Box<dyn Write>]
            }
            Err(_) => Vec::new(),
        };

        Self {
            output: String::new(),
            additional_outputs,
        }
    }

    /// Borrows the text accumulated so far.
    pub(crate) fn as_str(&self) -> &str {
        &self.output
    }

    /// Finishes the message and returns the accumulated text.
    ///
    /// Each additional output gets a trailing newline and is flushed
    /// (best-effort) before the text is handed back.
    pub(crate) fn into_string(self) -> String {
        let Self {
            output,
            additional_outputs,
        } = self;
        for mut sink in additional_outputs {
            let _ = sink.write_all(b"\n");
            let _ = sink.flush();
        }
        output
    }
}

/// `message += "text"` appends to the in-memory text and mirrors the same
/// bytes to every additional output, ignoring write errors.
impl AddAssign<&str> for RepairMessage {
    fn add_assign(&mut self, rhs: &str) {
        self.output.push_str(rhs);
        let bytes = rhs.as_bytes();
        for sink in &mut self.additional_outputs {
            let _ = sink.write_all(bytes);
        }
    }
}
145
146impl<T: DefaultOpenOptions<O>, O: OpenOptionsRepair> Repair<O> for T {
147    fn repair(path: impl AsRef<Path>) -> crate::Result<String> {
148        T::default_open_options().open_options_repair(path.as_ref())
149    }
150}
151
152pub(crate) fn open_with_repair<T>(opts: &T, path: &Path) -> crate::Result<T::Output>
153where
154    T: OpenOptionsOutput + OpenOptionsRepair,
155{
156    match opts.open_path(path) {
157        Ok(v) => Ok(v),
158        Err(e) if e.is_corruption() => {
159            // Check if it's safe to repair (no active readers).
160            static CHECK_READER_LOCK_OPTS: DirLockOptions = DirLockOptions {
161                exclusive: true,
162                non_blocking: true,
163                ..READER_LOCK_OPTS
164            };
165
166            let mut msg = RepairMessage::new(path);
167            msg += &format!("Corruption detected: {:?}.\n", &e);
168
169            let lock = match ScopedDirLock::new_with_options(path, &CHECK_READER_LOCK_OPTS) {
170                Ok(lock) => lock,
171                Err(lock_err) => {
172                    msg += "Auto-repair is skipped due to active readers.\n";
173                    let _ = msg.into_string();
174                    return Err(e.source(lock_err))
175                        .context(|| format!("in open_with_repair({:?})", path))
176                        .context("repair is skipped due to active readers");
177                }
178            };
179
180            // Release the lock. It prevents open, optionally used by repair.
181            // Without this it will deadlock.
182            // We don't need to prevent others from obtaining the reader lock
183            // to `open` because the `open` will fail anyway.
184            drop(lock);
185
186            msg += "Starting auto repair.\n";
187            let _ = msg.into_string();
188
189            // Repair and retry.
190            let repair_message = opts
191                .open_options_repair(path)
192                .context(|| format!("in open_with_repair({:?}), attempt to repair", path))?;
193            tracing::info!("Auto-repair {:?} Result:\n{}", path, &repair_message);
194            opts.open_path(path).context(|| {
195                format!(
196                    "in open_with_repair({:?}), after repair ({})",
197                    path, repair_message
198                )
199            })
200        }
201        Err(e) => Err(e),
202    }
203}
204
205impl<T> OpenWithRepair for T
206where
207    T: OpenOptionsOutput + OpenOptionsRepair,
208{
209    type Output = T::Output;
210
211    fn open_with_repair(&self, path: impl AsRef<Path>) -> crate::Result<Self::Output>
212    where
213        Self: Sized,
214    {
215        let path = path.as_ref();
216        open_with_repair(self, path)
217    }
218}