indexedlog/
repair.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8use std::fs;
9use std::io::Write;
10use std::ops::AddAssign;
11use std::path::Path;
12
13use crate::errors::ResultExt;
14use crate::lock::DirLockOptions;
15use crate::lock::ScopedDirLock;
16use crate::lock::READER_LOCK_OPTS;
17
18// Public interface -------------------------------------------------------
19
20/// Repair a structure at the given path.
21pub trait Repair<T> {
22    /// Repair a structure at the given path.
23    ///
24    /// Overload this method to repair recursively.
25    fn repair(path: impl AsRef<Path>) -> crate::Result<String>;
26}
27
28/// Repair on open.
29pub trait OpenWithRepair {
30    type Output;
31
32    /// Call `open`. If it fails with data corruption errors, try `repair`
33    /// once, then `open` again.
34    ///
35    /// This conveniently fixes a subset of corruptions usually caused by OS
36    /// crash or hard reboots. It does not fix corruptions that may occur during
37    /// data reading after `open`.
38    ///
39    /// For performance reasons, this does not perform a full verification
40    /// of all data and corruption can still happen when reading data.
41    ///
42    /// Repair is skipped if there are other readers for safety. This is
43    /// because indexedlog requires append-only for lock-free reads.
44    /// Repair is not append-only. It can silently cause other running
45    /// processes reading the data, or keeping the data previously read
46    /// to get silently wrong data without detection.
47    fn open_with_repair(&self, path: impl AsRef<Path>) -> crate::Result<Self::Output>
48    where
49        Self: Sized;
50}
51
/// A structure that has a static, default [`OpenOptions`].
///
/// Structures implementing this trait with `T` being `log::OpenOptions`
/// or `rotate::OpenOptions` get `Repair` implemented automatically.
pub trait DefaultOpenOptions<T> {
    /// Build the open options to use for this structure when no instance
    /// of the options is available (ex. for the static `Repair::repair`).
    fn default_open_options() -> T;
}
59
60// Private implementations ------------------------------------------------
61
62/// Repair defined on an instance. For example, `OpenOptions`.
63pub trait OpenOptionsRepair {
64    fn open_options_repair(&self, path: impl AsRef<Path>) -> crate::Result<String>;
65}
66
67/// Defines the output of OpenOptions.
68pub trait OpenOptionsOutput {
69    type Output;
70
71    fn open_path(&self, path: &Path) -> crate::Result<Self::Output>;
72}
73
/// Repair message as a string.
///
/// Text appended with `+=` is also mirrored, best-effort, to additional
/// outputs (ex. the `repair.log` file in the repaired directory).
pub(crate) struct RepairMessage {
    /// Accumulated message, exposed by `as_str` and `into_string`.
    output: String,
    /// Extra sinks that receive a copy of the message. Write errors on
    /// them are ignored.
    additional_outputs: Vec<Box<dyn Write>>,
}

impl RepairMessage {
    /// Creates the `RepairMessage`.
    ///
    /// Attempts to open `repair.log` in `dir` as an additional output.
    /// Failing to open or write that file is not fatal: the message is
    /// then only kept in memory.
    ///
    /// If the existing `repair.log` is larger than 1 MiB (ex. when repair
    /// is run in a loop), it is truncated and restarted with a short note.
    /// Otherwise new content is appended. In both cases a `date -d @<secs>`
    /// line with the current UNIX time is written first.
    pub(crate) fn new(dir: &Path) -> Self {
        const REPAIR_LOG_SIZE_LIMIT: u64 = 1 << 20;

        let path = dir.join("repair.log");
        // A missing or unreadable file simply means "nothing to truncate".
        let need_truncate = fs::metadata(&path)
            .map(|meta| meta.len() > REPAIR_LOG_SIZE_LIMIT)
            .unwrap_or(false);

        let mut opts = fs::OpenOptions::new();
        opts.write(true).create(true);
        if need_truncate {
            // Actually drop the old content. Opening with only `write`
            // would overwrite the first bytes in place and leave the
            // file oversized.
            opts.truncate(true);
        } else {
            opts.append(true);
        }

        let mut additional_outputs = Vec::new();
        if let Ok(mut file) = opts.open(path) {
            if need_truncate {
                let _ = file.write_all(b"# This file was truncated\n\n");
            }
            if let Ok(duration) = std::time::UNIX_EPOCH.elapsed() {
                let msg = format!("date -d @{}\n", duration.as_secs());
                let _ = file.write_all(msg.as_bytes());
            }
            additional_outputs.push(Box::new(file) as Box<dyn Write>);
        }

        Self {
            output: String::new(),
            additional_outputs,
        }
    }

    /// Borrow the message accumulated so far.
    pub(crate) fn as_str(&self) -> &str {
        self.output.as_str()
    }

    /// Finish the message and return it.
    ///
    /// Each additional output gets a trailing newline and is flushed
    /// (best-effort) before being dropped together with `self`.
    pub(crate) fn into_string(mut self) -> String {
        for out in self.additional_outputs.iter_mut() {
            let _ = out.write_all(b"\n");
            let _ = out.flush();
        }
        self.output
    }
}
132
133impl AddAssign<&str> for RepairMessage {
134    fn add_assign(&mut self, rhs: &str) {
135        self.output += rhs;
136        for out in self.additional_outputs.iter_mut() {
137            let _ = out.write_all(rhs.as_bytes());
138        }
139    }
140}
141
142impl<T: DefaultOpenOptions<O>, O: OpenOptionsRepair> Repair<O> for T {
143    fn repair(path: impl AsRef<Path>) -> crate::Result<String> {
144        T::default_open_options().open_options_repair(path.as_ref())
145    }
146}
147
148pub(crate) fn open_with_repair<T>(opts: &T, path: &Path) -> crate::Result<T::Output>
149where
150    T: OpenOptionsOutput + OpenOptionsRepair,
151{
152    match opts.open_path(path) {
153        Ok(v) => Ok(v),
154        Err(e) if e.is_corruption() => {
155            // Check if it's safe to repair (no active readers).
156            static CHECK_READER_LOCK_OPTS: DirLockOptions = DirLockOptions {
157                exclusive: true,
158                non_blocking: true,
159                ..READER_LOCK_OPTS
160            };
161
162            let mut msg = RepairMessage::new(path);
163            msg += &format!("Corruption detected: {:?}.\n", &e);
164
165            let lock = match ScopedDirLock::new_with_options(path, &CHECK_READER_LOCK_OPTS) {
166                Ok(lock) => lock,
167                Err(lock_err) => {
168                    msg += &"Auto-repair is skipped due to active readers.\n";
169                    let _ = msg.into_string();
170                    return Err(e.source(lock_err))
171                        .context(|| format!("in open_with_repair({:?})", path))
172                        .context("repair is skipped due to active readers");
173                }
174            };
175
176            // Release the lock. It prevents open, optionally used by repair.
177            // Without this it will deadlock.
178            // We don't need to prevent others from obtaining the reader lock
179            // to `open` because the `open` will fail anyway.
180            drop(lock);
181
182            msg += &"Starting auto repair.\n";
183            let _ = msg.into_string();
184
185            // Repair and retry.
186            let repair_message = opts
187                .open_options_repair(path)
188                .context(|| format!("in open_with_repair({:?}), attempt to repair", path))?;
189            tracing::info!("Auto-repair {:?} Result:\n{}", path, &repair_message);
190            return opts.open_path(path).context(|| {
191                format!(
192                    "in open_with_repair({:?}), after repair ({})",
193                    path, repair_message
194                )
195            });
196        }
197        Err(e) => Err(e),
198    }
199}
200
201impl<T> OpenWithRepair for T
202where
203    T: OpenOptionsOutput + OpenOptionsRepair,
204{
205    type Output = T::Output;
206
207    fn open_with_repair(&self, path: impl AsRef<Path>) -> crate::Result<Self::Output>
208    where
209        Self: Sized,
210    {
211        let path = path.as_ref();
212        open_with_repair(self, path)
213    }
214}