Skip to main content

defect_tools/fs/
local_backend.rs

1//! [`LocalFsBackend`]: a direct-to-disk [`FsBackend`] implementation.
2//!
3//! Local filesystem backend — implements two key invariants:
4//! - **Line-ending normalization**: when writing to an existing file, normalizes
5//!   new content to match the file's dominant line ending (CRLF / LF), avoiding mixed
6//!   line endings.
7//! - **Atomic writes**: performs full overwrites via a temporary file + `rename`,
8//!   preventing partial files.
9//!
10//! Path validation is delegated to [`defect_agent::fs::resolve_workspace_path`] —
11//! `LocalFsBackend` and `AcpFsBackend` share the same function.
12
13use std::borrow::Cow;
14use std::io;
15use std::path::{Path, PathBuf};
16use std::sync::atomic::{AtomicU64, Ordering};
17use tokio::fs;
18
19use defect_agent::error::BoxError;
20use defect_agent::fs::{Fingerprint, FsBackend, FsError, resolve_workspace_path};
21use futures::future::BoxFuture;
22
/// Hard upper bound for single-file size (shared by read and write).
///
/// 10 MiB, expressed in bytes.
pub const MAX_FS_BYTES: u64 = 10 * 1024 * 1024;

/// Monotonic in-process counter mixed into the temporary file name used by the
/// `tmp + rename` write path, so that concurrent writes to the same target within this
/// process never pick the same temporary file. It only makes the temporary names
/// unique; it does not serialize or otherwise prevent the concurrent writes themselves.
static TMP_NONCE: AtomicU64 = AtomicU64::new(0);
29
/// A disk-backed [`FsBackend`] implementation.
///
/// Holds the session's workspace root; all reads and writes are first validated by
/// [`resolve_workspace_path`].
pub struct LocalFsBackend {
    /// Root directory handed to [`resolve_workspace_path`] for every request.
    workspace_root: PathBuf,
}

impl LocalFsBackend {
    /// Creates a backend rooted at `workspace_root`.
    ///
    /// The path is stored as given; this constructor performs no I/O and no validation.
    pub fn new(workspace_root: PathBuf) -> Self {
        Self { workspace_root }
    }

    /// Returns the workspace root this backend was constructed with.
    pub fn workspace_root(&self) -> &Path {
        &self.workspace_root
    }
}
47
48impl FsBackend for LocalFsBackend {
49    fn read_text(
50        &self,
51        path: PathBuf,
52        line: Option<u32>,
53        limit: Option<u32>,
54    ) -> BoxFuture<'_, Result<String, FsError>> {
55        Box::pin(async move {
56            let abs = resolve_workspace_path(&self.workspace_root, &path)?;
57
58            let metadata = fs::metadata(&abs).await.map_err(|e| match e.kind() {
59                io::ErrorKind::NotFound => FsError::NotFound(abs.clone()),
60                _ => FsError::Backend(BoxError::new(e)),
61            })?;
62
63            // Full reads are blocked by a hard size limit. Windowed reads (when `line` or
64            // `limit` is `Some`) use a chunked-read path that streams line by line,
65            // buffering only the requested window. This implements large-file
66            // pagination, allowing the LLM to navigate log/data files larger than
67            // 10 MiB via offset/limit without exceeding the overall memory budget.
68            let windowed = line.is_some() || limit.is_some();
69            if !windowed && metadata.len() > MAX_FS_BYTES {
70                return Err(FsError::TooLarge {
71                    bytes: metadata.len(),
72                    limit: MAX_FS_BYTES,
73                });
74            }
75
76            if windowed {
77                return read_window_streaming(&abs, line, limit).await;
78            }
79
80            let bytes = fs::read(&abs).await.map_err(|e| match e.kind() {
81                io::ErrorKind::NotFound => FsError::NotFound(abs.clone()),
82                _ => FsError::Backend(BoxError::new(e)),
83            })?;
84
85            if looks_binary(&bytes) {
86                return Err(FsError::NotPermitted(format!(
87                    "binary file: {}",
88                    abs.display()
89                )));
90            }
91
92            let text = String::from_utf8(bytes)
93                .map_err(|e| FsError::NotPermitted(format!("file is not valid UTF-8: {e}")))?;
94
95            Ok(slice_lines(&text, line, limit))
96        })
97    }
98
99    fn read_bytes(&self, path: PathBuf) -> BoxFuture<'_, Result<Vec<u8>, FsError>> {
100        Box::pin(async move {
101            let abs = resolve_workspace_path(&self.workspace_root, &path)?;
102
103            let metadata = fs::metadata(&abs).await.map_err(|e| match e.kind() {
104                io::ErrorKind::NotFound => FsError::NotFound(abs.clone()),
105                _ => FsError::Backend(BoxError::new(e)),
106            })?;
107            if metadata.len() > MAX_FS_BYTES {
108                return Err(FsError::TooLarge {
109                    bytes: metadata.len(),
110                    limit: MAX_FS_BYTES,
111                });
112            }
113
114            fs::read(&abs).await.map_err(|e| match e.kind() {
115                io::ErrorKind::NotFound => FsError::NotFound(abs.clone()),
116                _ => FsError::Backend(BoxError::new(e)),
117            })
118        })
119    }
120
121    /// Use mtime + size as the fingerprint — much cheaper than the default "read entire
122    /// file + hash" approach, and sufficient for conflict detection semantics: a
123    /// change in mtime or size is treated as a conflict.
124    fn fingerprint(&self, path: PathBuf) -> BoxFuture<'_, Result<Fingerprint, FsError>> {
125        Box::pin(async move {
126            let abs = resolve_workspace_path(&self.workspace_root, &path)?;
127            let metadata = fs::metadata(&abs).await.map_err(|e| match e.kind() {
128                io::ErrorKind::NotFound => FsError::NotFound(abs.clone()),
129                _ => FsError::Backend(BoxError::new(e)),
130            })?;
131
132            let size = metadata.len();
133            let mtime_nanos = metadata
134                .modified()
135                .ok()
136                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
137                .map(|d| d.as_nanos() as u64)
138                .unwrap_or(0);
139
140            // Store `mtime_nanos` in the `hash` field and `size` in the `bytes` field —
141            // [`Fingerprint`] equality compares the two fields directly as `(size,
142            // mtime)`.
143            Ok(Fingerprint {
144                bytes: size,
145                hash: mtime_nanos,
146            })
147        })
148    }
149
150    fn write_text(&self, path: PathBuf, content: String) -> BoxFuture<'_, Result<(), FsError>> {
151        Box::pin(async move {
152            let abs = resolve_workspace_path(&self.workspace_root, &path)?;
153
154            if content.len() as u64 > MAX_FS_BYTES {
155                return Err(FsError::TooLarge {
156                    bytes: content.len() as u64,
157                    limit: MAX_FS_BYTES,
158                });
159            }
160
161            // Normalize line endings: only normalize when the file already exists, using
162            // its existing line-ending convention; for new files, preserve the original
163            // line endings from the LLM output.
164            let final_content: Cow<'_, str> = match tokio::fs::read(&abs).await {
165                Ok(prev_bytes) => {
166                    let prev = String::from_utf8_lossy(&prev_bytes);
167                    let target = detect_line_ending(&prev);
168                    normalize(&content, target)
169                }
170                Err(e) if e.kind() == io::ErrorKind::NotFound => Cow::Borrowed(content.as_str()),
171                Err(e) => return Err(FsError::Backend(BoxError::new(e))),
172            };
173
174            atomic_write(&abs, final_content.as_bytes())
175                .await
176                .map_err(|e| FsError::Backend(BoxError::new(e)))?;
177
178            Ok(())
179        })
180    }
181}
182
183/// Streaming read window: scans the file line by line, accumulating content only within
184/// the range [start, start+take).
185///
186/// Unlike [`slice_lines`], which requires the entire file to be in memory as a
187/// [`String`], this approach uses `BufReader::read_line` and discards skipped lines
188/// without counting them toward the byte budget. This means even files far exceeding
189/// [`MAX_FS_BYTES`] won't cause memory issues as long as `limit` is tight enough.
190///
191/// Binary heuristic: rejects the file if a NUL byte is encountered during scanning,
192/// matching the semantics of the full-path [`looks_binary`].
193async fn read_window_streaming(
194    path: &Path,
195    line: Option<u32>,
196    limit: Option<u32>,
197) -> Result<String, FsError> {
198    use tokio::io::AsyncBufReadExt;
199
200    let file = tokio::fs::File::open(path)
201        .await
202        .map_err(|e| match e.kind() {
203            io::ErrorKind::NotFound => FsError::NotFound(path.to_path_buf()),
204            _ => FsError::Backend(BoxError::new(e)),
205        })?;
206    let mut reader = tokio::io::BufReader::new(file);
207
208    let start = line.unwrap_or(1).max(1) as usize - 1;
209    let take = limit.unwrap_or(u32::MAX) as usize;
210
211    let mut buf = Vec::new();
212    let mut out = String::new();
213    let mut idx: usize = 0;
214    let mut accepted: usize = 0;
215    let mut total_window_bytes: u64 = 0;
216
217    while accepted < take {
218        buf.clear();
219        let n = reader
220            .read_until(b'\n', &mut buf)
221            .await
222            .map_err(|e| FsError::Backend(BoxError::new(e)))?;
223        if n == 0 {
224            break; // EOF
225        }
226        if buf.contains(&0u8) {
227            return Err(FsError::NotPermitted(format!(
228                "binary file: {}",
229                path.display()
230            )));
231        }
232
233        if idx >= start {
234            // Only accumulate lines within the window; reject if they exceed
235            // `MAX_FS_BYTES` to prevent a single window from exhausting memory. The
236            // window size is determined by the LLM-chosen `limit`; when the threshold is
237            // hit, return `TooLarge` so the caller can retry with a smaller `limit`.
238            total_window_bytes = total_window_bytes.saturating_add(n as u64);
239            if total_window_bytes > MAX_FS_BYTES {
240                return Err(FsError::TooLarge {
241                    bytes: total_window_bytes,
242                    limit: MAX_FS_BYTES,
243                });
244            }
245            let chunk = std::str::from_utf8(&buf)
246                .map_err(|e| FsError::NotPermitted(format!("file is not valid UTF-8: {e}")))?;
247            out.push_str(chunk);
248            accepted += 1;
249        }
250        idx += 1;
251    }
252
253    Ok(out)
254}
255
/// Line-ending convention of a text file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LineEnding {
    /// Unix style, `\n`.
    Lf,
    /// Windows style, `\r\n`.
    Crlf,
}

/// Determines the dominant line ending of `text`.
///
/// CRLF wins only when `\r\n` sequences strictly outnumber bare `\n`; ties and text
/// without any line break resolve to LF.
fn detect_line_ending(text: &str) -> LineEnding {
    let crlf_count = text.matches("\r\n").count();
    // Every CRLF also contains an LF, so subtract those to get the bare ones.
    let bare_lf_count = text.matches('\n').count().saturating_sub(crlf_count);

    if crlf_count <= bare_lf_count {
        return LineEnding::Lf;
    }
    LineEnding::Crlf
}

/// Rewrites the line endings of `content` to `target`.
///
/// The LF target borrows the input when it contains no `\r\n`; the CRLF target always
/// returns an owned string.
fn normalize(content: &str, target: LineEnding) -> Cow<'_, str> {
    match target {
        LineEnding::Lf if !content.contains("\r\n") => Cow::Borrowed(content),
        LineEnding::Lf => Cow::Owned(content.replace("\r\n", "\n")),
        LineEnding::Crlf => {
            // Collapse to LF before expanding, so an existing "\r\n" is not turned
            // into "\r\r\n" by the second replacement.
            let unified = content.replace("\r\n", "\n");
            Cow::Owned(unified.replace('\n', "\r\n"))
        }
    }
}
290
/// Heuristically decides whether `bytes` is binary data.
///
/// Only the first 8 KiB are inspected. The sample counts as binary when it contains a
/// NUL byte, or when more than 30% of it consists of control bytes other than
/// `0x09..=0x0d` (tab, line feed, vertical tab, form feed, carriage return). Empty
/// input is never binary.
fn looks_binary(bytes: &[u8]) -> bool {
    const SNIFF_LEN: usize = 8 * 1024;

    let sample = &bytes[..bytes.len().min(SNIFF_LEN)];
    if sample.is_empty() {
        return false;
    }

    let mut control_bytes = 0usize;
    for &byte in sample {
        match byte {
            0x00 => return true,
            0x01..=0x08 | 0x0e..=0x1f => control_bytes += 1,
            _ => {}
        }
    }

    control_bytes * 100 / sample.len() > 30
}
307
/// Slices the text by `line` (1-based) and `limit`. Returns the full text when both are
/// `None`.
///
/// * `line` — first line to include; `None` and `0` both mean the first line.
/// * `limit` — maximum number of lines to include; `None` means no line limit.
///
/// Lines keep their terminators (`\n` or `\r\n`). A window starting past the end of the
/// text yields an empty string.
fn slice_lines(text: &str, line: Option<u32>, limit: Option<u32>) -> String {
    if line.is_none() && limit.is_none() {
        return text.to_string();
    }
    let start = line.unwrap_or(1).max(1) as usize - 1;
    let take = limit.unwrap_or(u32::MAX) as usize;

    // `skip` + `take` avoids computing `start + take`, which can overflow `usize` on
    // 32-bit targets when both values are close to `u32::MAX`.
    text.split_inclusive('\n').skip(start).take(take).collect()
}
328
329/// Atomic write via `tmp + rename`. The temporary file is placed in the same parent
330/// directory to avoid cross-device renames. The parent directory is created automatically
331/// if it does not exist (`mkdir -p`).
332async fn atomic_write(path: &Path, bytes: &[u8]) -> io::Result<()> {
333    let parent = path
334        .parent()
335        .ok_or_else(|| io::Error::other("path has no parent"))?;
336    tokio::fs::create_dir_all(parent).await?;
337    let file_name = path
338        .file_name()
339        .ok_or_else(|| io::Error::other("path has no file component"))?;
340    let nonce = TMP_NONCE.fetch_add(1, Ordering::Relaxed);
341    let pid = std::process::id();
342    let tmp_path = parent.join(format!(
343        ".{}.defect-{pid}-{nonce}.tmp",
344        file_name.to_string_lossy()
345    ));
346
347    // RAII: automatically removes tmp on error paths to avoid leftover files.
348    let cleanup = TmpCleanup {
349        path: Some(tmp_path.clone()),
350    };
351    tokio::fs::write(&tmp_path, bytes).await?;
352    tokio::fs::rename(&tmp_path, path).await?;
353    cleanup.disarm();
354    Ok(())
355}
356
/// Drop guard for a temporary file: unless disarmed, the file at `path` is deleted when
/// the guard goes out of scope.
struct TmpCleanup {
    /// File to delete on drop; `None` once the guard has been disarmed.
    path: Option<PathBuf>,
}

impl TmpCleanup {
    /// Consumes the guard without deleting the file.
    fn disarm(mut self) {
        // Clearing the path turns the `Drop` that runs at the end of this call into
        // a no-op.
        drop(self.path.take());
    }
}

impl Drop for TmpCleanup {
    fn drop(&mut self) {
        let Some(leftover) = self.path.take() else {
            return;
        };
        // Best-effort: leaving a .tmp file is better than leaving a partial target
        // file, so a failed removal is deliberately ignored.
        let _ = std::fs::remove_file(&leftover);
    }
}
376
377#[cfg(test)]
378mod tests;