Skip to main content

devboy_assets/
cache.rs

1//! Filesystem-level cache manager.
2//!
3//! Responsibilities:
4//!
5//! - Deterministically map [`AssetContext`] values to on-disk paths
6//! - Store / load / delete file blobs
7//! - Compute SHA-256 checksums
8//!
9//! The cache manager is unaware of the index — the higher-level
10//! [`crate::manager::AssetManager`] combines the two. This split keeps the
11//! filesystem concerns testable in isolation.
12
13use devboy_core::asset::AssetContext;
14use sha2::{Digest, Sha256};
15use std::io::Write as _;
16use std::path::{Path, PathBuf};
17
18use crate::error::{AssetError, Result};
19
/// Directory name used under the cache root for issue attachments
/// (`AssetContext::Issue`).
pub const DIR_ISSUES: &str = "issues";
/// Directory name used for issue comment attachments
/// (`AssetContext::IssueComment`).
pub const DIR_ISSUE_COMMENTS: &str = "issue-comments";
/// Directory name used for merge request attachments
/// (`AssetContext::MergeRequest`).
pub const DIR_MERGE_REQUESTS: &str = "merge-requests";
/// Directory name used for MR note/comment attachments
/// (`AssetContext::MrComment`).
pub const DIR_MR_COMMENTS: &str = "mr-comments";
/// Directory name used for messenger chat attachments
/// (`AssetContext::Chat`).
pub const DIR_CHATS: &str = "chats";
/// Directory name used for knowledge base attachments
/// (`AssetContext::KbPage`).
pub const DIR_KB: &str = "kb";
32
/// Maximum length, in bytes, of the sanitized asset ID component in a
/// cache filename.
///
/// The leaf layout is `{safe_id}-{8_hash}-{safe_name}`, so the longest
/// possible leaf is `MAX_ID_LEN + 8 + MAX_NAME_LEN + 2` (two dashes)
/// = 210 bytes, which stays under the 255-byte filesystem limit.
const MAX_ID_LEN: usize = 80;

/// Maximum length, in bytes, of the sanitized filename component.
const MAX_NAME_LEN: usize = 120;
42
/// Manages the physical cache directory layout and file I/O.
///
/// The only state is the cache root path; every path handed out by the
/// manager is built by joining onto it.
#[derive(Debug, Clone)]
pub struct CacheManager {
    /// Directory under which all per-context subdirectories are created.
    root: PathBuf,
}
48
49impl CacheManager {
50    /// Create a new manager rooted at `root`. The directory is created if
51    /// it does not already exist.
52    pub fn new(root: PathBuf) -> Result<Self> {
53        std::fs::create_dir_all(&root)?;
54        Ok(Self { root })
55    }
56
57    /// Absolute path to the cache root.
58    pub fn root(&self) -> &Path {
59        &self.root
60    }
61
62    /// Compute the on-disk path for an asset given its context and filename.
63    ///
64    /// Layout:
65    /// ```text
66    /// {root}/{context_dir}/{context_id}/{asset_id}-{safe_filename}
67    /// ```
68    ///
69    /// Both `asset_id` and `filename` are sanitized before becoming a single
70    /// path component — any directory separators or `..` sequences in the
71    /// inputs are replaced with `_`, so calling `path_for` with hostile
72    /// input can never escape the per-context directory.
73    ///
74    /// An 8-character SHA-256 prefix of the raw asset_id is embedded in
75    /// the filename to avoid collisions between IDs that differ only in
76    /// characters collapsed by sanitization. 8 hex chars = 32 bits gives
77    /// ~4 billion buckets — collision probability via birthday paradox is
78    /// negligible for a rotated local cache (< 0.0001% with 100 files per
79    /// context). We intentionally keep the hash short to stay well within
80    /// the 255-char filename limit on ext4 / NTFS / APFS.
81    pub fn path_for(&self, context: &AssetContext, asset_id: &str, filename: &str) -> PathBuf {
82        let safe_id = truncate_component(&sanitize_component(asset_id), MAX_ID_LEN);
83        let safe_name = truncate_component(&sanitize_filename(filename), MAX_NAME_LEN);
84        // Append a short hash of the *raw* (pre-sanitization) asset_id so
85        // that two IDs differing only in characters collapsed by
86        // sanitization (e.g. `a/b` → `a_b` vs `a?b` → `a_b`) never map
87        // to the same on-disk path.
88        let id_hash = &sha256_hex(asset_id.as_bytes())[..8];
89        let leaf = format!("{safe_id}-{id_hash}-{safe_name}");
90        let dir = self.dir_for(context);
91        dir.join(leaf)
92    }
93
94    /// Directory for a given context (relative to the cache root, joined).
95    pub fn dir_for(&self, context: &AssetContext) -> PathBuf {
96        match context {
97            AssetContext::Issue { key } => self.root.join(DIR_ISSUES).join(sanitize_key(key)),
98            AssetContext::IssueComment { key, comment_id } => self
99                .root
100                .join(DIR_ISSUE_COMMENTS)
101                .join(sanitize_key(key))
102                .join(sanitize_key(comment_id)),
103            AssetContext::MergeRequest { mr_id } => {
104                self.root.join(DIR_MERGE_REQUESTS).join(sanitize_key(mr_id))
105            }
106            AssetContext::MrComment { mr_id, note_id } => self
107                .root
108                .join(DIR_MR_COMMENTS)
109                .join(sanitize_key(mr_id))
110                .join(sanitize_key(note_id)),
111            AssetContext::Chat {
112                chat_id,
113                message_id,
114            } => self
115                .root
116                .join(DIR_CHATS)
117                .join(sanitize_key(chat_id))
118                .join(sanitize_key(message_id)),
119            AssetContext::KbPage { page_id } => self.root.join(DIR_KB).join(sanitize_key(page_id)),
120        }
121    }
122
123    /// Store bytes for an asset and return the absolute path where they were
124    /// written along with the SHA-256 checksum.
125    ///
126    /// Parent directories are created as needed. Writes go through a temp
127    /// file + rename so partial writes are never observable.
128    pub fn store(
129        &self,
130        context: &AssetContext,
131        asset_id: &str,
132        filename: &str,
133        data: &[u8],
134    ) -> Result<StoredFile> {
135        let path = self.path_for(context, asset_id, filename);
136        let parent = path
137            .parent()
138            .ok_or_else(|| AssetError::cache_dir(format!("no parent for {path:?}")))?;
139        std::fs::create_dir_all(parent)?;
140
141        let mut tmp = tempfile::NamedTempFile::new_in(parent)
142            .map_err(|e| AssetError::cache_dir(format!("temp file: {e}")))?;
143        tmp.write_all(data)?;
144        tmp.flush()?;
145        tmp.persist(&path)
146            .map_err(|e| AssetError::cache_dir(format!("persist file: {e}")))?;
147
148        let checksum = sha256_hex(data);
149
150        Ok(StoredFile {
151            path,
152            size: data.len() as u64,
153            checksum_sha256: checksum,
154        })
155    }
156
157    /// Read a file from the cache by absolute path. Returns `NotFound`
158    /// if the file is missing (via [`AssetError::Io`]).
159    pub fn load(&self, absolute: &Path) -> Result<Vec<u8>> {
160        Ok(std::fs::read(absolute)?)
161    }
162
163    /// Delete a file from the cache. Missing files are treated as success
164    /// so that retries / idempotent deletes work as expected.
165    pub fn delete(&self, absolute: &Path) -> Result<()> {
166        match std::fs::remove_file(absolute) {
167            Ok(()) => Ok(()),
168            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
169            Err(e) => Err(AssetError::Io(e)),
170        }
171    }
172
173    /// Check whether a file exists in the cache.
174    pub fn exists(&self, absolute: &Path) -> bool {
175        absolute.is_file()
176    }
177}
178
/// Metadata returned from [`CacheManager::store`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StoredFile {
    /// Path where the file was written (the cache root joined with the
    /// per-context directory and the generated leaf name).
    pub path: PathBuf,
    /// Number of bytes written, i.e. the length of the stored data.
    pub size: u64,
    /// SHA-256 checksum of the stored data in lower-case hex.
    pub checksum_sha256: String,
}
188
/// Validate that a cached-asset `local_path` stays under `root` and return
/// the joined path on success.
///
/// The index is trusted to hold **relative** paths produced by
/// [`CacheManager::store`]. This helper defends against corrupted or
/// tampered `index.json` entries that try to point elsewhere:
///
/// - Absolute paths are rejected (because `PathBuf::join` would discard
///   `root` for any absolute RHS).
/// - Paths containing `..`, a root directory or a drive/UNC prefix are
///   rejected — we never generate them, so anything like that came from
///   outside the crate.
/// - Lexical containment: the joined path must start with `root`.
/// - **Symlink guard**: the deepest component of the joined path that
///   actually exists on disk (found without following a final symlink)
///   is canonicalized and must still lie under the canonicalized `root`.
///   Checking the deepest *existing* ancestor rather than only the full
///   target also covers a not-yet-existing leaf behind a symlinked
///   directory, and a dangling symlink — both would otherwise let a later
///   write land outside the cache. If that component cannot be
///   canonicalized the path is rejected.
///
/// When `root` itself cannot be canonicalized (typically because it does
/// not exist) nothing beneath it is reachable, so only the lexical checks
/// apply and stale entries still resolve.
///
/// Returns `None` when the path is unsafe; callers drop the index entry
/// instead of touching the filesystem.
pub fn resolve_under_root(root: &Path, relative: &Path) -> Option<PathBuf> {
    use std::path::Component;

    if relative.is_absolute() {
        return None;
    }
    let escapes_lexically = relative.components().any(|component| {
        matches!(
            component,
            Component::ParentDir | Component::Prefix(_) | Component::RootDir
        )
    });
    if escapes_lexically {
        return None;
    }

    let joined = root.join(relative);
    // Component-wise prefix check; cheap and works for paths that do not
    // exist (stale entries).
    if !joined.starts_with(root) {
        return None;
    }

    let canon_root = match root.canonicalize() {
        Ok(path) => path,
        // Root is missing or unreachable, so there is nothing on disk that
        // could redirect the path; the lexical result stands.
        Err(_) => return Some(joined),
    };

    // Deepest existing entry at or below root. `symlink_metadata` does not
    // follow the final component, so a dangling symlink still counts as
    // existing and gets rejected by the canonicalize step below.
    let anchor = joined
        .ancestors()
        .take_while(|candidate| candidate.starts_with(root))
        .find(|candidate| candidate.symlink_metadata().is_ok());

    match anchor {
        // Nothing exists (root vanished in the meantime): nothing to follow.
        None => Some(joined),
        Some(existing) => match existing.canonicalize() {
            Ok(canon) if canon.starts_with(&canon_root) => Some(joined),
            // Resolved outside the root, or could not be resolved at all.
            _ => None,
        },
    }
}
249
250/// Compute SHA-256 of a byte slice, returned as lower-case hex.
251pub fn sha256_hex(data: &[u8]) -> String {
252    let mut hasher = Sha256::new();
253    hasher.update(data);
254    let digest = hasher.finalize();
255    let mut out = String::with_capacity(digest.len() * 2);
256    for byte in digest {
257        use std::fmt::Write as _;
258        // `let _ =` is intentional: `<String as fmt::Write>::write_fmt` is
259        // infallible — its `write_str` impl is just `self.push_str(s); Ok(())`
260        // (see https://doc.rust-lang.org/std/string/struct.String.html#impl-Write-for-String).
261        // The only theoretical failure is OOM, which aborts the process
262        // rather than returning `Err`. We suppress the `#[must_use]` lint
263        // with `let _ =` instead of `.unwrap()` to avoid emitting a dead
264        // panic path for an unreachable case.
265        let _ = write!(out, "{byte:02x}");
266    }
267    out
268}
269
270/// Restrict a filename to characters that are safe to write on any FS.
271///
272/// The input is first stripped of anything before the final `/` or `\` to
273/// prevent traversal via `../` or Windows `..\\`. The remaining basename is
274/// then passed through [`sanitize_component`] so the result is always a
275/// single, FS-safe path component.
276fn sanitize_filename(name: &str) -> String {
277    let trimmed = name.trim();
278    let after_fwd = trimmed.rsplit('/').next().unwrap_or(trimmed);
279    let base = after_fwd.rsplit('\\').next().unwrap_or(after_fwd);
280    sanitize_component(base)
281}
282
/// Turn an arbitrary string into a single, safe path component.
///
/// Shared by filenames and opaque identifiers (asset ids, context keys).
/// After trimming surrounding whitespace:
///
/// - ASCII letters, ASCII digits, `.`, `-` and `_` are kept as they are.
/// - Every other character — `/`, `\`, spaces, any non-ASCII character —
///   becomes one `_`.
/// - A result made only of dots (`.`, `..`, `...`) would be read as the
///   current/parent directory, so each dot is turned into `_`. Dots that
///   are part of a longer name are left alone.
/// - Empty or whitespace-only input yields the sentinel `"unnamed"`.
fn sanitize_component(value: &str) -> String {
    let cleaned: String = value
        .trim()
        .chars()
        .map(|c| match c {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '.' | '-' | '_' => c,
            _ => '_',
        })
        .collect();

    if cleaned.is_empty() {
        return "unnamed".to_string();
    }
    // Non-empty and dots only: swap every dot for an underscore.
    if cleaned.bytes().all(|b| b == b'.') {
        return "_".repeat(cleaned.len());
    }
    cleaned
}
316
/// Sanitize a context key (issue key, MR id, chat id, …) into a single
/// directory name.
///
/// Applies exactly the rules of [`sanitize_component`]; the separate name
/// only documents intent at call sites.
fn sanitize_key(key: &str) -> String {
    sanitize_component(key)
}
321
/// Shorten `s` to at most `max_len` bytes without splitting a character.
///
/// Strings that already fit are returned unchanged. Otherwise the cut is
/// placed at the last char boundary at or before `max_len`, so the result
/// may be a few bytes shorter than `max_len` when a multi-byte character
/// straddles the limit.
fn truncate_component(s: &str, max_len: usize) -> String {
    if s.len() <= max_len {
        return s.to_owned();
    }
    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    s[..cut].to_owned()
}
334
/// Unit tests for the cache layout, sanitization helpers and path
/// validation. Every filesystem test works inside its own temporary
/// directory.
#[cfg(test)]
mod tests {
    use super::*;
    use devboy_core::asset::AssetContext;
    use tempfile::tempdir;

    #[test]
    fn sha256_matches_known_vector() {
        // Well-known test vector for an empty input.
        assert_eq!(
            sha256_hex(b""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
        assert_eq!(
            sha256_hex(b"abc"),
            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
        );
    }

    #[test]
    fn sanitize_strips_traversal_and_bad_chars() {
        assert_eq!(sanitize_filename("../../etc/passwd"), "passwd");
        assert_eq!(sanitize_filename("hello world!.png"), "hello_world_.png");
        assert_eq!(sanitize_filename("/"), "unnamed");
        assert_eq!(sanitize_filename("привет.txt"), "______.txt");
    }

    #[test]
    fn sanitize_handles_windows_separators() {
        assert_eq!(
            sanitize_filename("..\\..\\Windows\\System32\\cmd.exe"),
            "cmd.exe",
        );
    }

    #[test]
    fn sanitize_neutralizes_dot_only_names() {
        assert_eq!(sanitize_component(".."), "__");
        assert_eq!(sanitize_component("..."), "___");
        assert_eq!(sanitize_component("."), "_");
    }

    #[test]
    fn truncate_component_respects_char_boundaries() {
        // Fits already: returned unchanged.
        assert_eq!(truncate_component("hello", 5), "hello");
        assert_eq!(truncate_component("hello", 3), "hel");
        // `é` occupies bytes 1..3, so a cut at byte 2 must back off to 1.
        assert_eq!(truncate_component("héllo", 2), "h");
        assert_eq!(truncate_component("héllo", 3), "hé");
        assert_eq!(truncate_component("hello", 0), "");
    }

    #[test]
    fn path_for_blocks_asset_id_traversal() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();
        let ctx = AssetContext::Issue { key: "k".into() };

        // Hostile asset id trying to escape the issue directory.
        let path = cache.path_for(&ctx, "../../escape", "file.txt");
        let rel = path.strip_prefix(tmp.path()).unwrap();
        let components: Vec<_> = rel
            .components()
            .map(|c| c.as_os_str().to_string_lossy().into_owned())
            .collect();
        // The hostile id becomes a single sanitized segment; it never introduces
        // a `..` component.
        assert!(
            !components.iter().any(|c| c == ".." || c.contains('/')),
            "unexpected components: {components:?}",
        );
        assert!(path.starts_with(tmp.path()));
    }

    #[test]
    fn store_with_hostile_ids_stays_under_cache_root() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();
        let ctx = AssetContext::Issue {
            key: "../../root".into(),
        };

        let stored = cache
            .store(&ctx, "../../../etc", "../passwd", b"secret")
            .unwrap();
        assert!(
            stored.path.starts_with(tmp.path()),
            "path escaped cache root: {:?}",
            stored.path
        );
    }

    #[test]
    fn dir_for_layouts() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();

        let issue_dir = cache.dir_for(&AssetContext::Issue {
            key: "DEV-1".into(),
        });
        assert!(issue_dir.ends_with("issues/DEV-1"));

        let mr_dir = cache.dir_for(&AssetContext::MergeRequest { mr_id: "42".into() });
        assert!(mr_dir.ends_with("merge-requests/42"));

        let kb_dir = cache.dir_for(&AssetContext::KbPage {
            page_id: "p1".into(),
        });
        assert!(kb_dir.ends_with("kb/p1"));
    }

    #[test]
    fn store_load_delete_roundtrip() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();

        let ctx = AssetContext::Issue {
            key: "DEV-1".into(),
        };
        let payload = b"hello world";
        let stored = cache.store(&ctx, "asset-1", "hello.txt", payload).unwrap();

        assert_eq!(stored.size, payload.len() as u64);
        assert_eq!(stored.checksum_sha256, sha256_hex(payload));
        assert!(cache.exists(&stored.path));

        let loaded = cache.load(&stored.path).unwrap();
        assert_eq!(loaded, payload);

        cache.delete(&stored.path).unwrap();
        assert!(!cache.exists(&stored.path));

        // Second delete is a no-op, not an error.
        cache.delete(&stored.path).unwrap();
    }

    #[test]
    fn store_creates_nested_directories() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();

        let ctx = AssetContext::MrComment {
            mr_id: "42".into(),
            note_id: "7".into(),
        };
        let stored = cache.store(&ctx, "a1", "x.bin", b"x").unwrap();
        let rel = stored.path.strip_prefix(tmp.path()).unwrap();

        // Use path components so the assertion is agnostic to the OS path
        // separator (`/` on Unix, `\` on Windows).
        let components: Vec<_> = rel
            .components()
            .map(|c| c.as_os_str().to_string_lossy().into_owned())
            .collect();
        assert!(
            components
                .windows(3)
                .any(|w| w == ["mr-comments", "42", "7"]),
            "unexpected path components: {components:?}",
        );
    }

    #[test]
    fn store_rejects_nothing_and_handles_empty_file() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();
        let ctx = AssetContext::Issue { key: "k".into() };
        let stored = cache.store(&ctx, "id", "empty", &[]).unwrap();
        assert_eq!(stored.size, 0);
        assert_eq!(
            stored.checksum_sha256,
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    #[test]
    fn resolve_under_root_accepts_relative_paths() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        let rel = PathBuf::from("issues/DEV-1/screen.png");
        let abs = resolve_under_root(root, &rel).unwrap();
        assert!(abs.starts_with(root));
        assert!(abs.ends_with("issues/DEV-1/screen.png"));
    }

    #[test]
    fn resolve_under_root_rejects_absolute() {
        let tmp = tempdir().unwrap();
        let abs = PathBuf::from("/etc/passwd");
        assert!(resolve_under_root(tmp.path(), &abs).is_none());
    }

    #[test]
    fn resolve_under_root_rejects_parent_dir() {
        let tmp = tempdir().unwrap();
        let traversal = PathBuf::from("../../etc/passwd");
        assert!(resolve_under_root(tmp.path(), &traversal).is_none());

        let nested = PathBuf::from("issues/../../etc/passwd");
        assert!(resolve_under_root(tmp.path(), &nested).is_none());
    }

    #[test]
    fn resolve_under_root_accepts_empty_and_single_segment() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();

        // An empty relative path resolves to the root itself. `Path`
        // equality is component-wise, so a trailing separator added by
        // `join("")` does not matter.
        let resolved_empty = resolve_under_root(root, &PathBuf::new()).unwrap();
        assert_eq!(resolved_empty.as_path(), root);

        assert_eq!(
            resolve_under_root(root, &PathBuf::from("a.txt")).unwrap(),
            root.join("a.txt"),
        );
    }

    #[cfg(unix)]
    #[test]
    fn resolve_under_root_rejects_symlink_escape() {
        let cache_dir = tempdir().unwrap();
        let outside = tempdir().unwrap();
        std::fs::write(outside.path().join("secret.txt"), b"s").unwrap();
        // A symlink inside the cache that points at a directory outside it.
        std::os::unix::fs::symlink(outside.path(), cache_dir.path().join("link")).unwrap();

        let rel = PathBuf::from("link/secret.txt");
        assert!(
            resolve_under_root(cache_dir.path(), &rel).is_none(),
            "symlinked path must not resolve outside the cache root",
        );
    }

    #[test]
    fn path_for_prefixes_asset_id_and_hash() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();
        let ctx = AssetContext::Issue { key: "k".into() };
        let path = cache.path_for(&ctx, "abc123", "report.log");
        let leaf = path.file_name().unwrap().to_string_lossy();
        // Format: {sanitized_id}-{8-char hash}-{sanitized_filename}
        assert!(leaf.starts_with("abc123-"), "unexpected leaf: {leaf}");
        assert!(leaf.ends_with("-report.log"), "unexpected leaf: {leaf}");
        // The hash is 8 hex chars between the id and filename.
        let parts: Vec<&str> = leaf.splitn(3, '-').collect();
        assert_eq!(parts.len(), 3);
        assert_eq!(parts[1].len(), 8, "hash should be 8 hex chars");
    }

    #[test]
    fn path_for_leaf_stays_within_length_budget() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();
        let ctx = AssetContext::Issue { key: "k".into() };
        // Both inputs are far longer than their caps.
        let path = cache.path_for(&ctx, &"a".repeat(500), &"b".repeat(500));
        let leaf = path.file_name().unwrap().to_string_lossy();
        // id + '-' + 8-char hash + '-' + name
        assert_eq!(leaf.len(), MAX_ID_LEN + 8 + MAX_NAME_LEN + 2);
        assert!(leaf.len() < 255);
    }

    #[test]
    fn path_for_avoids_collision_on_sanitized_ids() {
        let tmp = tempdir().unwrap();
        let cache = CacheManager::new(tmp.path().to_path_buf()).unwrap();
        let ctx = AssetContext::Issue { key: "k".into() };
        // These two IDs sanitize to the same string but differ pre-sanitization.
        let p1 = cache.path_for(&ctx, "a/b", "f.txt");
        let p2 = cache.path_for(&ctx, "a?b", "f.txt");
        assert_ne!(
            p1, p2,
            "different raw IDs must produce different paths even when sanitized form matches"
        );
    }
}