Skip to main content

dbmd_core/
assets.rs

1//! `assets` — the db.md asset layer.
2//!
3//! Raw binary assets (PDFs, recordings, large exports) belong to a store but
4//! are too heavy for Git. A content file (the **wrapper**) declares one via an
5//! `asset:` / `assets:` frontmatter key; this module records each in the
6//! root-level `assets.jsonl` manifest: store-relative path, SHA-256, size,
7//! media type, the declaring wrapper(s), and whether it is required for
8//! byte-completeness.
9//!
10//! The manifest is a **pure projection** of (wrappers + asset files on disk):
11//! every field is derivable, so a [`scan`] where the bytes are present
12//! reproduces it byte-for-byte, exactly like `index.jsonl`. db.md never
13//! transports the bytes and never names a storage provider; that is the
14//! VibeCraft layer's job, keyed off the SHA-256. This module never shells out
15//! to git and never touches the network.
16//!
17//! Four operations — one write, three reads:
18//!   - [`scan`]   (write) discover declared assets, hash present files, rewrite the manifest
19//!   - [`verify`] (read)  prove the local store is byte-complete for required assets
20//!   - [`status`] (read)  report present / missing without failing
21//!   - [`paths`]  (read)  the store-relative path list (for an ignore mechanism)
22//!
23//! Path safety: every declared path is validated store-relative (no `..`, no
24//! absolute, no escape) via [`crate::store::ensure_path_within_store`] wherever
25//! a path is read or resolved, so a poisoned manifest can never make `scan`
26//! hash, or a restore write, outside the store.
27
28use std::collections::{BTreeMap, BTreeSet};
29use std::fmt::Write as _;
30use std::io::Read as _;
31use std::path::{Component, Path};
32
33use serde::{Deserialize, Serialize};
34use serde_norway::Value;
35use sha2::{Digest, Sha256};
36
37use crate::parser;
38use crate::store::{self, Store};
39use crate::write_atomic;
40
/// The manifest file name at the store root. [`read_manifest`] and
/// [`write_manifest`] join it onto `store.root`, and [`scan`] echoes it in
/// [`ScanReport::manifest`].
pub const MANIFEST_FILE: &str = "assets.jsonl";
43
/// One asset record — one line of `assets.jsonl`.
///
/// Every field is derivable from the store (wrapper frontmatter + the file on
/// disk), so the manifest rebuilds byte-for-byte. Field declaration order is
/// the canonical JSON key order (do not reorder fields); `wrappers` is always
/// a sorted list (never a bare string) so serialization is deterministic.
///
/// `PartialEq`/`Eq` are derived so [`scan`] can compare a freshly built record
/// set against the one already on disk and skip the write when nothing changed.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AssetRecord {
    /// Store-relative path of the raw bytes, forward-slash, with extension. The
    /// record key: [`read_manifest`] dedupes on it and [`write_manifest`] sorts
    /// by it. It names the asset itself, not the `.md` wrapper(s) listed in
    /// `wrappers`.
    pub path: String,
    /// Lowercase-hex SHA-256 of the bytes: the integrity check and the provider
    /// blob key. May repeat across records (identical bytes at two paths).
    pub sha256: String,
    /// Size in bytes.
    pub bytes: u64,
    /// Best-effort MIME type derived from the path extension
    /// (`application/octet-stream` when the extension is unknown).
    pub media_type: String,
    /// Store-relative path(s) of the content file(s) that declare this asset,
    /// sorted ascending. Usually one.
    pub wrappers: Vec<String>,
    /// Whether the asset is required for byte-completeness (default `true`;
    /// `false` only when every declaration marks it optional).
    pub required: bool,
}
69
/// A single `asset:` / `assets:` declaration read from a wrapper's frontmatter.
///
/// This is the raw, un-normalized form: callers such as [`scan`] pass `path`
/// through [`normalize_asset_path`] before using it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Declaration {
    /// The raw store-relative path string as written in frontmatter (not yet
    /// trimmed or validated).
    pub path: String,
    /// Whether this declaration marks the asset required (bare string and
    /// object-without-`required` default to `true`).
    pub required: bool,
}
79
80// ─────────────────────────────────────────────────────────────────────────────
81// Reports (serialized directly in `--json`; the CLI renders the text form)
82// ─────────────────────────────────────────────────────────────────────────────
83
/// Result of [`scan`].
#[derive(Debug, Serialize)]
pub struct ScanReport {
    /// The manifest file name (always [`MANIFEST_FILE`]).
    pub manifest: String,
    /// Number of records in the rebuilt manifest (`hashed + preserved`).
    pub cataloged: usize,
    /// Assets whose bytes were present locally and were hashed during this run.
    pub hashed: usize,
    /// Assets absent locally whose previously cataloged hash and size were
    /// carried over unchanged.
    pub preserved: usize,
    /// Sum of `bytes` over all cataloged records.
    pub bytes: u64,
    /// Whether this run changed the manifest file on disk. Always `false` in a
    /// dry run.
    pub wrote: bool,
    /// Echo of the `dry_run` argument.
    pub dry_run: bool,
    /// Human-readable notes about declarations or paths that were skipped
    /// (invalid path, markdown target, directory, absent and never cataloged).
    pub warnings: Vec<String>,
    /// Sorted store-relative paths of non-markdown files under `sources/` that
    /// no wrapper declares. Empty unless the caller asked for the untracked list.
    pub untracked: Vec<String>,
}
97
/// One asset's local state, as listed in [`StatusReport::assets`].
///
/// NOTE(review): in this module only [`status`] constructs this type; the
/// `ok` / `corrupt` state values mentioned below are not produced here
/// ([`verify`] reports plain path lists) — confirm against the CLI before
/// relying on them.
#[derive(Debug, Serialize)]
pub struct AssetState {
    /// Store-relative asset path, copied from the manifest record.
    pub path: String,
    /// Manifest SHA-256 (not re-computed by [`status`]).
    pub sha256: String,
    /// Manifest size in bytes (not re-measured by [`status`]).
    pub bytes: u64,
    /// Whether the manifest marks the asset required.
    pub required: bool,
    /// `present` / `missing` (status); `ok` / `missing` / `corrupt` (verify).
    pub state: String,
}
108
/// Result of [`status`].
#[derive(Debug, Serialize)]
pub struct StatusReport {
    /// Number of records in the manifest (`present + missing`).
    pub total: usize,
    /// Records whose bytes were found as a regular file in the store.
    pub present: usize,
    /// Records whose bytes were not found (`required_missing + optional_missing`).
    pub missing: usize,
    /// Missing records that the manifest marks required.
    pub required_missing: usize,
    /// Missing records that the manifest marks optional.
    pub optional_missing: usize,
    /// Sum of manifest `bytes` over every record.
    pub bytes_total: u64,
    /// Sum of manifest `bytes` over the missing records (what remains to restore).
    pub bytes_missing: u64,
    /// One entry per manifest record, in manifest (path-ascending) order.
    pub assets: Vec<AssetState>,
}
121
/// Result of [`verify`].
#[derive(Debug, Serialize)]
pub struct VerifyReport {
    /// `"quick"` (presence + size) or `"deep"` (full SHA-256 re-hash).
    pub mode: String,
    /// Number of records considered: required ones, plus optional ones when
    /// the caller asked to include them.
    pub checked: usize,
    /// Considered records that are neither missing nor corrupt.
    pub ok: usize,
    /// Paths of considered records with no regular file on disk.
    pub missing: Vec<String>,
    /// Paths of considered records whose on-disk bytes disagree with the
    /// manifest (size in quick mode; hash or size in deep mode), or whose
    /// manifest path fails the store-containment check.
    pub corrupt: Vec<String>,
    /// `true` iff `missing` and `corrupt` are both empty.
    pub complete: bool,
}
132
133// ─────────────────────────────────────────────────────────────────────────────
134// Manifest read / write
135// ─────────────────────────────────────────────────────────────────────────────
136
137/// Read `assets.jsonl` into records, deduped by path (last line wins) and
138/// sorted by path ascending. A missing manifest is an empty store, not an
139/// error. A malformed line is an `InvalidData` error (the CLI surfaces it;
140/// [`crate::validate`] flags it leniently as `ASSET_MANIFEST_MALFORMED`).
141pub fn read_manifest(store: &Store) -> crate::Result<Vec<AssetRecord>> {
142    let abs = store.root.join(MANIFEST_FILE);
143    if !abs.exists() {
144        return Ok(Vec::new());
145    }
146    let text = std::fs::read_to_string(&abs)?;
147    let mut by_path: BTreeMap<String, AssetRecord> = BTreeMap::new();
148    for (i, line) in text.lines().enumerate() {
149        if line.trim().is_empty() {
150            continue;
151        }
152        let rec: AssetRecord = serde_json::from_str(line).map_err(|e| {
153            std::io::Error::new(
154                std::io::ErrorKind::InvalidData,
155                format!("{MANIFEST_FILE} line {}: {e}", i + 1),
156            )
157        })?;
158        by_path.insert(rec.path.clone(), rec);
159    }
160    Ok(by_path.into_values().collect())
161}
162
163/// Write the manifest atomically (temp + fsync + rename, via [`write_atomic`]),
164/// records sorted by path ascending. An empty record set removes the file.
165pub fn write_manifest(store: &Store, records: &[AssetRecord]) -> crate::Result<()> {
166    let abs = store.root.join(MANIFEST_FILE);
167    if records.is_empty() {
168        if abs.exists() {
169            std::fs::remove_file(&abs)?;
170        }
171        return Ok(());
172    }
173    let mut sorted = records.to_vec();
174    sorted.sort_by(|a, b| a.path.cmp(&b.path));
175    let mut out = String::new();
176    for rec in &sorted {
177        let line = serde_json::to_string(rec).expect("AssetRecord serializes");
178        out.push_str(&line);
179        out.push('\n');
180    }
181    write_atomic(&abs, out.as_bytes())?;
182    Ok(())
183}
184
185// ─────────────────────────────────────────────────────────────────────────────
186// scan (write) — rebuild the manifest from wrapper declarations
187// ─────────────────────────────────────────────────────────────────────────────
188
189/// Walk every content file, read its `asset`/`assets` declarations, hash the
190/// present files, and (re)write the manifest. The manifest is a projection: a
191/// path no longer declared by any wrapper drops out. Bytes absent locally but
192/// previously cataloged are preserved (the eviction / disk-relief case) since
193/// they cannot be re-hashed. `dry_run` computes without writing; `untracked`
194/// additionally reports non-markdown files under `sources/` that no wrapper
195/// declares. Never writes when nothing changed (keeps the Git diff and the
196/// `--dry-run`-then-scan idempotent).
197pub fn scan(store: &Store, dry_run: bool, untracked: bool) -> crate::Result<ScanReport> {
198    // Tolerate a malformed existing manifest here: scan rebuilds from the files,
199    // so a corrupt prior file is simply replaced. We still read it (best effort)
200    // to preserve hashes for evicted (absent-but-cataloged) assets.
201    let existing_by_path: BTreeMap<String, AssetRecord> = read_manifest(store)
202        .unwrap_or_default()
203        .into_iter()
204        .map(|r| (r.path.clone(), r))
205        .collect();
206
207    // Aggregate declarations across all content files: path -> (wrappers, required).
208    let mut wrappers_by_path: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
209    let mut required_by_path: BTreeMap<String, bool> = BTreeMap::new();
210    let mut declared_paths: BTreeSet<String> = BTreeSet::new();
211    let mut warnings: Vec<String> = Vec::new();
212
213    for rel in store.walk()? {
214        let abs = store.abs_path(&rel);
215        let (fm, _body) = match parser::read_file(&abs) {
216            Ok(v) => v,
217            Err(_) => continue, // unparseable / not a content file: skip
218        };
219        let wrapper = rel_to_string(&rel);
220        for decl in declared_assets(&fm) {
221            let norm = match normalize_asset_path(&decl.path) {
222                Ok(n) => n,
223                Err(e) => {
224                    warnings.push(format!("{wrapper}: {e}"));
225                    continue;
226                }
227            };
228            if is_markdown(&norm) {
229                warnings.push(format!(
230                    "{wrapper}: asset path points at a markdown content file ({norm}); skipped"
231                ));
232                continue;
233            }
234            wrappers_by_path
235                .entry(norm.clone())
236                .or_default()
237                .insert(wrapper.clone());
238            let req = required_by_path.entry(norm.clone()).or_insert(false);
239            *req = *req || decl.required;
240            declared_paths.insert(norm);
241        }
242    }
243
244    // Build records.
245    let mut records: Vec<AssetRecord> = Vec::new();
246    let mut hashed = 0usize;
247    let mut preserved = 0usize;
248    for (path, wrappers) in &wrappers_by_path {
249        let required = *required_by_path.get(path).unwrap_or(&true);
250        let wrappers: Vec<String> = wrappers.iter().cloned().collect();
251
252        // Belt-and-suspenders containment check before any disk read.
253        let abs = match store::ensure_path_within_store(&store.root, &store.root.join(path)) {
254            Ok(p) => p,
255            Err(_) => {
256                warnings.push(format!("{path}: escapes the store root; skipped"));
257                continue;
258            }
259        };
260
261        if abs.is_dir() {
262            warnings.push(format!("{path}: is a directory, not a file; skipped"));
263            continue;
264        }
265        if abs.is_file() {
266            let (sha256, bytes) = sha256_file(&abs)?;
267            records.push(AssetRecord {
268                path: path.clone(),
269                sha256,
270                bytes,
271                media_type: media_type_for(path),
272                wrappers,
273                required,
274            });
275            hashed += 1;
276        } else if let Some(prev) = existing_by_path.get(path) {
277            // Evicted: bytes gone locally but previously cataloged. Preserve the
278            // committed hash/size (we cannot re-hash what is not here).
279            records.push(AssetRecord {
280                path: path.clone(),
281                sha256: prev.sha256.clone(),
282                bytes: prev.bytes,
283                media_type: media_type_for(path),
284                wrappers,
285                required,
286            });
287            preserved += 1;
288        } else {
289            warnings.push(format!(
290                "{path}: declared but absent and never cataloged; cannot hash (skipped)"
291            ));
292        }
293    }
294    records.sort_by(|a, b| a.path.cmp(&b.path));
295
296    let bytes: u64 = records.iter().map(|r| r.bytes).sum();
297    let cataloged = records.len();
298
299    let untracked_list = if untracked {
300        find_untracked(store, &declared_paths)?
301    } else {
302        Vec::new()
303    };
304
305    // Only write when the canonical content actually changed.
306    let mut wrote = false;
307    if !dry_run {
308        let current = read_manifest(store).unwrap_or_default();
309        if current != records {
310            write_manifest(store, &records)?;
311            wrote = true;
312        }
313    }
314
315    Ok(ScanReport {
316        manifest: MANIFEST_FILE.to_string(),
317        cataloged,
318        hashed,
319        preserved,
320        bytes,
321        wrote,
322        dry_run,
323        warnings,
324        untracked: untracked_list,
325    })
326}
327
328// ─────────────────────────────────────────────────────────────────────────────
329// verify (read) — byte-completeness gate
330// ─────────────────────────────────────────────────────────────────────────────
331
332/// Check that every required asset (plus optional, under `include_optional`) is
333/// present locally and matches the manifest. `quick` = presence + size only
334/// (fast); otherwise a full SHA-256 re-hash. This is a SWEEP (O(asset bytes) in
335/// deep mode), never a loop op. `complete` is true iff nothing is missing or
336/// corrupt in the considered set.
337pub fn verify(store: &Store, include_optional: bool, quick: bool) -> crate::Result<VerifyReport> {
338    let records = read_manifest(store)?;
339    let mut missing = Vec::new();
340    let mut corrupt = Vec::new();
341    let mut checked = 0usize;
342
343    for rec in &records {
344        if !rec.required && !include_optional {
345            continue;
346        }
347        checked += 1;
348        let abs = match store::ensure_path_within_store(&store.root, &store.root.join(&rec.path)) {
349            Ok(p) => p,
350            Err(_) => {
351                // A manifest path that escapes the store is not restorable here.
352                corrupt.push(rec.path.clone());
353                continue;
354            }
355        };
356        if !abs.is_file() {
357            missing.push(rec.path.clone());
358            continue;
359        }
360        if quick {
361            let len = std::fs::metadata(&abs)?.len();
362            if len != rec.bytes {
363                corrupt.push(rec.path.clone());
364            }
365        } else {
366            let (sha, bytes) = sha256_file(&abs)?;
367            if sha != rec.sha256 || bytes != rec.bytes {
368                corrupt.push(rec.path.clone());
369            }
370        }
371    }
372
373    let ok = checked - missing.len() - corrupt.len();
374    let complete = missing.is_empty() && corrupt.is_empty();
375    Ok(VerifyReport {
376        mode: if quick { "quick" } else { "deep" }.to_string(),
377        checked,
378        ok,
379        missing,
380        corrupt,
381        complete,
382    })
383}
384
385// ─────────────────────────────────────────────────────────────────────────────
386// status (read) — non-failing presence report
387// ─────────────────────────────────────────────────────────────────────────────
388
389/// Report which cataloged assets are present locally and how many bytes remain
390/// to restore. Never fails on a missing asset (that is `verify`'s job); it does
391/// fail on a malformed manifest.
392pub fn status(store: &Store) -> crate::Result<StatusReport> {
393    let records = read_manifest(store)?;
394    let mut present = 0usize;
395    let mut missing = 0usize;
396    let mut required_missing = 0usize;
397    let mut optional_missing = 0usize;
398    let mut bytes_total = 0u64;
399    let mut bytes_missing = 0u64;
400    let mut assets = Vec::with_capacity(records.len());
401
402    for rec in &records {
403        bytes_total += rec.bytes;
404        let is_present = store.root.join(&rec.path).is_file();
405        let state = if is_present {
406            present += 1;
407            "present"
408        } else {
409            missing += 1;
410            bytes_missing += rec.bytes;
411            if rec.required {
412                required_missing += 1;
413            } else {
414                optional_missing += 1;
415            }
416            "missing"
417        };
418        assets.push(AssetState {
419            path: rec.path.clone(),
420            sha256: rec.sha256.clone(),
421            bytes: rec.bytes,
422            required: rec.required,
423            state: state.to_string(),
424        });
425    }
426
427    Ok(StatusReport {
428        total: records.len(),
429        present,
430        missing,
431        required_missing,
432        optional_missing,
433        bytes_total,
434        bytes_missing,
435        assets,
436    })
437}
438
439// ─────────────────────────────────────────────────────────────────────────────
440// paths (read) — the VCS-neutral path list
441// ─────────────────────────────────────────────────────────────────────────────
442
443/// The cataloged asset paths, sorted ascending. The VCS-neutral list a harness
444/// feeds into a `.gitignore` managed block or a sync-service exclude. db.md
445/// itself never writes any ignore file.
446pub fn paths(store: &Store) -> crate::Result<Vec<String>> {
447    Ok(read_manifest(store)?.into_iter().map(|r| r.path).collect())
448}
449
450// ─────────────────────────────────────────────────────────────────────────────
451// Declaration parsing (shared with `validate`)
452// ─────────────────────────────────────────────────────────────────────────────
453
454/// Read all `asset:` / `assets:` declarations from a parsed frontmatter.
455///
456/// `asset: <path>` is a single required declaration. `assets:` is a list whose
457/// items are either a bare path string (required) or a `{ path, required }`
458/// mapping. Both keys may be present.
459pub fn declared_assets(fm: &parser::Frontmatter) -> Vec<Declaration> {
460    let mut out = Vec::new();
461    if let Some(v) = fm.get("asset") {
462        collect_declarations(&v, &mut out);
463    }
464    if let Some(v) = fm.get("assets") {
465        collect_declarations(&v, &mut out);
466    }
467    out
468}
469
470/// Read declarations from an already-parsed YAML mapping. Used by
471/// [`crate::validate`], which holds the parsed mapping and need not re-read the
472/// file. Equivalent to [`declared_assets`] but keyed off a raw map.
473pub fn declarations_from_yaml_map(map: &BTreeMap<String, Value>) -> Vec<Declaration> {
474    let mut out = Vec::new();
475    if let Some(v) = map.get("asset") {
476        collect_declarations(v, &mut out);
477    }
478    if let Some(v) = map.get("assets") {
479        collect_declarations(v, &mut out);
480    }
481    out
482}
483
484fn collect_declarations(v: &Value, out: &mut Vec<Declaration>) {
485    match v {
486        Value::String(s) => out.push(Declaration {
487            path: s.clone(),
488            required: true,
489        }),
490        Value::Sequence(items) => {
491            for item in items {
492                match item {
493                    Value::String(s) => out.push(Declaration {
494                        path: s.clone(),
495                        required: true,
496                    }),
497                    Value::Mapping(m) => {
498                        let path = m
499                            .get(Value::String("path".to_string()))
500                            .and_then(|x| x.as_str())
501                            .map(|s| s.to_string());
502                        if let Some(path) = path {
503                            let required = m
504                                .get(Value::String("required".to_string()))
505                                .and_then(|x| x.as_bool())
506                                .unwrap_or(true);
507                            out.push(Declaration { path, required });
508                        }
509                    }
510                    _ => {}
511                }
512            }
513        }
514        _ => {}
515    }
516}
517
518// ─────────────────────────────────────────────────────────────────────────────
519// Helpers
520// ─────────────────────────────────────────────────────────────────────────────
521
/// Normalize a declared asset path to a canonical store-relative forward-slash
/// string, rejecting absolute paths and any `..` / root / prefix component.
///
/// This is the lexical guard; [`crate::store::ensure_path_within_store`] is the
/// resolved-path guard applied before any disk read.
///
/// Normalization steps:
/// 1. surrounding whitespace is trimmed;
/// 2. backslashes become forward slashes **before** validation, so a
///    Windows-style `..\..\x` or `\etc\x` is rejected on every platform
///    instead of passing as one opaque component and then being rewritten
///    into a traversal;
/// 3. `.` segments, repeated slashes and trailing slashes are dropped, so
///    `./a//b.pdf` and `a/b.pdf` produce the same record key.
///
/// # Errors
///
/// Returns a human-readable message when the path is empty, absolute, contains
/// `..`, carries a root or drive prefix, or names nothing once normalized
/// (e.g. `.`).
pub fn normalize_asset_path(raw: &str) -> Result<String, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("empty asset path".to_string());
    }

    // Validate the string we will actually return, not the pre-replacement one.
    let unified = trimmed.replace('\\', "/");
    let p = Path::new(&unified);
    if p.is_absolute() {
        return Err(format!("absolute asset path not allowed: {raw}"));
    }

    let mut segments: Vec<&str> = Vec::new();
    for c in p.components() {
        match c {
            Component::ParentDir => return Err(format!("`..` not allowed in asset path: {raw}")),
            Component::Prefix(_) | Component::RootDir => {
                return Err(format!("asset path escapes the store: {raw}"))
            }
            Component::CurDir => {}
            Component::Normal(seg) => {
                // The path was built from a `&str`, so this conversion is not
                // expected to fail; report it instead of panicking if it does.
                let seg = seg
                    .to_str()
                    .ok_or_else(|| format!("asset path is not valid UTF-8: {raw}"))?;
                segments.push(seg);
            }
        }
    }

    if segments.is_empty() {
        return Err(format!("asset path names no file: {raw}"));
    }
    Ok(segments.join("/"))
}
546
/// True when `path` has an `md` extension, compared ASCII-case-insensitively.
/// A path with no extension (including a bare `.md` dotfile) is not markdown.
fn is_markdown(path: &str) -> bool {
    let ext = Path::new(path).extension().and_then(|e| e.to_str());
    matches!(ext, Some(e) if e.eq_ignore_ascii_case("md"))
}
554
/// Render a path as a forward-slash string: lossy UTF-8 conversion, then every
/// backslash replaced by `/`.
fn rel_to_string(p: &Path) -> String {
    let lossy = p.to_string_lossy();
    lossy
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
558
559/// Stream the file through SHA-256 (constant memory) and return
560/// `(lowercase-hex digest, byte length)`.
561fn sha256_file(abs: &Path) -> std::io::Result<(String, u64)> {
562    let mut f = std::fs::File::open(abs)?;
563    let mut hasher = Sha256::new();
564    let mut buf = [0u8; 65536];
565    let mut total: u64 = 0;
566    loop {
567        let n = f.read(&mut buf)?;
568        if n == 0 {
569            break;
570        }
571        hasher.update(&buf[..n]);
572        total += n as u64;
573    }
574    let digest = hasher.finalize();
575    let mut hex = String::with_capacity(64);
576    for b in digest.iter() {
577        let _ = write!(hex, "{b:02x}");
578    }
579    Ok((hex, total))
580}
581
/// Map a path's extension to a MIME type, ASCII-case-insensitively.
///
/// Unknown, missing or non-UTF-8 extensions yield `application/octet-stream`.
/// The mapping is a fixed table keyed only on the extension, so the same path
/// always produces the same value and the manifest stays reproducible.
fn media_type_for(path: &str) -> String {
    const FALLBACK: &str = "application/octet-stream";
    const BY_EXTENSION: &[(&str, &str)] = &[
        ("pdf", "application/pdf"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
        ("tiff", "image/tiff"),
        ("tif", "image/tiff"),
        ("mp4", "video/mp4"),
        ("mov", "video/quicktime"),
        ("webm", "video/webm"),
        ("mkv", "video/x-matroska"),
        ("mp3", "audio/mpeg"),
        ("wav", "audio/wav"),
        ("m4a", "audio/mp4"),
        ("flac", "audio/flac"),
        ("zip", "application/zip"),
        ("gz", "application/gzip"),
        ("tgz", "application/gzip"),
        ("tar", "application/x-tar"),
        ("csv", "text/csv"),
        ("tsv", "text/tab-separated-values"),
        ("json", "application/json"),
        ("xml", "application/xml"),
        ("txt", "text/plain"),
        ("vtt", "text/vtt"),
        ("srt", "application/x-subrip"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("epub", "application/epub+zip"),
        (
            "docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        ("doc", "application/msword"),
        ("xls", "application/vnd.ms-excel"),
        ("ppt", "application/vnd.ms-powerpoint"),
    ];

    let ext = Path::new(path)
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    BY_EXTENSION
        .iter()
        .find(|(known, _)| *known == ext.as_str())
        .map_or(FALLBACK, |(_, mime)| *mime)
        .to_string()
}
629
630/// Non-markdown files under `sources/` that no wrapper declares (the
631/// un-wrappered-drop worklist). Walks the raw filesystem (so it sees files an
632/// ignore mechanism would hide), skips `index.*` sidecars and hidden entries.
633fn find_untracked(store: &Store, declared: &BTreeSet<String>) -> crate::Result<Vec<String>> {
634    let sources = store.root.join("sources");
635    if !sources.is_dir() {
636        return Ok(Vec::new());
637    }
638    let mut out = Vec::new();
639    for entry in walkdir::WalkDir::new(&sources)
640        .into_iter()
641        .filter_entry(|e| !is_hidden(e.file_name().to_str().unwrap_or("")))
642    {
643        let entry = match entry {
644            Ok(e) => e,
645            Err(_) => continue,
646        };
647        if !entry.file_type().is_file() {
648            continue;
649        }
650        let name = entry.file_name().to_str().unwrap_or("");
651        if is_markdown(name) || name == "index.jsonl" {
652            continue;
653        }
654        let rel = match entry.path().strip_prefix(&store.root) {
655            Ok(r) => rel_to_string(r),
656            Err(_) => continue,
657        };
658        if !declared.contains(&rel) {
659            out.push(rel);
660        }
661    }
662    out.sort();
663    Ok(out)
664}
665
/// True for dot-prefixed names, except the special entries `.` and `..`.
fn is_hidden(name: &str) -> bool {
    match name {
        "." | ".." => false,
        other => other.starts_with('.'),
    }
}