dbmd_core/
assets.rs

1//! `assets` — the db.md asset layer.
2//!
3//! Raw binary assets (PDFs, recordings, large exports) belong to a store but
4//! are too heavy for Git. A content file (the **wrapper**) declares one via an
5//! `asset:` / `assets:` frontmatter key; this module records each in the
6//! root-level `assets.jsonl` manifest: store-relative path, SHA-256, size,
7//! media type, the declaring wrapper(s), and whether it is required for
8//! byte-completeness.
9//!
10//! The manifest is a **pure projection** of (wrappers + asset files on disk):
11//! every field is derivable, so a [`scan`] where the bytes are present
12//! reproduces it byte-for-byte, exactly like `index.jsonl`. db.md never
13//! transports the bytes and never names a storage provider; that is the
14//! VibeCraft layer's job, keyed off the SHA-256. This module never shells out
15//! to git and never touches the network.
16//!
17//! Four operations — one write, three reads:
18//!   - [`scan`]   (write) discover declared assets, hash present files, rewrite the manifest
19//!   - [`verify`] (read)  prove the local store is byte-complete for required assets
20//!   - [`status`] (read)  report present / missing without failing
21//!   - [`paths`]  (read)  the store-relative path list (for an ignore mechanism)
22//!
23//! Path safety: every declared path is validated store-relative (no `..`, no
24//! absolute, no escape) via [`crate::store::ensure_path_within_store`] wherever
25//! a path is read or resolved, so a poisoned manifest can never make `scan`
26//! hash, or a restore write, outside the store.
27
28use std::collections::{BTreeMap, BTreeSet};
29use std::fmt::Write as _;
30use std::io::Read as _;
31use std::path::{Component, Path, PathBuf};
32
33use serde::{Deserialize, Serialize};
34use serde_norway::Value;
35use sha2::{Digest, Sha256};
36
37use crate::parser;
38use crate::store::{self, Store};
39use crate::write_atomic;
40
/// File name of the asset manifest. It is joined directly onto the store root
/// by [`read_manifest`], [`write_manifest`], and [`scan`], and is echoed back in
/// [`ScanReport::manifest`].
pub const MANIFEST_FILE: &str = "assets.jsonl";
43
/// One asset record — one line of `assets.jsonl`.
///
/// Every field is derivable from the store (wrapper frontmatter + the file on
/// disk), so the manifest rebuilds byte-for-byte. Field declaration order is
/// the canonical JSON key order (do not reorder the fields: that would change
/// the serialized bytes); `wrappers` is always a sorted list (never a bare
/// string) so serialization is deterministic.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AssetRecord {
    /// Store-relative path of the raw bytes, forward-slash, with extension. The
    /// record key. May differ from `wrappers` (the wrapper is the `.md`).
    pub path: String,
    /// Lowercase-hex SHA-256 of the bytes: the integrity check and the provider
    /// blob key. May repeat across records (identical bytes at two paths).
    pub sha256: String,
    /// Size in bytes. Read verbatim from the manifest, so consumers that sum it
    /// use saturating arithmetic.
    pub bytes: u64,
    /// Best-effort MIME type derived from the path extension.
    pub media_type: String,
    /// Store-relative path(s) of the content file(s) that declare this asset,
    /// sorted ascending. Usually one.
    pub wrappers: Vec<String>,
    /// Whether the asset is required for byte-completeness (default `true`;
    /// `false` only when every declaration marks it optional).
    pub required: bool,
}
69
/// A single `asset:` / `assets:` declaration read from a wrapper's frontmatter.
///
/// The path is kept exactly as written; callers normalize it (see
/// [`normalize_asset_path`]) before using it as a manifest key.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Declaration {
    /// The raw store-relative path string as written in frontmatter
    /// (un-normalized, un-validated).
    pub path: String,
    /// Whether this declaration marks the asset required (bare string and
    /// object-without-`required` default to `true`).
    pub required: bool,
}
79
80// ─────────────────────────────────────────────────────────────────────────────
81// Reports (serialized directly in `--json`; the CLI renders the text form)
82// ─────────────────────────────────────────────────────────────────────────────
83
/// Result of [`scan`].
#[derive(Debug, Serialize)]
pub struct ScanReport {
    /// The manifest file name ([`MANIFEST_FILE`]).
    pub manifest: String,
    /// Number of records in the rebuilt manifest (`hashed + preserved`).
    pub cataloged: usize,
    /// Records whose file was present on disk and freshly hashed.
    pub hashed: usize,
    /// Records whose bytes were absent locally and whose hash/size were carried
    /// over from the previous manifest.
    pub preserved: usize,
    /// Saturating sum of `bytes` over all cataloged records.
    pub bytes: u64,
    /// Whether the manifest on disk was changed by this scan (always `false`
    /// under `dry_run`).
    pub wrote: bool,
    /// Echo of the `dry_run` argument.
    pub dry_run: bool,
    /// Human-readable notes for declarations that were skipped (invalid path,
    /// markdown target, directory, escape, or absent-and-never-cataloged).
    pub warnings: Vec<String>,
    /// Undeclared non-markdown files under `sources/`; empty unless the scan
    /// was asked to collect them.
    pub untracked: Vec<String>,
}
97
/// One asset's local state, used by [`status`] and [`verify`].
///
/// NOTE(review): in the code visible here only [`status`] constructs this type
/// ([`VerifyReport`] carries plain path lists); the `verify` states listed on
/// `state` are presumably produced by a caller outside this module — confirm.
#[derive(Debug, Serialize)]
pub struct AssetState {
    /// Store-relative path, copied from the manifest record.
    pub path: String,
    /// SHA-256 recorded in the manifest (not re-computed by `status`).
    pub sha256: String,
    /// Size in bytes recorded in the manifest.
    pub bytes: u64,
    /// Whether the manifest marks the asset as required.
    pub required: bool,
    /// `present` / `missing` (status); `ok` / `missing` / `corrupt` (verify).
    pub state: String,
}
108
/// Result of [`status`].
#[derive(Debug, Serialize)]
pub struct StatusReport {
    /// Number of records in the manifest.
    pub total: usize,
    /// Records whose path resolves inside the store to an existing file.
    pub present: usize,
    /// Records that are not present (including paths that escape the store).
    pub missing: usize,
    /// Subset of `missing` that is marked required.
    pub required_missing: usize,
    /// Subset of `missing` that is marked optional.
    pub optional_missing: usize,
    /// Saturating sum of `bytes` over every record.
    pub bytes_total: u64,
    /// Saturating sum of `bytes` over the missing records.
    pub bytes_missing: u64,
    /// Per-asset detail, one entry per manifest record in manifest order.
    pub assets: Vec<AssetState>,
}
121
/// Result of [`verify`].
#[derive(Debug, Serialize)]
pub struct VerifyReport {
    /// `"quick"` (presence + size) or `"deep"` (full SHA-256 re-hash).
    pub mode: String,
    /// Number of records considered (required ones, plus optional ones when
    /// requested).
    pub checked: usize,
    /// Considered records that are neither missing nor corrupt.
    pub ok: usize,
    /// Paths of considered records with no file on disk.
    pub missing: Vec<String>,
    /// Paths of considered records whose size/hash mismatches the manifest, or
    /// whose path escapes the store.
    pub corrupt: Vec<String>,
    /// `true` iff both `missing` and `corrupt` are empty.
    pub complete: bool,
}
132
133// ─────────────────────────────────────────────────────────────────────────────
134// Manifest read / write
135// ─────────────────────────────────────────────────────────────────────────────
136
137/// Read `assets.jsonl` into records, deduped by path (last line wins) and
138/// sorted by path ascending. A missing manifest is an empty store, not an
139/// error. A malformed line is an `InvalidData` error (the CLI surfaces it;
140/// [`crate::validate`] flags it leniently as `ASSET_MANIFEST_MALFORMED`).
141pub fn read_manifest(store: &Store) -> crate::Result<Vec<AssetRecord>> {
142    let abs = store.root.join(MANIFEST_FILE);
143    if !abs.exists() {
144        return Ok(Vec::new());
145    }
146    let text = std::fs::read_to_string(&abs)?;
147    let mut by_path: BTreeMap<String, AssetRecord> = BTreeMap::new();
148    for (i, line) in text.lines().enumerate() {
149        if line.trim().is_empty() {
150            continue;
151        }
152        let rec: AssetRecord = serde_json::from_str(line).map_err(|e| {
153            std::io::Error::new(
154                std::io::ErrorKind::InvalidData,
155                format!("{MANIFEST_FILE} line {}: {e}", i + 1),
156            )
157        })?;
158        by_path.insert(rec.path.clone(), rec);
159    }
160    Ok(by_path.into_values().collect())
161}
162
163/// The canonical serialized form of a record set: one JSON line per record,
164/// records sorted by path ascending, trailing newline. An empty record set is
165/// the empty string (the manifest file is removed, not written empty). This is
166/// the SINGLE source of the manifest's byte layout — both [`write_manifest`] and
167/// the [`scan`] no-change gate go through it, so "what scan would write" and
168/// "what's on disk" are compared as the same bytes.
169fn serialize_manifest(records: &[AssetRecord]) -> String {
170    if records.is_empty() {
171        return String::new();
172    }
173    let mut sorted = records.to_vec();
174    sorted.sort_by(|a, b| a.path.cmp(&b.path));
175    let mut out = String::new();
176    for rec in &sorted {
177        let line = serde_json::to_string(rec).expect("AssetRecord serializes");
178        out.push_str(&line);
179        out.push('\n');
180    }
181    out
182}
183
184/// Write the manifest atomically (temp + fsync + rename, via [`write_atomic`]),
185/// records sorted by path ascending. An empty record set removes the file.
186pub fn write_manifest(store: &Store, records: &[AssetRecord]) -> crate::Result<()> {
187    let abs = store.root.join(MANIFEST_FILE);
188    let out = serialize_manifest(records);
189    if out.is_empty() {
190        if abs.exists() {
191            std::fs::remove_file(&abs)?;
192        }
193        return Ok(());
194    }
195    write_atomic(&abs, out.as_bytes())?;
196    Ok(())
197}
198
199// ─────────────────────────────────────────────────────────────────────────────
200// scan (write) — rebuild the manifest from wrapper declarations
201// ─────────────────────────────────────────────────────────────────────────────
202
203/// Walk every content file, read its `asset`/`assets` declarations, hash the
204/// present files, and (re)write the manifest. The manifest is a projection: a
205/// path no longer declared by any wrapper drops out. Bytes absent locally but
206/// previously cataloged are preserved (the eviction / disk-relief case) since
207/// they cannot be re-hashed. `dry_run` computes without writing; `untracked`
208/// additionally reports non-markdown files under `sources/` that no wrapper
209/// declares. Never writes when nothing changed (keeps the Git diff and the
210/// `--dry-run`-then-scan idempotent).
211pub fn scan(store: &Store, dry_run: bool, untracked: bool) -> crate::Result<ScanReport> {
212    // Tolerate a malformed existing manifest here: scan rebuilds from the files,
213    // so a corrupt prior file is simply replaced. We still read it (best effort)
214    // to preserve hashes for evicted (absent-but-cataloged) assets.
215    let existing_by_path: BTreeMap<String, AssetRecord> = read_manifest(store)
216        .unwrap_or_default()
217        .into_iter()
218        .map(|r| (r.path.clone(), r))
219        .collect();
220
221    // Aggregate declarations across all content files: path -> (wrappers, required).
222    let mut wrappers_by_path: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
223    let mut required_by_path: BTreeMap<String, bool> = BTreeMap::new();
224    let mut declared_paths: BTreeSet<String> = BTreeSet::new();
225    let mut warnings: Vec<String> = Vec::new();
226
227    for rel in store.walk()? {
228        let abs = store.abs_path(&rel);
229        let (fm, _body) = match parser::read_file(&abs) {
230            Ok(v) => v,
231            Err(_) => continue, // unparseable / not a content file: skip
232        };
233        let wrapper = rel_to_string(&rel);
234        for decl in declared_assets(&fm) {
235            let norm = match normalize_asset_path(&decl.path) {
236                Ok(n) => n,
237                Err(e) => {
238                    warnings.push(format!("{wrapper}: {e}"));
239                    continue;
240                }
241            };
242            if is_markdown(&norm) {
243                warnings.push(format!(
244                    "{wrapper}: asset path points at a markdown content file ({norm}); skipped"
245                ));
246                continue;
247            }
248            wrappers_by_path
249                .entry(norm.clone())
250                .or_default()
251                .insert(wrapper.clone());
252            let req = required_by_path.entry(norm.clone()).or_insert(false);
253            *req = *req || decl.required;
254            declared_paths.insert(norm);
255        }
256    }
257
258    // Build records.
259    let mut records: Vec<AssetRecord> = Vec::new();
260    let mut hashed = 0usize;
261    let mut preserved = 0usize;
262    for (path, wrappers) in &wrappers_by_path {
263        let required = *required_by_path.get(path).unwrap_or(&true);
264        let wrappers: Vec<String> = wrappers.iter().cloned().collect();
265
266        // Belt-and-suspenders containment check before any disk read.
267        let abs = match store::ensure_path_within_store(&store.root, &store.root.join(path)) {
268            Ok(p) => p,
269            Err(_) => {
270                warnings.push(format!("{path}: escapes the store root; skipped"));
271                continue;
272            }
273        };
274
275        if abs.is_dir() {
276            warnings.push(format!("{path}: is a directory, not a file; skipped"));
277            continue;
278        }
279        if abs.is_file() {
280            let (sha256, bytes) = sha256_file(&abs)?;
281            records.push(AssetRecord {
282                path: path.clone(),
283                sha256,
284                bytes,
285                media_type: media_type_for(path),
286                wrappers,
287                required,
288            });
289            hashed += 1;
290        } else if let Some(prev) = existing_by_path.get(path) {
291            // Evicted: bytes gone locally but previously cataloged. Preserve the
292            // committed hash/size (we cannot re-hash what is not here).
293            records.push(AssetRecord {
294                path: path.clone(),
295                sha256: prev.sha256.clone(),
296                bytes: prev.bytes,
297                media_type: media_type_for(path),
298                wrappers,
299                required,
300            });
301            preserved += 1;
302        } else {
303            warnings.push(format!(
304                "{path}: declared but absent and never cataloged; cannot hash (skipped)"
305            ));
306        }
307    }
308    records.sort_by(|a, b| a.path.cmp(&b.path));
309
310    // Saturating: poisoned-manifest `bytes` can overflow a plain `.sum()` (debug
311    // abort / release wrap); see `status`.
312    let bytes: u64 = records.iter().fold(0u64, |a, r| a.saturating_add(r.bytes));
313    let cataloged = records.len();
314
315    let untracked_list = if untracked {
316        find_untracked(store, &declared_paths)?
317    } else {
318        Vec::new()
319    };
320
321    // Only write when the canonical BYTES differ from what's on disk. Comparing
322    // parsed records would miss non-canonical on-disk state — duplicate lines
323    // from a git `merge=union`, a wrong sort, a missing trailing newline — since
324    // `read_manifest` dedupes-by-path and sorts, so a poisoned file parses back
325    // equal to the freshly computed records and the no-op gate never repairs it.
326    // We instead compare the canonical serialization against the raw on-disk
327    // bytes, so `scan` recompacts a non-canonical manifest (mirroring how
328    // `index::rebuild_all` always normalizes its artifacts). This is also the
329    // documented `merge=union` recovery (SPEC § Assets).
330    let mut wrote = false;
331    if !dry_run {
332        let canonical = serialize_manifest(&records);
333        let abs = store.root.join(MANIFEST_FILE);
334        let on_disk = std::fs::read(&abs).unwrap_or_default();
335        if on_disk != canonical.as_bytes() {
336            write_manifest(store, &records)?;
337            wrote = true;
338        }
339    }
340
341    Ok(ScanReport {
342        manifest: MANIFEST_FILE.to_string(),
343        cataloged,
344        hashed,
345        preserved,
346        bytes,
347        wrote,
348        dry_run,
349        warnings,
350        untracked: untracked_list,
351    })
352}
353
354// ─────────────────────────────────────────────────────────────────────────────
355// verify (read) — byte-completeness gate
356// ─────────────────────────────────────────────────────────────────────────────
357
358/// Check that every required asset (plus optional, under `include_optional`) is
359/// present locally and matches the manifest. `quick` = presence + size only
360/// (fast); otherwise a full SHA-256 re-hash. This is a SWEEP (O(asset bytes) in
361/// deep mode), never a loop op. `complete` is true iff nothing is missing or
362/// corrupt in the considered set.
363pub fn verify(store: &Store, include_optional: bool, quick: bool) -> crate::Result<VerifyReport> {
364    let records = read_manifest(store)?;
365    let mut missing = Vec::new();
366    let mut corrupt = Vec::new();
367    let mut checked = 0usize;
368
369    for rec in &records {
370        if !rec.required && !include_optional {
371            continue;
372        }
373        checked += 1;
374        let abs = match store::ensure_path_within_store(&store.root, &store.root.join(&rec.path)) {
375            Ok(p) => p,
376            Err(_) => {
377                // A manifest path that escapes the store is not restorable here.
378                corrupt.push(rec.path.clone());
379                continue;
380            }
381        };
382        if !abs.is_file() {
383            missing.push(rec.path.clone());
384            continue;
385        }
386        if quick {
387            let len = std::fs::metadata(&abs)?.len();
388            if len != rec.bytes {
389                corrupt.push(rec.path.clone());
390            }
391        } else {
392            let (sha, bytes) = sha256_file(&abs)?;
393            if sha != rec.sha256 || bytes != rec.bytes {
394                corrupt.push(rec.path.clone());
395            }
396        }
397    }
398
399    let ok = checked - missing.len() - corrupt.len();
400    let complete = missing.is_empty() && corrupt.is_empty();
401    Ok(VerifyReport {
402        mode: if quick { "quick" } else { "deep" }.to_string(),
403        checked,
404        ok,
405        missing,
406        corrupt,
407        complete,
408    })
409}
410
411// ─────────────────────────────────────────────────────────────────────────────
412// status (read) — non-failing presence report
413// ─────────────────────────────────────────────────────────────────────────────
414
415/// Report which cataloged assets are present locally and how many bytes remain
416/// to restore. Never fails on a missing asset (that is `verify`'s job); it does
417/// fail on a malformed manifest.
418pub fn status(store: &Store) -> crate::Result<StatusReport> {
419    let records = read_manifest(store)?;
420    let mut present = 0usize;
421    let mut missing = 0usize;
422    let mut required_missing = 0usize;
423    let mut optional_missing = 0usize;
424    let mut bytes_total = 0u64;
425    let mut bytes_missing = 0u64;
426    let mut assets = Vec::with_capacity(records.len());
427
428    for rec in &records {
429        // Saturating: `rec.bytes` is deserialized verbatim from a hand-editable /
430        // poisoned `assets.jsonl` with no clamp. An absurd value (~u64::MAX)
431        // summed with unchecked `+=` ABORTS in debug (overflow-checks) and
432        // silently WRAPS in release — and `status` is contractually non-failing.
433        bytes_total = bytes_total.saturating_add(rec.bytes);
434        // Resolve through the same containment guard `scan` and `verify` use:
435        // the module contract is that the guard applies "wherever a path is read
436        // or resolved", and an unguarded `is_file()` here let a poisoned/hand-
437        // edited manifest path (`../outside.txt`) report `present` (and count its
438        // bytes) while `verify` reported it `corrupt` — two read commands on the
439        // same store disagreeing, plus a path-existence oracle outside the store.
440        // An escaping record is treated as not-present (missing), matching verify.
441        let is_present = store::ensure_path_within_store(&store.root, &store.root.join(&rec.path))
442            .map(|p| p.is_file())
443            .unwrap_or(false);
444        let state = if is_present {
445            present += 1;
446            "present"
447        } else {
448            missing += 1;
449            bytes_missing = bytes_missing.saturating_add(rec.bytes);
450            if rec.required {
451                required_missing += 1;
452            } else {
453                optional_missing += 1;
454            }
455            "missing"
456        };
457        assets.push(AssetState {
458            path: rec.path.clone(),
459            sha256: rec.sha256.clone(),
460            bytes: rec.bytes,
461            required: rec.required,
462            state: state.to_string(),
463        });
464    }
465
466    Ok(StatusReport {
467        total: records.len(),
468        present,
469        missing,
470        required_missing,
471        optional_missing,
472        bytes_total,
473        bytes_missing,
474        assets,
475    })
476}
477
478// ─────────────────────────────────────────────────────────────────────────────
479// paths (read) — the VCS-neutral path list
480// ─────────────────────────────────────────────────────────────────────────────
481
482/// The cataloged asset paths, sorted ascending. The VCS-neutral list a harness
483/// feeds into a `.gitignore` managed block or a sync-service exclude. db.md
484/// itself never writes any ignore file.
485///
486/// Every emitted path is routed through the same containment guard `scan`,
487/// `verify`, and `status` use — the module contract is that the guard applies
488/// "wherever a path is read or resolved" (SPEC § Assets > Path safety). A
489/// poisoned / hand-edited manifest path that escapes the store (absolute, or a
490/// `..` traversal — the `merge=union`-corruption state SPEC anticipates) is
491/// OMITTED, so this list — which a harness pipes straight into a `.gitignore`
492/// managed block or a sync-exclude — can never carry an out-of-store path. The
493/// list analog of how `verify` counts an escaping record corrupt and `status`
494/// counts it missing: a path that can't be a real store member is left out.
495pub fn paths(store: &Store) -> crate::Result<Vec<String>> {
496    Ok(read_manifest(store)?
497        .into_iter()
498        .filter(|r| store::ensure_path_within_store(&store.root, &store.root.join(&r.path)).is_ok())
499        .map(|r| r.path)
500        .collect())
501}
502
503// ─────────────────────────────────────────────────────────────────────────────
504// Declaration parsing (shared with `validate`)
505// ─────────────────────────────────────────────────────────────────────────────
506
507/// Read all `asset:` / `assets:` declarations from a parsed frontmatter.
508///
509/// `asset: <path>` is a single required declaration. `assets:` is a list whose
510/// items are either a bare path string (required) or a `{ path, required }`
511/// mapping. Both keys may be present.
512pub fn declared_assets(fm: &parser::Frontmatter) -> Vec<Declaration> {
513    let mut out = Vec::new();
514    if let Some(v) = fm.get("asset") {
515        collect_declarations(&v, &mut out);
516    }
517    if let Some(v) = fm.get("assets") {
518        collect_declarations(&v, &mut out);
519    }
520    out
521}
522
523/// Read declarations from an already-parsed YAML mapping. Used by
524/// [`crate::validate`], which holds the parsed mapping and need not re-read the
525/// file. Equivalent to [`declared_assets`] but keyed off a raw map.
526pub fn declarations_from_yaml_map(map: &BTreeMap<String, Value>) -> Vec<Declaration> {
527    let mut out = Vec::new();
528    if let Some(v) = map.get("asset") {
529        collect_declarations(v, &mut out);
530    }
531    if let Some(v) = map.get("assets") {
532        collect_declarations(v, &mut out);
533    }
534    out
535}
536
537fn collect_declarations(v: &Value, out: &mut Vec<Declaration>) {
538    match v {
539        Value::String(s) => out.push(Declaration {
540            path: s.clone(),
541            required: true,
542        }),
543        Value::Sequence(items) => {
544            for item in items {
545                match item {
546                    Value::String(s) => out.push(Declaration {
547                        path: s.clone(),
548                        required: true,
549                    }),
550                    Value::Mapping(m) => {
551                        let path = m
552                            .get(Value::String("path".to_string()))
553                            .and_then(|x| x.as_str())
554                            .map(|s| s.to_string());
555                        if let Some(path) = path {
556                            let required = m
557                                .get(Value::String("required".to_string()))
558                                .and_then(|x| x.as_bool())
559                                .unwrap_or(true);
560                            out.push(Declaration { path, required });
561                        }
562                    }
563                    _ => {}
564                }
565            }
566        }
567        _ => {}
568    }
569}
570
571// ─────────────────────────────────────────────────────────────────────────────
572// Helpers
573// ─────────────────────────────────────────────────────────────────────────────
574
/// Normalize a declared asset path to a CANONICAL store-relative forward-slash
/// string, rejecting absolute paths and any `..` / root component. This is the
/// lexical guard; [`crate::store::ensure_path_within_store`] is the resolved-path
/// guard applied before any disk read.
///
/// The result is the record key, so it MUST be canonical: `./sources/x.pdf`,
/// `sources/x.pdf`, `sources/./x.pdf`, and `sources\x.pdf` all denote the same
/// file and fold to the same key `sources/x.pdf`. The key is rebuilt from
/// `Normal` components only (dropping `CurDir`); hostile `..`/root/prefix
/// components are hard errors (never silently sanitized).
///
/// Backslashes are treated as separators on EVERY platform, and are unified to
/// `/` *before* the lexical checks. The output always rewrites `\` to `/`, so
/// the checks must see the same separators the output will contain: otherwise,
/// on Unix, `..\outside.txt` is one opaque `Normal` component that passes the
/// guard and is then emitted as `../outside.txt`.
///
/// # Errors
///
/// Returns a human-readable message when the path is empty, absolute, contains
/// `..`, has a root/prefix component, or names no file (only `.` components).
pub fn normalize_asset_path(raw: &str) -> Result<String, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("empty asset path".to_string());
    }
    // Unify separators first so the guard below inspects exactly the structure
    // the returned key will have (see the doc comment).
    let unified = trimmed.replace('\\', "/");
    let p = Path::new(&unified);
    if p.is_absolute() {
        return Err(format!("absolute asset path not allowed: {raw}"));
    }
    let mut segments: Vec<String> = Vec::new();
    for c in p.components() {
        match c {
            Component::ParentDir => return Err(format!("`..` not allowed in asset path: {raw}")),
            Component::Prefix(_) | Component::RootDir => {
                return Err(format!("asset path escapes the store: {raw}"))
            }
            // A `.` (CurDir) carries no path information — drop it so the key is
            // canonical and `./x` does not split into a second record from `x`.
            Component::CurDir => {}
            // The input is a `&str`, so every component is valid UTF-8 and the
            // lossy conversion never actually substitutes anything.
            Component::Normal(seg) => segments.push(seg.to_string_lossy().into_owned()),
        }
    }
    if segments.is_empty() {
        // The path was only `.`/`./` — no actual target.
        return Err(format!("asset path names no file: {raw}"));
    }
    Ok(segments.join("/"))
}
615
/// Whether `path` has the extension `md`, compared ASCII-case-insensitively.
/// A path with no extension (including a dot-file such as `.md`) is not
/// markdown.
fn is_markdown(path: &str) -> bool {
    match Path::new(path).extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("md"),
        None => false,
    }
}
623
/// Render a path as a string with every backslash rewritten to a forward
/// slash. Non-UTF-8 sequences are replaced lossily.
fn rel_to_string(p: &Path) -> String {
    p.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
627
628/// Stream the file through SHA-256 (constant memory) and return
629/// `(lowercase-hex digest, byte length)`.
630fn sha256_file(abs: &Path) -> std::io::Result<(String, u64)> {
631    let mut f = std::fs::File::open(abs)?;
632    let mut hasher = Sha256::new();
633    let mut buf = [0u8; 65536];
634    let mut total: u64 = 0;
635    loop {
636        let n = f.read(&mut buf)?;
637        if n == 0 {
638            break;
639        }
640        hasher.update(&buf[..n]);
641        total += n as u64;
642    }
643    let digest = hasher.finalize();
644    let mut hex = String::with_capacity(64);
645    for b in digest.iter() {
646        let _ = write!(hex, "{b:02x}");
647    }
648    Ok((hex, total))
649}
650
/// Best-effort MIME type for a path, chosen purely from its extension
/// (compared after ASCII-lowercasing). Unknown or missing extensions map to
/// `application/octet-stream`. Because the result depends only on the path
/// text, it does not break the manifest's rebuild equivalence.
fn media_type_for(path: &str) -> String {
    /// Lowercase extension -> MIME type.
    const BY_EXTENSION: &[(&str, &str)] = &[
        ("pdf", "application/pdf"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
        ("tiff", "image/tiff"),
        ("tif", "image/tiff"),
        ("mp4", "video/mp4"),
        ("mov", "video/quicktime"),
        ("webm", "video/webm"),
        ("mkv", "video/x-matroska"),
        ("mp3", "audio/mpeg"),
        ("wav", "audio/wav"),
        ("m4a", "audio/mp4"),
        ("flac", "audio/flac"),
        ("zip", "application/zip"),
        ("gz", "application/gzip"),
        ("tgz", "application/gzip"),
        ("tar", "application/x-tar"),
        ("csv", "text/csv"),
        ("tsv", "text/tab-separated-values"),
        ("json", "application/json"),
        ("xml", "application/xml"),
        ("txt", "text/plain"),
        ("vtt", "text/vtt"),
        ("srt", "application/x-subrip"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("epub", "application/epub+zip"),
        (
            "docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        ("doc", "application/msword"),
        ("xls", "application/vnd.ms-excel"),
        ("ppt", "application/vnd.ms-powerpoint"),
    ];

    let ext = match Path::new(path).extension().and_then(|e| e.to_str()) {
        Some(e) => e.to_ascii_lowercase(),
        None => String::new(),
    };
    BY_EXTENSION
        .iter()
        .find(|(known, _)| *known == ext.as_str())
        .map(|(_, mime)| *mime)
        .unwrap_or("application/octet-stream")
        .to_string()
}
698
699/// Non-markdown files under `sources/` that no wrapper declares (the
700/// un-wrappered-drop worklist). Walks the raw filesystem (so it sees files an
701/// ignore mechanism would hide), skips `index.*` sidecars and hidden entries.
702fn find_untracked(store: &Store, declared: &BTreeSet<String>) -> crate::Result<Vec<String>> {
703    let sources = store.root.join("sources");
704    if !sources.is_dir() {
705        return Ok(Vec::new());
706    }
707    let mut out = Vec::new();
708    for entry in walkdir::WalkDir::new(&sources)
709        .into_iter()
710        .filter_entry(|e| !is_hidden(e.file_name().to_str().unwrap_or("")))
711    {
712        let entry = match entry {
713            Ok(e) => e,
714            Err(_) => continue,
715        };
716        if !entry.file_type().is_file() {
717            continue;
718        }
719        let name = entry.file_name().to_str().unwrap_or("");
720        if is_markdown(name) || name == "index.jsonl" {
721            continue;
722        }
723        let rel = match entry.path().strip_prefix(&store.root) {
724            Ok(r) => rel_to_string(r),
725            Err(_) => continue,
726        };
727        if !declared.contains(&rel) {
728            out.push(rel);
729        }
730    }
731    out.sort();
732    Ok(out)
733}
734
/// Whether `name` is a hidden entry: it starts with `.` but is not one of the
/// special directory names `.` or `..`.
fn is_hidden(name: &str) -> bool {
    match name {
        "." | ".." => false,
        other => other.starts_with('.'),
    }
}
738
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes the minimal `DB.md` marker file into `root` and returns a
    /// default-config [`Store`] rooted there. Shared setup for every test that
    /// needs a store on disk.
    fn bare_store(root: &std::path::Path) -> Store {
        std::fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n# store\n").unwrap();
        Store {
            root: root.to_path_buf(),
            config: crate::parser::Config::default(),
        }
    }

    /// Creates a store containing one wrapper (`sources/a.pdf.md`) that declares
    /// one asset present on disk (`sources/a.pdf`), runs an initial `scan`, and
    /// returns `(tempdir_guard, store, canonical_manifest_string)`.
    ///
    /// The `TempDir` is handed back so the caller can keep it bound for the
    /// duration of the test.
    fn store_with_one_asset() -> (tempfile::TempDir, Store, String) {
        let tmp = tempfile::TempDir::new().unwrap();
        let store = bare_store(tmp.path());

        std::fs::create_dir_all(store.root.join("sources")).unwrap();
        std::fs::write(
            store.root.join("sources/a.pdf.md"),
            "---\ntype: pdf-source\nsummary: x\nasset: sources/a.pdf\n---\nbody\n",
        )
        .unwrap();
        std::fs::write(store.root.join("sources/a.pdf"), b"PDFBYTES").unwrap();

        let first_scan = scan(&store, false, false).unwrap();
        assert!(first_scan.wrote, "first scan writes the manifest");

        let canonical = std::fs::read_to_string(store.root.join(MANIFEST_FILE)).unwrap();
        (tmp, store, canonical)
    }

    /// Regression (adversarial review): `normalize_asset_path` folds leading,
    /// interior, and trailing no-op components so that `./sources/x.pdf` and
    /// `sources/x.pdf` map to a single canonical key rather than two records.
    /// Traversal, absolute, and file-less inputs must remain hard errors:
    /// normalization never quietly sanitizes a hostile path.
    #[test]
    fn normalize_asset_path_folds_curdir_and_rejects_traversal() {
        // Every spelling here names the same file and must yield the same key.
        for spelling in [
            "./sources/x.pdf",
            "sources/x.pdf",
            "sources/./x.pdf",
            "sources/x.pdf/",
        ] {
            assert_eq!(
                normalize_asset_path(spelling).unwrap(),
                "sources/x.pdf",
                "input: {spelling:?}"
            );
        }

        // Hostile / structural inputs are rejected, not sanitized.
        for hostile in ["../outside.txt", "sources/../../etc/passwd", "/abs/x.pdf"] {
            assert!(
                normalize_asset_path(hostile).is_err(),
                "input: {hostile:?}"
            );
        }

        // A `.`-only (or empty) path names no file at all.
        for nameless in [".", "./", ""] {
            assert!(
                normalize_asset_path(nameless).is_err(),
                "input: {nameless:?}"
            );
        }
    }

    /// Regression (adversarial review #16): a poisoned / hand-edited
    /// `assets.jsonl` whose `bytes` values add up past `u64::MAX` must neither
    /// abort `status` (debug overflow checks) nor silently wrap (release).
    /// `status` and `scan` are non-failing reports over an editable manifest,
    /// so their byte totals are expected to saturate.
    #[test]
    fn status_and_scan_saturate_on_overflowing_manifest_bytes() {
        let tmp = tempfile::TempDir::new().unwrap();
        let store = bare_store(tmp.path());

        // Two in-store records: u64::MAX bytes plus one more byte.
        let poisoned = concat!(
            r#"{"path":"records/a.bin","sha256":"x","bytes":18446744073709551615,"media_type":"application/octet-stream","wrappers":["records/w.md"],"required":true}"#,
            "\n",
            r#"{"path":"records/b.bin","sha256":"y","bytes":1,"media_type":"application/octet-stream","wrappers":["records/w.md"],"required":true}"#,
            "\n",
        );
        std::fs::write(store.root.join(MANIFEST_FILE), poisoned).unwrap();

        // Neither asset exists on disk, so both totals accumulate every record
        // and both must clamp at u64::MAX instead of panicking or wrapping.
        let report = status(&store).expect("status is non-failing on a poisoned manifest");
        assert_eq!(
            report.bytes_total,
            u64::MAX,
            "byte total must saturate, not wrap"
        );
        assert_eq!(
            report.bytes_missing,
            u64::MAX,
            "missing bytes must saturate too"
        );
        assert_eq!(report.total, 2);

        // The byte summation inside `scan` must survive the same manifest.
        scan(&store, true, false).expect("scan must not overflow on a poisoned manifest");
    }

    /// Regression (adversarial review): the no-change gate in `scan` has to
    /// compare the canonical serialization with the on-disk *bytes*, not with
    /// parsed records. A manifest whose single line appears twice (the git
    /// `merge=union` recovery case, SPEC § Assets) parses back to the same
    /// records, so a records-vs-records gate would report "no change" and
    /// leave the non-canonical bytes in place. `scan` must collapse it to the
    /// one canonical line and report `wrote: true`.
    #[test]
    fn scan_recompacts_duplicate_line_manifest() {
        let (_tmp, store, canonical) = store_with_one_asset();
        let manifest = store.root.join(MANIFEST_FILE);

        // Simulate a git `merge=union`: the canonical content, back to back.
        std::fs::write(&manifest, canonical.repeat(2)).unwrap();
        let line_count = std::fs::read_to_string(&manifest).unwrap().lines().count();
        assert_eq!(line_count, 2);

        let rescan = scan(&store, false, false).unwrap();
        assert!(
            rescan.wrote,
            "a non-canonical (duplicate-line) manifest must be recompacted and reported as updated"
        );

        let repaired = std::fs::read_to_string(&manifest).unwrap();
        assert_eq!(
            repaired.lines().count(),
            1,
            "duplicate lines must collapse to the single canonical line"
        );
        assert_eq!(
            repaired, canonical,
            "scan must restore the exact canonical bytes"
        );
    }

    /// Regression (adversarial review): a manifest whose bytes are
    /// non-canonical while its records are unchanged must be repaired by
    /// `scan`. The case exercised here is a missing trailing newline.
    #[test]
    fn scan_recompacts_noncanonical_byte_layout() {
        let (_tmp, store, canonical) = store_with_one_asset();
        let manifest = store.root.join(MANIFEST_FILE);

        // Same record, but with the terminating newline removed.
        let truncated = canonical.trim_end_matches('\n');
        std::fs::write(&manifest, truncated).unwrap();

        let rescan = scan(&store, false, false).unwrap();
        assert!(
            rescan.wrote,
            "a manifest missing its trailing newline must be recompacted"
        );

        let repaired = std::fs::read_to_string(&manifest).unwrap();
        assert_eq!(
            repaired, canonical,
            "scan must restore the canonical trailing newline"
        );
    }

    /// Regression (adversarial review): `paths` must apply the containment
    /// guard when it reads the manifest (SPEC § Assets > Path safety), like its
    /// sibling reads `verify` and `status`. Given a poisoned / hand-edited
    /// `assets.jsonl` holding an absolute path (`/etc/hosts`) and a
    /// `..`-traversal path, `paths` must not echo them back: they would flow
    /// into a harness's `.gitignore` managed block or sync-exclude. Because
    /// `paths` returns a list, the escaping records are omitted while the
    /// legitimate in-store path is still emitted unchanged.
    #[test]
    fn paths_omits_store_escaping_records() {
        let tmp = tempfile::TempDir::new().unwrap();
        let store = bare_store(tmp.path());

        // One legitimate in-store record followed by two store-escaping ones.
        let poisoned = concat!(
            r#"{"path":"sources/legit.pdf","sha256":"a","bytes":9,"media_type":"application/pdf","wrappers":["sources/legit.pdf.md"],"required":true}"#,
            "\n",
            r#"{"path":"../../../../../../etc/passwd","sha256":"b","bytes":4096,"media_type":"text/plain","wrappers":["sources/legit.pdf.md"],"required":false}"#,
            "\n",
            r#"{"path":"/etc/hosts","sha256":"c","bytes":4096,"media_type":"text/plain","wrappers":["sources/legit.pdf.md"],"required":false}"#,
            "\n",
        );
        std::fs::write(store.root.join(MANIFEST_FILE), poisoned).unwrap();

        let out = paths(&store).expect("paths is non-failing on a poisoned manifest");
        assert_eq!(
            out,
            vec!["sources/legit.pdf".to_string()],
            "only the safe in-store path is emitted; escaping paths are omitted"
        );
        assert!(
            out.iter().all(|p| !p.starts_with('/') && !p.contains("..")),
            "no absolute or `..` path may ever leak from `paths`: {out:?}"
        );
    }

    /// A clean manifest (every path inside the store) must pass through the
    /// containment filter untouched: each legitimate path is emitted and none
    /// is dropped.
    #[test]
    fn paths_passes_a_clean_manifest_through_unchanged() {
        let (_tmp, store, _canonical) = store_with_one_asset();
        let listed = paths(&store).expect("paths over a clean manifest");
        assert_eq!(listed, vec!["sources/a.pdf".to_string()]);
    }

    /// Idempotency guard for the byte-level gate: rescanning a store whose
    /// manifest is already canonical must report `wrote: false` and leave the
    /// file byte-identical.
    #[test]
    fn scan_canonical_manifest_is_left_untouched() {
        let (_tmp, store, canonical) = store_with_one_asset();
        let manifest = store.root.join(MANIFEST_FILE);

        let rescan = scan(&store, false, false).unwrap();
        assert!(
            !rescan.wrote,
            "a canonical, unchanged manifest must not be rewritten"
        );

        let untouched = std::fs::read_to_string(&manifest).unwrap();
        assert_eq!(
            untouched, canonical,
            "a no-op rescan must leave the manifest byte-identical"
        );
    }
}
dbmd_core/assets.rs

dbmd_core/
assets.rs