Skip to main content

dbmd_core/
assets.rs

1//! `assets` — the db.md asset layer.
2//!
3//! Raw binary assets (PDFs, recordings, large exports) belong to a store but
4//! are too heavy for Git. A content file (the **wrapper**) declares one via an
5//! `asset:` / `assets:` frontmatter key; this module records each in the
6//! root-level `assets.jsonl` manifest: store-relative path, SHA-256, size,
7//! media type, the declaring wrapper(s), and whether it is required for
8//! byte-completeness.
9//!
10//! The manifest is a **pure projection** of (wrappers + asset files on disk):
11//! every field is derivable, so a [`scan`] where the bytes are present
12//! reproduces it byte-for-byte, exactly like `index.jsonl`. db.md never
13//! transports the bytes and never names a storage provider; that is the
14//! VibeCraft layer's job, keyed off the SHA-256. This module never shells out
15//! to git and never touches the network.
16//!
17//! Four operations — one write, three reads:
18//!   - [`scan`]   (write) discover declared assets, hash present files, rewrite the manifest
19//!   - [`verify`] (read)  prove the local store is byte-complete for required assets
20//!   - [`status`] (read)  report present / missing without failing
21//!   - [`paths`]  (read)  the store-relative path list (for an ignore mechanism)
22//!
23//! Path safety: every declared path is validated store-relative (no `..`, no
24//! absolute, no escape) via [`crate::store::ensure_path_within_store`] wherever
25//! a path is read or resolved, so a poisoned manifest can never make `scan`
26//! hash, or a restore write, outside the store.
27
28use std::collections::{BTreeMap, BTreeSet};
29use std::fmt::Write as _;
30use std::io::Read as _;
31use std::path::{Component, Path, PathBuf};
32
33use serde::{Deserialize, Serialize};
34use serde_norway::Value;
35use sha2::{Digest, Sha256};
36
37use crate::parser;
38use crate::store::{self, Store};
39use crate::write_atomic;
40
/// File name of the asset manifest; always resolved directly under the store root.
pub const MANIFEST_FILE: &str = "assets.jsonl";
43
44/// One asset record — one line of `assets.jsonl`.
45///
46/// Every field is derivable from the store (wrapper frontmatter + the file on
47/// disk), so the manifest rebuilds byte-for-byte. Field declaration order is
48/// the canonical JSON key order; `wrappers` is always a sorted list (never a
49/// bare string) so serialization is deterministic.
50#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
51pub struct AssetRecord {
52    /// Store-relative path of the raw bytes, forward-slash, with extension. The
53    /// record key. May differ from `wrappers` (the wrapper is the `.md`).
54    pub path: String,
55    /// Lowercase-hex SHA-256 of the bytes: the integrity check and the provider
56    /// blob key. May repeat across records (identical bytes at two paths).
57    pub sha256: String,
58    /// Size in bytes.
59    pub bytes: u64,
60    /// Best-effort MIME type derived from the path extension.
61    pub media_type: String,
62    /// Store-relative path(s) of the content file(s) that declare this asset,
63    /// sorted ascending. Usually one.
64    pub wrappers: Vec<String>,
65    /// Whether the asset is required for byte-completeness (default `true`;
66    /// `false` only when every declaration marks it optional).
67    pub required: bool,
68}
69
/// One `asset:` / `assets:` entry as it appears in a wrapper's frontmatter,
/// before any path normalization.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Declaration {
    /// The path exactly as the author wrote it (expected to be store-relative;
    /// not yet validated or normalized).
    pub path: String,
    /// Whether the declaration asks for the asset to be required. A bare string,
    /// or a mapping that omits `required`, counts as `true`.
    pub required: bool,
}
79
80// ─────────────────────────────────────────────────────────────────────────────
81// Reports (serialized directly in `--json`; the CLI renders the text form)
82// ─────────────────────────────────────────────────────────────────────────────
83
84/// Result of [`scan`].
85#[derive(Debug, Serialize)]
86pub struct ScanReport {
87    pub manifest: String,
88    pub cataloged: usize,
89    pub hashed: usize,
90    pub preserved: usize,
91    pub bytes: u64,
92    pub wrote: bool,
93    pub dry_run: bool,
94    pub warnings: Vec<String>,
95    pub untracked: Vec<String>,
96}
97
98/// One asset's local state, used by [`status`] and [`verify`].
99#[derive(Debug, Serialize)]
100pub struct AssetState {
101    pub path: String,
102    pub sha256: String,
103    pub bytes: u64,
104    pub required: bool,
105    /// `present` / `missing` (status); `ok` / `missing` / `corrupt` (verify).
106    pub state: String,
107}
108
109/// Result of [`status`].
110#[derive(Debug, Serialize)]
111pub struct StatusReport {
112    pub total: usize,
113    pub present: usize,
114    pub missing: usize,
115    pub required_missing: usize,
116    pub optional_missing: usize,
117    pub bytes_total: u64,
118    pub bytes_missing: u64,
119    pub assets: Vec<AssetState>,
120}
121
122/// Result of [`verify`].
123#[derive(Debug, Serialize)]
124pub struct VerifyReport {
125    pub mode: String,
126    pub checked: usize,
127    pub ok: usize,
128    pub missing: Vec<String>,
129    pub corrupt: Vec<String>,
130    pub complete: bool,
131}
132
133// ─────────────────────────────────────────────────────────────────────────────
134// Manifest read / write
135// ─────────────────────────────────────────────────────────────────────────────
136
137/// Read `assets.jsonl` into records, deduped by path (last line wins) and
138/// sorted by path ascending. A missing manifest is an empty store, not an
139/// error. A malformed line is an `InvalidData` error (the CLI surfaces it;
140/// [`crate::validate`] flags it leniently as `ASSET_MANIFEST_MALFORMED`).
141pub fn read_manifest(store: &Store) -> crate::Result<Vec<AssetRecord>> {
142    let abs = store.root.join(MANIFEST_FILE);
143    if !abs.exists() {
144        return Ok(Vec::new());
145    }
146    let text = std::fs::read_to_string(&abs)?;
147    let mut by_path: BTreeMap<String, AssetRecord> = BTreeMap::new();
148    for (i, line) in text.lines().enumerate() {
149        if line.trim().is_empty() {
150            continue;
151        }
152        let rec: AssetRecord = serde_json::from_str(line).map_err(|e| {
153            std::io::Error::new(
154                std::io::ErrorKind::InvalidData,
155                format!("{MANIFEST_FILE} line {}: {e}", i + 1),
156            )
157        })?;
158        by_path.insert(rec.path.clone(), rec);
159    }
160    Ok(by_path.into_values().collect())
161}
162
163/// Write the manifest atomically (temp + fsync + rename, via [`write_atomic`]),
164/// records sorted by path ascending. An empty record set removes the file.
165pub fn write_manifest(store: &Store, records: &[AssetRecord]) -> crate::Result<()> {
166    let abs = store.root.join(MANIFEST_FILE);
167    if records.is_empty() {
168        if abs.exists() {
169            std::fs::remove_file(&abs)?;
170        }
171        return Ok(());
172    }
173    let mut sorted = records.to_vec();
174    sorted.sort_by(|a, b| a.path.cmp(&b.path));
175    let mut out = String::new();
176    for rec in &sorted {
177        let line = serde_json::to_string(rec).expect("AssetRecord serializes");
178        out.push_str(&line);
179        out.push('\n');
180    }
181    write_atomic(&abs, out.as_bytes())?;
182    Ok(())
183}
184
185// ─────────────────────────────────────────────────────────────────────────────
186// scan (write) — rebuild the manifest from wrapper declarations
187// ─────────────────────────────────────────────────────────────────────────────
188
189/// Walk every content file, read its `asset`/`assets` declarations, hash the
190/// present files, and (re)write the manifest. The manifest is a projection: a
191/// path no longer declared by any wrapper drops out. Bytes absent locally but
192/// previously cataloged are preserved (the eviction / disk-relief case) since
193/// they cannot be re-hashed. `dry_run` computes without writing; `untracked`
194/// additionally reports non-markdown files under `sources/` that no wrapper
195/// declares. Never writes when nothing changed (keeps the Git diff and the
196/// `--dry-run`-then-scan idempotent).
197pub fn scan(store: &Store, dry_run: bool, untracked: bool) -> crate::Result<ScanReport> {
198    // Tolerate a malformed existing manifest here: scan rebuilds from the files,
199    // so a corrupt prior file is simply replaced. We still read it (best effort)
200    // to preserve hashes for evicted (absent-but-cataloged) assets.
201    let existing_by_path: BTreeMap<String, AssetRecord> = read_manifest(store)
202        .unwrap_or_default()
203        .into_iter()
204        .map(|r| (r.path.clone(), r))
205        .collect();
206
207    // Aggregate declarations across all content files: path -> (wrappers, required).
208    let mut wrappers_by_path: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
209    let mut required_by_path: BTreeMap<String, bool> = BTreeMap::new();
210    let mut declared_paths: BTreeSet<String> = BTreeSet::new();
211    let mut warnings: Vec<String> = Vec::new();
212
213    for rel in store.walk()? {
214        let abs = store.abs_path(&rel);
215        let (fm, _body) = match parser::read_file(&abs) {
216            Ok(v) => v,
217            Err(_) => continue, // unparseable / not a content file: skip
218        };
219        let wrapper = rel_to_string(&rel);
220        for decl in declared_assets(&fm) {
221            let norm = match normalize_asset_path(&decl.path) {
222                Ok(n) => n,
223                Err(e) => {
224                    warnings.push(format!("{wrapper}: {e}"));
225                    continue;
226                }
227            };
228            if is_markdown(&norm) {
229                warnings.push(format!(
230                    "{wrapper}: asset path points at a markdown content file ({norm}); skipped"
231                ));
232                continue;
233            }
234            wrappers_by_path
235                .entry(norm.clone())
236                .or_default()
237                .insert(wrapper.clone());
238            let req = required_by_path.entry(norm.clone()).or_insert(false);
239            *req = *req || decl.required;
240            declared_paths.insert(norm);
241        }
242    }
243
244    // Build records.
245    let mut records: Vec<AssetRecord> = Vec::new();
246    let mut hashed = 0usize;
247    let mut preserved = 0usize;
248    for (path, wrappers) in &wrappers_by_path {
249        let required = *required_by_path.get(path).unwrap_or(&true);
250        let wrappers: Vec<String> = wrappers.iter().cloned().collect();
251
252        // Belt-and-suspenders containment check before any disk read.
253        let abs = match store::ensure_path_within_store(&store.root, &store.root.join(path)) {
254            Ok(p) => p,
255            Err(_) => {
256                warnings.push(format!("{path}: escapes the store root; skipped"));
257                continue;
258            }
259        };
260
261        if abs.is_dir() {
262            warnings.push(format!("{path}: is a directory, not a file; skipped"));
263            continue;
264        }
265        if abs.is_file() {
266            let (sha256, bytes) = sha256_file(&abs)?;
267            records.push(AssetRecord {
268                path: path.clone(),
269                sha256,
270                bytes,
271                media_type: media_type_for(path),
272                wrappers,
273                required,
274            });
275            hashed += 1;
276        } else if let Some(prev) = existing_by_path.get(path) {
277            // Evicted: bytes gone locally but previously cataloged. Preserve the
278            // committed hash/size (we cannot re-hash what is not here).
279            records.push(AssetRecord {
280                path: path.clone(),
281                sha256: prev.sha256.clone(),
282                bytes: prev.bytes,
283                media_type: media_type_for(path),
284                wrappers,
285                required,
286            });
287            preserved += 1;
288        } else {
289            warnings.push(format!(
290                "{path}: declared but absent and never cataloged; cannot hash (skipped)"
291            ));
292        }
293    }
294    records.sort_by(|a, b| a.path.cmp(&b.path));
295
296    // Saturating: poisoned-manifest `bytes` can overflow a plain `.sum()` (debug
297    // abort / release wrap); see `status`.
298    let bytes: u64 = records.iter().fold(0u64, |a, r| a.saturating_add(r.bytes));
299    let cataloged = records.len();
300
301    let untracked_list = if untracked {
302        find_untracked(store, &declared_paths)?
303    } else {
304        Vec::new()
305    };
306
307    // Only write when the canonical content actually changed.
308    let mut wrote = false;
309    if !dry_run {
310        let current = read_manifest(store).unwrap_or_default();
311        if current != records {
312            write_manifest(store, &records)?;
313            wrote = true;
314        }
315    }
316
317    Ok(ScanReport {
318        manifest: MANIFEST_FILE.to_string(),
319        cataloged,
320        hashed,
321        preserved,
322        bytes,
323        wrote,
324        dry_run,
325        warnings,
326        untracked: untracked_list,
327    })
328}
329
330// ─────────────────────────────────────────────────────────────────────────────
331// verify (read) — byte-completeness gate
332// ─────────────────────────────────────────────────────────────────────────────
333
334/// Check that every required asset (plus optional, under `include_optional`) is
335/// present locally and matches the manifest. `quick` = presence + size only
336/// (fast); otherwise a full SHA-256 re-hash. This is a SWEEP (O(asset bytes) in
337/// deep mode), never a loop op. `complete` is true iff nothing is missing or
338/// corrupt in the considered set.
339pub fn verify(store: &Store, include_optional: bool, quick: bool) -> crate::Result<VerifyReport> {
340    let records = read_manifest(store)?;
341    let mut missing = Vec::new();
342    let mut corrupt = Vec::new();
343    let mut checked = 0usize;
344
345    for rec in &records {
346        if !rec.required && !include_optional {
347            continue;
348        }
349        checked += 1;
350        let abs = match store::ensure_path_within_store(&store.root, &store.root.join(&rec.path)) {
351            Ok(p) => p,
352            Err(_) => {
353                // A manifest path that escapes the store is not restorable here.
354                corrupt.push(rec.path.clone());
355                continue;
356            }
357        };
358        if !abs.is_file() {
359            missing.push(rec.path.clone());
360            continue;
361        }
362        if quick {
363            let len = std::fs::metadata(&abs)?.len();
364            if len != rec.bytes {
365                corrupt.push(rec.path.clone());
366            }
367        } else {
368            let (sha, bytes) = sha256_file(&abs)?;
369            if sha != rec.sha256 || bytes != rec.bytes {
370                corrupt.push(rec.path.clone());
371            }
372        }
373    }
374
375    let ok = checked - missing.len() - corrupt.len();
376    let complete = missing.is_empty() && corrupt.is_empty();
377    Ok(VerifyReport {
378        mode: if quick { "quick" } else { "deep" }.to_string(),
379        checked,
380        ok,
381        missing,
382        corrupt,
383        complete,
384    })
385}
386
387// ─────────────────────────────────────────────────────────────────────────────
388// status (read) — non-failing presence report
389// ─────────────────────────────────────────────────────────────────────────────
390
391/// Report which cataloged assets are present locally and how many bytes remain
392/// to restore. Never fails on a missing asset (that is `verify`'s job); it does
393/// fail on a malformed manifest.
394pub fn status(store: &Store) -> crate::Result<StatusReport> {
395    let records = read_manifest(store)?;
396    let mut present = 0usize;
397    let mut missing = 0usize;
398    let mut required_missing = 0usize;
399    let mut optional_missing = 0usize;
400    let mut bytes_total = 0u64;
401    let mut bytes_missing = 0u64;
402    let mut assets = Vec::with_capacity(records.len());
403
404    for rec in &records {
405        // Saturating: `rec.bytes` is deserialized verbatim from a hand-editable /
406        // poisoned `assets.jsonl` with no clamp. An absurd value (~u64::MAX)
407        // summed with unchecked `+=` ABORTS in debug (overflow-checks) and
408        // silently WRAPS in release — and `status` is contractually non-failing.
409        bytes_total = bytes_total.saturating_add(rec.bytes);
410        // Resolve through the same containment guard `scan` and `verify` use:
411        // the module contract is that the guard applies "wherever a path is read
412        // or resolved", and an unguarded `is_file()` here let a poisoned/hand-
413        // edited manifest path (`../outside.txt`) report `present` (and count its
414        // bytes) while `verify` reported it `corrupt` — two read commands on the
415        // same store disagreeing, plus a path-existence oracle outside the store.
416        // An escaping record is treated as not-present (missing), matching verify.
417        let is_present = store::ensure_path_within_store(&store.root, &store.root.join(&rec.path))
418            .map(|p| p.is_file())
419            .unwrap_or(false);
420        let state = if is_present {
421            present += 1;
422            "present"
423        } else {
424            missing += 1;
425            bytes_missing = bytes_missing.saturating_add(rec.bytes);
426            if rec.required {
427                required_missing += 1;
428            } else {
429                optional_missing += 1;
430            }
431            "missing"
432        };
433        assets.push(AssetState {
434            path: rec.path.clone(),
435            sha256: rec.sha256.clone(),
436            bytes: rec.bytes,
437            required: rec.required,
438            state: state.to_string(),
439        });
440    }
441
442    Ok(StatusReport {
443        total: records.len(),
444        present,
445        missing,
446        required_missing,
447        optional_missing,
448        bytes_total,
449        bytes_missing,
450        assets,
451    })
452}
453
454// ─────────────────────────────────────────────────────────────────────────────
455// paths (read) — the VCS-neutral path list
456// ─────────────────────────────────────────────────────────────────────────────
457
458/// The cataloged asset paths, sorted ascending. The VCS-neutral list a harness
459/// feeds into a `.gitignore` managed block or a sync-service exclude. db.md
460/// itself never writes any ignore file.
461pub fn paths(store: &Store) -> crate::Result<Vec<String>> {
462    Ok(read_manifest(store)?.into_iter().map(|r| r.path).collect())
463}
464
465// ─────────────────────────────────────────────────────────────────────────────
466// Declaration parsing (shared with `validate`)
467// ─────────────────────────────────────────────────────────────────────────────
468
469/// Read all `asset:` / `assets:` declarations from a parsed frontmatter.
470///
471/// `asset: <path>` is a single required declaration. `assets:` is a list whose
472/// items are either a bare path string (required) or a `{ path, required }`
473/// mapping. Both keys may be present.
474pub fn declared_assets(fm: &parser::Frontmatter) -> Vec<Declaration> {
475    let mut out = Vec::new();
476    if let Some(v) = fm.get("asset") {
477        collect_declarations(&v, &mut out);
478    }
479    if let Some(v) = fm.get("assets") {
480        collect_declarations(&v, &mut out);
481    }
482    out
483}
484
485/// Read declarations from an already-parsed YAML mapping. Used by
486/// [`crate::validate`], which holds the parsed mapping and need not re-read the
487/// file. Equivalent to [`declared_assets`] but keyed off a raw map.
488pub fn declarations_from_yaml_map(map: &BTreeMap<String, Value>) -> Vec<Declaration> {
489    let mut out = Vec::new();
490    if let Some(v) = map.get("asset") {
491        collect_declarations(v, &mut out);
492    }
493    if let Some(v) = map.get("assets") {
494        collect_declarations(v, &mut out);
495    }
496    out
497}
498
499fn collect_declarations(v: &Value, out: &mut Vec<Declaration>) {
500    match v {
501        Value::String(s) => out.push(Declaration {
502            path: s.clone(),
503            required: true,
504        }),
505        Value::Sequence(items) => {
506            for item in items {
507                match item {
508                    Value::String(s) => out.push(Declaration {
509                        path: s.clone(),
510                        required: true,
511                    }),
512                    Value::Mapping(m) => {
513                        let path = m
514                            .get(Value::String("path".to_string()))
515                            .and_then(|x| x.as_str())
516                            .map(|s| s.to_string());
517                        if let Some(path) = path {
518                            let required = m
519                                .get(Value::String("required".to_string()))
520                                .and_then(|x| x.as_bool())
521                                .unwrap_or(true);
522                            out.push(Declaration { path, required });
523                        }
524                    }
525                    _ => {}
526                }
527            }
528        }
529        _ => {}
530    }
531}
532
533// ─────────────────────────────────────────────────────────────────────────────
534// Helpers
535// ─────────────────────────────────────────────────────────────────────────────
536
/// Normalize a declared asset path to a CANONICAL store-relative forward-slash
/// string, rejecting absolute paths and any `..` / root / prefix component. This
/// is the lexical guard; [`crate::store::ensure_path_within_store`] is the
/// resolved-path guard applied before any disk read.
///
/// The result is the record key, so it MUST be canonical: `./sources/x.pdf`,
/// `sources/x.pdf`, and `sources/./x.pdf` all denote the same file and fold to
/// the same key `sources/x.pdf`. The path is rebuilt from `Normal` components
/// only (dropping `CurDir`); hostile components are hard errors, never silently
/// sanitized.
///
/// Backslashes are treated as separators on EVERY platform, and they are
/// unified to `/` *before* the path is split into components. On Unix `\` is
/// not a separator, so splitting first would see `..\..\etc\passwd` as a single
/// harmless-looking name and only turn it into `../../etc/passwd` afterwards —
/// letting a traversal through the lexical guard and making the key depend on
/// the platform that ran the scan.
///
/// # Errors
///
/// Returns a human-readable message when the path is empty, absolute, contains
/// `..`, has a root/prefix component, or names no file (only `.` segments).
pub fn normalize_asset_path(raw: &str) -> Result<String, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("empty asset path".to_string());
    }

    // Unify separators first so the component checks below see every segment.
    let unified = trimmed.replace('\\', "/");
    let p = Path::new(&unified);
    if p.is_absolute() {
        return Err(format!("absolute asset path not allowed: {raw}"));
    }

    let mut normal: Vec<&std::ffi::OsStr> = Vec::new();
    for c in p.components() {
        match c {
            Component::ParentDir => return Err(format!("`..` not allowed in asset path: {raw}")),
            Component::Prefix(_) | Component::RootDir => {
                return Err(format!("asset path escapes the store: {raw}"))
            }
            // A `.` (CurDir) carries no path information — drop it so the key is
            // canonical and `./x` does not split into a second record from `x`.
            Component::CurDir => {}
            Component::Normal(seg) => normal.push(seg),
        }
    }
    if normal.is_empty() {
        // The path was only `.`/`./` — no actual target.
        return Err(format!("asset path names no file: {raw}"));
    }

    // Rejoining uses the platform separator; fold it back to `/` for the key.
    let joined: PathBuf = normal.into_iter().collect();
    Ok(joined.to_string_lossy().replace('\\', "/"))
}
577
/// `true` when the path's extension is `md`, compared ASCII case-insensitively.
/// A path with no extension (including a bare `.md` dotfile) is not markdown.
fn is_markdown(path: &str) -> bool {
    match Path::new(path).extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("md"),
        None => false,
    }
}
585
/// Render a path as text (lossily, for non-UTF-8 names) with every backslash
/// turned into a forward slash.
fn rel_to_string(p: &Path) -> String {
    let text = p.to_string_lossy();
    text.chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
589
590/// Stream the file through SHA-256 (constant memory) and return
591/// `(lowercase-hex digest, byte length)`.
592fn sha256_file(abs: &Path) -> std::io::Result<(String, u64)> {
593    let mut f = std::fs::File::open(abs)?;
594    let mut hasher = Sha256::new();
595    let mut buf = [0u8; 65536];
596    let mut total: u64 = 0;
597    loop {
598        let n = f.read(&mut buf)?;
599        if n == 0 {
600            break;
601        }
602        hasher.update(&buf[..n]);
603        total += n as u64;
604    }
605    let digest = hasher.finalize();
606    let mut hex = String::with_capacity(64);
607    for b in digest.iter() {
608        let _ = write!(hex, "{b:02x}");
609    }
610    Ok((hex, total))
611}
612
/// Guess a MIME type from the path's extension (ASCII case-insensitive).
///
/// Unknown or missing extensions map to `application/octet-stream`. The answer
/// depends on the extension alone, never on file contents, so it is
/// deterministic and does not break the manifest's rebuild equivalence.
fn media_type_for(path: &str) -> String {
    // (lowercase extension, MIME type)
    const KNOWN: &[(&str, &str)] = &[
        ("pdf", "application/pdf"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
        ("tiff", "image/tiff"),
        ("tif", "image/tiff"),
        ("mp4", "video/mp4"),
        ("mov", "video/quicktime"),
        ("webm", "video/webm"),
        ("mkv", "video/x-matroska"),
        ("mp3", "audio/mpeg"),
        ("wav", "audio/wav"),
        ("m4a", "audio/mp4"),
        ("flac", "audio/flac"),
        ("zip", "application/zip"),
        ("gz", "application/gzip"),
        ("tgz", "application/gzip"),
        ("tar", "application/x-tar"),
        ("csv", "text/csv"),
        ("tsv", "text/tab-separated-values"),
        ("json", "application/json"),
        ("xml", "application/xml"),
        ("txt", "text/plain"),
        ("vtt", "text/vtt"),
        ("srt", "application/x-subrip"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("epub", "application/epub+zip"),
        (
            "docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        ("doc", "application/msword"),
        ("xls", "application/vnd.ms-excel"),
        ("ppt", "application/vnd.ms-powerpoint"),
    ];

    let ext: String = Path::new(path)
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    KNOWN
        .iter()
        .find(|entry| entry.0 == ext)
        .map_or("application/octet-stream", |entry| entry.1)
        .to_string()
}
660
661/// Non-markdown files under `sources/` that no wrapper declares (the
662/// un-wrappered-drop worklist). Walks the raw filesystem (so it sees files an
663/// ignore mechanism would hide), skips `index.*` sidecars and hidden entries.
664fn find_untracked(store: &Store, declared: &BTreeSet<String>) -> crate::Result<Vec<String>> {
665    let sources = store.root.join("sources");
666    if !sources.is_dir() {
667        return Ok(Vec::new());
668    }
669    let mut out = Vec::new();
670    for entry in walkdir::WalkDir::new(&sources)
671        .into_iter()
672        .filter_entry(|e| !is_hidden(e.file_name().to_str().unwrap_or("")))
673    {
674        let entry = match entry {
675            Ok(e) => e,
676            Err(_) => continue,
677        };
678        if !entry.file_type().is_file() {
679            continue;
680        }
681        let name = entry.file_name().to_str().unwrap_or("");
682        if is_markdown(name) || name == "index.jsonl" {
683            continue;
684        }
685        let rel = match entry.path().strip_prefix(&store.root) {
686            Ok(r) => rel_to_string(r),
687            Err(_) => continue,
688        };
689        if !declared.contains(&rel) {
690            out.push(rel);
691        }
692    }
693    out.sort();
694    Ok(out)
695}
696
/// `true` for dot-prefixed names, except the special entries `.` and `..`.
fn is_hidden(name: &str) -> bool {
    !matches!(name, "." | "..") && name.starts_with('.')
}
700
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression (adversarial review): a leading or interior `.` (CurDir) must
    /// fold away so `./sources/x.pdf` and `sources/x.pdf` share ONE record key
    /// instead of being duplicated, double-counted, and reported untracked.
    /// Traversal, absolute, and root paths stay hard errors — folding must never
    /// quietly sanitize a hostile path.
    #[test]
    fn normalize_asset_path_folds_curdir_and_rejects_traversal() {
        // Every spelling of the same file folds to the same canonical key.
        for raw in [
            "./sources/x.pdf",
            "sources/x.pdf",
            "sources/./x.pdf",
            "sources/x.pdf/",
        ] {
            assert_eq!(normalize_asset_path(raw).unwrap(), "sources/x.pdf");
        }

        // Hostile / structural inputs are rejected, not sanitized; a `.`-only or
        // empty path names no file.
        for raw in [
            "../outside.txt",
            "sources/../../etc/passwd",
            "/abs/x.pdf",
            ".",
            "./",
            "",
        ] {
            assert!(normalize_asset_path(raw).is_err());
        }
    }

    /// Regression (adversarial review #16): a poisoned / hand-edited
    /// `assets.jsonl` whose `bytes` values sum past u64::MAX must neither abort
    /// `status` (overflow checks) nor silently wrap. These are non-failing
    /// reports over an editable manifest, so byte totals SATURATE.
    #[test]
    fn status_and_scan_saturate_on_overflowing_manifest_bytes() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        std::fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n# store\n").unwrap();

        // Two in-store records whose byte sizes sum past u64::MAX.
        let poisoned = "{\"path\":\"records/a.bin\",\"sha256\":\"x\",\"bytes\":18446744073709551615,\
\"media_type\":\"application/octet-stream\",\"wrappers\":[\"records/w.md\"],\"required\":true}\n\
{\"path\":\"records/b.bin\",\"sha256\":\"y\",\"bytes\":1,\
\"media_type\":\"application/octet-stream\",\"wrappers\":[\"records/w.md\"],\"required\":true}\n";
        std::fs::write(root.join("assets.jsonl"), poisoned).unwrap();

        let store = Store {
            root: root.to_path_buf(),
            config: crate::parser::Config::default(),
        };

        // status: must not panic. Neither asset exists on disk, so both the
        // overall and the missing totals accumulate them and pin at u64::MAX.
        let report = status(&store).expect("status is non-failing on a poisoned manifest");
        assert_eq!(
            report.bytes_total,
            u64::MAX,
            "byte total must saturate, not wrap"
        );
        assert_eq!(
            report.bytes_missing,
            u64::MAX,
            "missing bytes must saturate too"
        );
        assert_eq!(report.total, 2);

        // A dry-run scan over the same store must also complete without panicking.
        scan(&store, true, false).expect("scan must not overflow on a poisoned manifest");
    }
}