1use std::collections::{BTreeMap, BTreeSet};
29use std::fmt::Write as _;
30use std::io::Read as _;
31use std::path::{Component, Path, PathBuf};
32
33use serde::{Deserialize, Serialize};
34use serde_norway::Value;
35use sha2::{Digest, Sha256};
36
37use crate::parser;
38use crate::store::{self, Store};
39use crate::write_atomic;
40
/// File name of the asset manifest. It is always joined onto the store root,
/// and is read and written as one JSON record per line.
pub const MANIFEST_FILE: &str = "assets.jsonl";
43
/// One manifest entry describing a single cataloged asset.
///
/// Each record is stored as one JSON object per manifest line. serde emits the
/// fields in declaration order, so reordering them changes the manifest text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AssetRecord {
    /// Store-relative asset path; also the key records are de-duplicated and sorted by.
    pub path: String,
    /// SHA-256 digest of the file contents as lowercase hex.
    pub sha256: String,
    /// Size of the file in bytes at the time it was hashed.
    pub bytes: u64,
    /// Media type guessed from the file extension.
    pub media_type: String,
    /// Store-relative paths of the files whose frontmatter declares this asset.
    pub wrappers: Vec<String>,
    /// `true` when at least one declaration marks the asset as required.
    pub required: bool,
}
69
/// A single asset declaration as it appears in a file's frontmatter.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Declaration {
    /// Asset path exactly as written in the frontmatter (not yet normalized).
    pub path: String,
    /// Whether the asset is required; declarations that do not say so default to `true`.
    pub required: bool,
}
79
/// Outcome of a [`scan`] run.
#[derive(Debug, Serialize)]
pub struct ScanReport {
    /// Name of the manifest file the scan targets.
    pub manifest: String,
    /// Number of records in the rebuilt manifest.
    pub cataloged: usize,
    /// Number of records whose file was present and hashed during this scan.
    pub hashed: usize,
    /// Number of records carried over from the previous manifest because the file is absent.
    pub preserved: usize,
    /// Saturating sum of the `bytes` of all cataloged records.
    pub bytes: u64,
    /// `true` when the manifest on disk was rewritten (or removed) by this scan.
    pub wrote: bool,
    /// Echo of the `dry_run` argument.
    pub dry_run: bool,
    /// Human-readable notes about declarations that were skipped.
    pub warnings: Vec<String>,
    /// Undeclared files found under `sources/`; empty unless requested.
    pub untracked: Vec<String>,
}
97
/// Presence state of one cataloged asset, as reported by [`status`].
#[derive(Debug, Serialize)]
pub struct AssetState {
    /// Store-relative asset path copied from the manifest record.
    pub path: String,
    /// Digest recorded in the manifest (not recomputed by `status`).
    pub sha256: String,
    /// Size recorded in the manifest.
    pub bytes: u64,
    /// Whether the manifest marks the asset as required.
    pub required: bool,
    /// Either `"present"` or `"missing"`.
    pub state: String,
}
108
/// Aggregate presence summary over every record in the manifest.
#[derive(Debug, Serialize)]
pub struct StatusReport {
    /// Number of records in the manifest.
    pub total: usize,
    /// Records whose file exists inside the store.
    pub present: usize,
    /// Records whose file does not exist (or resolves outside the store).
    pub missing: usize,
    /// Missing records that are marked required.
    pub required_missing: usize,
    /// Missing records that are not marked required.
    pub optional_missing: usize,
    /// Saturating sum of the recorded sizes of all records.
    pub bytes_total: u64,
    /// Saturating sum of the recorded sizes of the missing records.
    pub bytes_missing: u64,
    /// Per-asset detail, in manifest order.
    pub assets: Vec<AssetState>,
}
121
/// Outcome of a [`verify`] run.
#[derive(Debug, Serialize)]
pub struct VerifyReport {
    /// `"quick"` (size comparison only) or `"deep"` (size and SHA-256).
    pub mode: String,
    /// Number of records that were examined.
    pub checked: usize,
    /// Examined records that were neither missing nor corrupt.
    pub ok: usize,
    /// Paths of examined records whose file does not exist.
    pub missing: Vec<String>,
    /// Paths of examined records that mismatch the manifest or resolve outside the store.
    pub corrupt: Vec<String>,
    /// `true` when both `missing` and `corrupt` are empty.
    pub complete: bool,
}
132
133pub fn read_manifest(store: &Store) -> crate::Result<Vec<AssetRecord>> {
142 let abs = store.root.join(MANIFEST_FILE);
143 if !abs.exists() {
144 return Ok(Vec::new());
145 }
146 let text = std::fs::read_to_string(&abs)?;
147 let mut by_path: BTreeMap<String, AssetRecord> = BTreeMap::new();
148 for (i, line) in text.lines().enumerate() {
149 if line.trim().is_empty() {
150 continue;
151 }
152 let rec: AssetRecord = serde_json::from_str(line).map_err(|e| {
153 std::io::Error::new(
154 std::io::ErrorKind::InvalidData,
155 format!("{MANIFEST_FILE} line {}: {e}", i + 1),
156 )
157 })?;
158 by_path.insert(rec.path.clone(), rec);
159 }
160 Ok(by_path.into_values().collect())
161}
162
163pub fn write_manifest(store: &Store, records: &[AssetRecord]) -> crate::Result<()> {
166 let abs = store.root.join(MANIFEST_FILE);
167 if records.is_empty() {
168 if abs.exists() {
169 std::fs::remove_file(&abs)?;
170 }
171 return Ok(());
172 }
173 let mut sorted = records.to_vec();
174 sorted.sort_by(|a, b| a.path.cmp(&b.path));
175 let mut out = String::new();
176 for rec in &sorted {
177 let line = serde_json::to_string(rec).expect("AssetRecord serializes");
178 out.push_str(&line);
179 out.push('\n');
180 }
181 write_atomic(&abs, out.as_bytes())?;
182 Ok(())
183}
184
185pub fn scan(store: &Store, dry_run: bool, untracked: bool) -> crate::Result<ScanReport> {
198 let existing_by_path: BTreeMap<String, AssetRecord> = read_manifest(store)
202 .unwrap_or_default()
203 .into_iter()
204 .map(|r| (r.path.clone(), r))
205 .collect();
206
207 let mut wrappers_by_path: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
209 let mut required_by_path: BTreeMap<String, bool> = BTreeMap::new();
210 let mut declared_paths: BTreeSet<String> = BTreeSet::new();
211 let mut warnings: Vec<String> = Vec::new();
212
213 for rel in store.walk()? {
214 let abs = store.abs_path(&rel);
215 let (fm, _body) = match parser::read_file(&abs) {
216 Ok(v) => v,
217 Err(_) => continue, };
219 let wrapper = rel_to_string(&rel);
220 for decl in declared_assets(&fm) {
221 let norm = match normalize_asset_path(&decl.path) {
222 Ok(n) => n,
223 Err(e) => {
224 warnings.push(format!("{wrapper}: {e}"));
225 continue;
226 }
227 };
228 if is_markdown(&norm) {
229 warnings.push(format!(
230 "{wrapper}: asset path points at a markdown content file ({norm}); skipped"
231 ));
232 continue;
233 }
234 wrappers_by_path
235 .entry(norm.clone())
236 .or_default()
237 .insert(wrapper.clone());
238 let req = required_by_path.entry(norm.clone()).or_insert(false);
239 *req = *req || decl.required;
240 declared_paths.insert(norm);
241 }
242 }
243
244 let mut records: Vec<AssetRecord> = Vec::new();
246 let mut hashed = 0usize;
247 let mut preserved = 0usize;
248 for (path, wrappers) in &wrappers_by_path {
249 let required = *required_by_path.get(path).unwrap_or(&true);
250 let wrappers: Vec<String> = wrappers.iter().cloned().collect();
251
252 let abs = match store::ensure_path_within_store(&store.root, &store.root.join(path)) {
254 Ok(p) => p,
255 Err(_) => {
256 warnings.push(format!("{path}: escapes the store root; skipped"));
257 continue;
258 }
259 };
260
261 if abs.is_dir() {
262 warnings.push(format!("{path}: is a directory, not a file; skipped"));
263 continue;
264 }
265 if abs.is_file() {
266 let (sha256, bytes) = sha256_file(&abs)?;
267 records.push(AssetRecord {
268 path: path.clone(),
269 sha256,
270 bytes,
271 media_type: media_type_for(path),
272 wrappers,
273 required,
274 });
275 hashed += 1;
276 } else if let Some(prev) = existing_by_path.get(path) {
277 records.push(AssetRecord {
280 path: path.clone(),
281 sha256: prev.sha256.clone(),
282 bytes: prev.bytes,
283 media_type: media_type_for(path),
284 wrappers,
285 required,
286 });
287 preserved += 1;
288 } else {
289 warnings.push(format!(
290 "{path}: declared but absent and never cataloged; cannot hash (skipped)"
291 ));
292 }
293 }
294 records.sort_by(|a, b| a.path.cmp(&b.path));
295
296 let bytes: u64 = records.iter().fold(0u64, |a, r| a.saturating_add(r.bytes));
299 let cataloged = records.len();
300
301 let untracked_list = if untracked {
302 find_untracked(store, &declared_paths)?
303 } else {
304 Vec::new()
305 };
306
307 let mut wrote = false;
309 if !dry_run {
310 let current = read_manifest(store).unwrap_or_default();
311 if current != records {
312 write_manifest(store, &records)?;
313 wrote = true;
314 }
315 }
316
317 Ok(ScanReport {
318 manifest: MANIFEST_FILE.to_string(),
319 cataloged,
320 hashed,
321 preserved,
322 bytes,
323 wrote,
324 dry_run,
325 warnings,
326 untracked: untracked_list,
327 })
328}
329
330pub fn verify(store: &Store, include_optional: bool, quick: bool) -> crate::Result<VerifyReport> {
340 let records = read_manifest(store)?;
341 let mut missing = Vec::new();
342 let mut corrupt = Vec::new();
343 let mut checked = 0usize;
344
345 for rec in &records {
346 if !rec.required && !include_optional {
347 continue;
348 }
349 checked += 1;
350 let abs = match store::ensure_path_within_store(&store.root, &store.root.join(&rec.path)) {
351 Ok(p) => p,
352 Err(_) => {
353 corrupt.push(rec.path.clone());
355 continue;
356 }
357 };
358 if !abs.is_file() {
359 missing.push(rec.path.clone());
360 continue;
361 }
362 if quick {
363 let len = std::fs::metadata(&abs)?.len();
364 if len != rec.bytes {
365 corrupt.push(rec.path.clone());
366 }
367 } else {
368 let (sha, bytes) = sha256_file(&abs)?;
369 if sha != rec.sha256 || bytes != rec.bytes {
370 corrupt.push(rec.path.clone());
371 }
372 }
373 }
374
375 let ok = checked - missing.len() - corrupt.len();
376 let complete = missing.is_empty() && corrupt.is_empty();
377 Ok(VerifyReport {
378 mode: if quick { "quick" } else { "deep" }.to_string(),
379 checked,
380 ok,
381 missing,
382 corrupt,
383 complete,
384 })
385}
386
387pub fn status(store: &Store) -> crate::Result<StatusReport> {
395 let records = read_manifest(store)?;
396 let mut present = 0usize;
397 let mut missing = 0usize;
398 let mut required_missing = 0usize;
399 let mut optional_missing = 0usize;
400 let mut bytes_total = 0u64;
401 let mut bytes_missing = 0u64;
402 let mut assets = Vec::with_capacity(records.len());
403
404 for rec in &records {
405 bytes_total = bytes_total.saturating_add(rec.bytes);
410 let is_present = store::ensure_path_within_store(&store.root, &store.root.join(&rec.path))
418 .map(|p| p.is_file())
419 .unwrap_or(false);
420 let state = if is_present {
421 present += 1;
422 "present"
423 } else {
424 missing += 1;
425 bytes_missing = bytes_missing.saturating_add(rec.bytes);
426 if rec.required {
427 required_missing += 1;
428 } else {
429 optional_missing += 1;
430 }
431 "missing"
432 };
433 assets.push(AssetState {
434 path: rec.path.clone(),
435 sha256: rec.sha256.clone(),
436 bytes: rec.bytes,
437 required: rec.required,
438 state: state.to_string(),
439 });
440 }
441
442 Ok(StatusReport {
443 total: records.len(),
444 present,
445 missing,
446 required_missing,
447 optional_missing,
448 bytes_total,
449 bytes_missing,
450 assets,
451 })
452}
453
454pub fn paths(store: &Store) -> crate::Result<Vec<String>> {
462 Ok(read_manifest(store)?.into_iter().map(|r| r.path).collect())
463}
464
465pub fn declared_assets(fm: &parser::Frontmatter) -> Vec<Declaration> {
475 let mut out = Vec::new();
476 if let Some(v) = fm.get("asset") {
477 collect_declarations(&v, &mut out);
478 }
479 if let Some(v) = fm.get("assets") {
480 collect_declarations(&v, &mut out);
481 }
482 out
483}
484
485pub fn declarations_from_yaml_map(map: &BTreeMap<String, Value>) -> Vec<Declaration> {
489 let mut out = Vec::new();
490 if let Some(v) = map.get("asset") {
491 collect_declarations(v, &mut out);
492 }
493 if let Some(v) = map.get("assets") {
494 collect_declarations(v, &mut out);
495 }
496 out
497}
498
499fn collect_declarations(v: &Value, out: &mut Vec<Declaration>) {
500 match v {
501 Value::String(s) => out.push(Declaration {
502 path: s.clone(),
503 required: true,
504 }),
505 Value::Sequence(items) => {
506 for item in items {
507 match item {
508 Value::String(s) => out.push(Declaration {
509 path: s.clone(),
510 required: true,
511 }),
512 Value::Mapping(m) => {
513 let path = m
514 .get(Value::String("path".to_string()))
515 .and_then(|x| x.as_str())
516 .map(|s| s.to_string());
517 if let Some(path) = path {
518 let required = m
519 .get(Value::String("required".to_string()))
520 .and_then(|x| x.as_bool())
521 .unwrap_or(true);
522 out.push(Declaration { path, required });
523 }
524 }
525 _ => {}
526 }
527 }
528 }
529 _ => {}
530 }
531}
532
/// Normalizes a declared asset path to a store-relative path using `/`.
///
/// Surrounding whitespace is trimmed, `.` components are dropped, and both
/// `/` and `\` are treated as separators on every platform.
///
/// # Errors
///
/// Returns a human-readable message when the path is empty, absolute, contains
/// a `..` component, carries a root or drive prefix, or names no file at all
/// (for example `.` or `./`).
pub fn normalize_asset_path(raw: &str) -> Result<String, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("empty asset path".to_string());
    }

    // Unify separators BEFORE validating. On Unix `\` is an ordinary character,
    // so `..\..\etc` would be one "normal" component and pass the `..` check,
    // only to become `../../etc` once the backslashes are rewritten.
    let unified = trimmed.replace('\\', "/");
    let p = Path::new(&unified);
    if p.is_absolute() {
        return Err(format!("absolute asset path not allowed: {raw}"));
    }

    let mut normal: Vec<&std::ffi::OsStr> = Vec::new();
    for c in p.components() {
        match c {
            Component::ParentDir => return Err(format!("`..` not allowed in asset path: {raw}")),
            Component::Prefix(_) | Component::RootDir => {
                return Err(format!("asset path escapes the store: {raw}"))
            }
            // `.` adds nothing to the path; fold it away.
            Component::CurDir => {}
            Component::Normal(seg) => normal.push(seg),
        }
    }
    if normal.is_empty() {
        return Err(format!("asset path names no file: {raw}"));
    }

    let joined: PathBuf = normal.into_iter().collect();
    // On Windows the rebuilt path uses `\`; the manifest always stores `/`.
    Ok(joined.to_string_lossy().replace('\\', "/"))
}
577
/// Reports whether `path` has an `md` extension, compared ASCII
/// case-insensitively.
fn is_markdown(path: &str) -> bool {
    match Path::new(path).extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("md"),
        None => false,
    }
}
585
/// Renders a path as text with every `\` replaced by `/`; non-UTF-8 parts are
/// converted lossily.
fn rel_to_string(p: &Path) -> String {
    p.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
589
590fn sha256_file(abs: &Path) -> std::io::Result<(String, u64)> {
593 let mut f = std::fs::File::open(abs)?;
594 let mut hasher = Sha256::new();
595 let mut buf = [0u8; 65536];
596 let mut total: u64 = 0;
597 loop {
598 let n = f.read(&mut buf)?;
599 if n == 0 {
600 break;
601 }
602 hasher.update(&buf[..n]);
603 total += n as u64;
604 }
605 let digest = hasher.finalize();
606 let mut hex = String::with_capacity(64);
607 for b in digest.iter() {
608 let _ = write!(hex, "{b:02x}");
609 }
610 Ok((hex, total))
611}
612
/// Guesses a media type from the extension of `path`.
///
/// The extension is matched ASCII case-insensitively. Paths with no extension,
/// a non-UTF-8 extension, or an unknown one map to `application/octet-stream`.
fn media_type_for(path: &str) -> String {
    const FALLBACK: &str = "application/octet-stream";
    // (lowercase extension, media type)
    const KNOWN: &[(&str, &str)] = &[
        ("pdf", "application/pdf"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
        ("tiff", "image/tiff"),
        ("tif", "image/tiff"),
        ("mp4", "video/mp4"),
        ("mov", "video/quicktime"),
        ("webm", "video/webm"),
        ("mkv", "video/x-matroska"),
        ("mp3", "audio/mpeg"),
        ("wav", "audio/wav"),
        ("m4a", "audio/mp4"),
        ("flac", "audio/flac"),
        ("zip", "application/zip"),
        ("gz", "application/gzip"),
        ("tgz", "application/gzip"),
        ("tar", "application/x-tar"),
        ("csv", "text/csv"),
        ("tsv", "text/tab-separated-values"),
        ("json", "application/json"),
        ("xml", "application/xml"),
        ("txt", "text/plain"),
        ("vtt", "text/vtt"),
        ("srt", "application/x-subrip"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("epub", "application/epub+zip"),
        (
            "docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        ("doc", "application/msword"),
        ("xls", "application/vnd.ms-excel"),
        ("ppt", "application/vnd.ms-powerpoint"),
    ];

    let ext = match Path::new(path).extension().and_then(|e| e.to_str()) {
        Some(e) => e.to_ascii_lowercase(),
        None => return FALLBACK.to_string(),
    };

    KNOWN
        .iter()
        .find(|entry| entry.0 == ext)
        .map(|entry| entry.1)
        .unwrap_or(FALLBACK)
        .to_string()
}
660
661fn find_untracked(store: &Store, declared: &BTreeSet<String>) -> crate::Result<Vec<String>> {
665 let sources = store.root.join("sources");
666 if !sources.is_dir() {
667 return Ok(Vec::new());
668 }
669 let mut out = Vec::new();
670 for entry in walkdir::WalkDir::new(&sources)
671 .into_iter()
672 .filter_entry(|e| !is_hidden(e.file_name().to_str().unwrap_or("")))
673 {
674 let entry = match entry {
675 Ok(e) => e,
676 Err(_) => continue,
677 };
678 if !entry.file_type().is_file() {
679 continue;
680 }
681 let name = entry.file_name().to_str().unwrap_or("");
682 if is_markdown(name) || name == "index.jsonl" {
683 continue;
684 }
685 let rel = match entry.path().strip_prefix(&store.root) {
686 Ok(r) => rel_to_string(r),
687 Err(_) => continue,
688 };
689 if !declared.contains(&rel) {
690 out.push(rel);
691 }
692 }
693 out.sort();
694 Ok(out)
695}
696
/// Reports whether `name` is a dot-prefixed entry name, excluding the special
/// names `.` and `..`.
fn is_hidden(name: &str) -> bool {
    match name {
        "." | ".." => false,
        other => other.starts_with('.'),
    }
}
700
#[cfg(test)]
mod tests {
    use super::*;

    /// `normalize_asset_path` drops `.` components and a trailing slash, and
    /// rejects parent traversal, absolute paths, and paths that name no file.
    #[test]
    fn normalize_asset_path_folds_curdir_and_rejects_traversal() {
        // Accepted spellings all collapse to the same store-relative path.
        assert_eq!(
            normalize_asset_path("./sources/x.pdf").unwrap(),
            "sources/x.pdf"
        );
        assert_eq!(
            normalize_asset_path("sources/x.pdf").unwrap(),
            "sources/x.pdf"
        );
        assert_eq!(
            normalize_asset_path("sources/./x.pdf").unwrap(),
            "sources/x.pdf"
        );
        assert_eq!(
            normalize_asset_path("sources/x.pdf/").unwrap(),
            "sources/x.pdf"
        );

        // Traversal and absolute paths are rejected.
        assert!(normalize_asset_path("../outside.txt").is_err());
        assert!(normalize_asset_path("sources/../../etc/passwd").is_err());
        assert!(normalize_asset_path("/abs/x.pdf").is_err());
        // Paths with no normal component name no file.
        assert!(normalize_asset_path(".").is_err());
        assert!(normalize_asset_path("./").is_err());
        assert!(normalize_asset_path("").is_err());
    }

    /// A manifest whose recorded sizes sum past `u64::MAX` must make `status`
    /// saturate rather than overflow, and must not make a dry-run `scan` fail.
    #[test]
    fn status_and_scan_saturate_on_overflowing_manifest_bytes() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        std::fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n# store\n").unwrap();
        // Two records: one claiming u64::MAX bytes, one claiming 1 byte.
        // Neither file exists on disk, so both count as missing.
        std::fs::write(
            root.join("assets.jsonl"),
            "{\"path\":\"records/a.bin\",\"sha256\":\"x\",\"bytes\":18446744073709551615,\
\"media_type\":\"application/octet-stream\",\"wrappers\":[\"records/w.md\"],\"required\":true}\n\
{\"path\":\"records/b.bin\",\"sha256\":\"y\",\"bytes\":1,\
\"media_type\":\"application/octet-stream\",\"wrappers\":[\"records/w.md\"],\"required\":true}\n",
        )
        .unwrap();
        let store = Store {
            root: root.to_path_buf(),
            config: crate::parser::Config::default(),
        };

        let report = status(&store).expect("status is non-failing on a poisoned manifest");
        assert_eq!(
            report.bytes_total,
            u64::MAX,
            "byte total must saturate, not wrap"
        );
        assert_eq!(
            report.bytes_missing,
            u64::MAX,
            "missing bytes must saturate too"
        );
        assert_eq!(report.total, 2);

        // Dry run: the poisoned manifest is read but never rewritten.
        scan(&store, true, false).expect("scan must not overflow on a poisoned manifest");
    }
}