1use std::collections::{BTreeMap, BTreeSet};
29use std::fmt::Write as _;
30use std::io::Read as _;
31use std::path::{Component, Path, PathBuf};
32
33use serde::{Deserialize, Serialize};
34use serde_norway::Value;
35use sha2::{Digest, Sha256};
36
37use crate::parser;
38use crate::store::{self, Store};
39use crate::write_atomic;
40
/// File name of the asset manifest, resolved relative to the store root.
///
/// The file holds one JSON-encoded [`AssetRecord`] per line.
pub const MANIFEST_FILE: &str = "assets.jsonl";
43
/// One cataloged asset; serialized as a single JSON line of the manifest.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AssetRecord {
    /// Path of the asset; it is joined onto the store root whenever the file is resolved.
    pub path: String,
    /// SHA-256 digest of the file contents (lowercase hex when produced by `scan`).
    pub sha256: String,
    /// Size of the file in bytes.
    pub bytes: u64,
    /// Media type; `scan` derives it from the path's extension.
    pub media_type: String,
    /// Paths of the content files whose frontmatter declares this asset.
    pub wrappers: Vec<String>,
    /// Whether the asset is required; optional assets are skipped by `verify` unless asked for.
    pub required: bool,
}
69
/// An asset reference exactly as it appears in a file's frontmatter.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Declaration {
    /// The declared path, verbatim (not yet passed through `normalize_asset_path`).
    pub path: String,
    /// Whether the declaration marks the asset as required; `true` unless stated otherwise.
    pub required: bool,
}
79
/// Summary returned by [`scan`].
#[derive(Debug, Serialize)]
pub struct ScanReport {
    /// Name of the manifest file (always [`MANIFEST_FILE`]).
    pub manifest: String,
    /// Number of records in the resulting catalog.
    pub cataloged: usize,
    /// Number of assets found on disk and hashed during this scan.
    pub hashed: usize,
    /// Number of absent assets whose previous manifest entry was carried over.
    pub preserved: usize,
    /// Sum of the `bytes` field over all cataloged records.
    pub bytes: u64,
    /// `true` when the manifest on disk was rewritten by this scan.
    pub wrote: bool,
    /// Echo of the `dry_run` argument; a dry run never writes.
    pub dry_run: bool,
    /// Human-readable notes about declarations that were skipped.
    pub warnings: Vec<String>,
    /// Undeclared files found under `sources/`; empty unless untracked detection was requested.
    pub untracked: Vec<String>,
}
97
/// Per-asset entry of a [`StatusReport`].
#[derive(Debug, Serialize)]
pub struct AssetState {
    /// Path copied from the manifest record.
    pub path: String,
    /// Digest copied from the manifest record (the file is not re-hashed).
    pub sha256: String,
    /// Size copied from the manifest record.
    pub bytes: u64,
    /// Required flag copied from the manifest record.
    pub required: bool,
    /// Either `"present"` or `"missing"`.
    pub state: String,
}
108
/// Presence summary returned by [`status`].
#[derive(Debug, Serialize)]
pub struct StatusReport {
    /// Number of records in the manifest.
    pub total: usize,
    /// Records whose file exists inside the store.
    pub present: usize,
    /// Records whose file is absent (or whose path does not resolve inside the store).
    pub missing: usize,
    /// Missing records that are marked required.
    pub required_missing: usize,
    /// Missing records that are marked optional.
    pub optional_missing: usize,
    /// Sum of the recorded sizes of all records.
    pub bytes_total: u64,
    /// Sum of the recorded sizes of the missing records.
    pub bytes_missing: u64,
    /// One entry per manifest record, in manifest order.
    pub assets: Vec<AssetState>,
}
121
/// Integrity summary returned by [`verify`].
#[derive(Debug, Serialize)]
pub struct VerifyReport {
    /// `"quick"` (size comparison only) or `"deep"` (digest and size comparison).
    pub mode: String,
    /// Number of records that were examined.
    pub checked: usize,
    /// Examined records that were neither missing nor corrupt.
    pub ok: usize,
    /// Paths of examined records whose file is absent.
    pub missing: Vec<String>,
    /// Paths of examined records that failed the check or resolve outside the store.
    pub corrupt: Vec<String>,
    /// `true` when both `missing` and `corrupt` are empty.
    pub complete: bool,
}
132
133pub fn read_manifest(store: &Store) -> crate::Result<Vec<AssetRecord>> {
142 let abs = store.root.join(MANIFEST_FILE);
143 if !abs.exists() {
144 return Ok(Vec::new());
145 }
146 let text = std::fs::read_to_string(&abs)?;
147 let mut by_path: BTreeMap<String, AssetRecord> = BTreeMap::new();
148 for (i, line) in text.lines().enumerate() {
149 if line.trim().is_empty() {
150 continue;
151 }
152 let rec: AssetRecord = serde_json::from_str(line).map_err(|e| {
153 std::io::Error::new(
154 std::io::ErrorKind::InvalidData,
155 format!("{MANIFEST_FILE} line {}: {e}", i + 1),
156 )
157 })?;
158 by_path.insert(rec.path.clone(), rec);
159 }
160 Ok(by_path.into_values().collect())
161}
162
163pub fn write_manifest(store: &Store, records: &[AssetRecord]) -> crate::Result<()> {
166 let abs = store.root.join(MANIFEST_FILE);
167 if records.is_empty() {
168 if abs.exists() {
169 std::fs::remove_file(&abs)?;
170 }
171 return Ok(());
172 }
173 let mut sorted = records.to_vec();
174 sorted.sort_by(|a, b| a.path.cmp(&b.path));
175 let mut out = String::new();
176 for rec in &sorted {
177 let line = serde_json::to_string(rec).expect("AssetRecord serializes");
178 out.push_str(&line);
179 out.push('\n');
180 }
181 write_atomic(&abs, out.as_bytes())?;
182 Ok(())
183}
184
185pub fn scan(store: &Store, dry_run: bool, untracked: bool) -> crate::Result<ScanReport> {
198 let existing_by_path: BTreeMap<String, AssetRecord> = read_manifest(store)
202 .unwrap_or_default()
203 .into_iter()
204 .map(|r| (r.path.clone(), r))
205 .collect();
206
207 let mut wrappers_by_path: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
209 let mut required_by_path: BTreeMap<String, bool> = BTreeMap::new();
210 let mut declared_paths: BTreeSet<String> = BTreeSet::new();
211 let mut warnings: Vec<String> = Vec::new();
212
213 for rel in store.walk()? {
214 let abs = store.abs_path(&rel);
215 let (fm, _body) = match parser::read_file(&abs) {
216 Ok(v) => v,
217 Err(_) => continue, };
219 let wrapper = rel_to_string(&rel);
220 for decl in declared_assets(&fm) {
221 let norm = match normalize_asset_path(&decl.path) {
222 Ok(n) => n,
223 Err(e) => {
224 warnings.push(format!("{wrapper}: {e}"));
225 continue;
226 }
227 };
228 if is_markdown(&norm) {
229 warnings.push(format!(
230 "{wrapper}: asset path points at a markdown content file ({norm}); skipped"
231 ));
232 continue;
233 }
234 wrappers_by_path
235 .entry(norm.clone())
236 .or_default()
237 .insert(wrapper.clone());
238 let req = required_by_path.entry(norm.clone()).or_insert(false);
239 *req = *req || decl.required;
240 declared_paths.insert(norm);
241 }
242 }
243
244 let mut records: Vec<AssetRecord> = Vec::new();
246 let mut hashed = 0usize;
247 let mut preserved = 0usize;
248 for (path, wrappers) in &wrappers_by_path {
249 let required = *required_by_path.get(path).unwrap_or(&true);
250 let wrappers: Vec<String> = wrappers.iter().cloned().collect();
251
252 let abs = match store::ensure_path_within_store(&store.root, &store.root.join(path)) {
254 Ok(p) => p,
255 Err(_) => {
256 warnings.push(format!("{path}: escapes the store root; skipped"));
257 continue;
258 }
259 };
260
261 if abs.is_dir() {
262 warnings.push(format!("{path}: is a directory, not a file; skipped"));
263 continue;
264 }
265 if abs.is_file() {
266 let (sha256, bytes) = sha256_file(&abs)?;
267 records.push(AssetRecord {
268 path: path.clone(),
269 sha256,
270 bytes,
271 media_type: media_type_for(path),
272 wrappers,
273 required,
274 });
275 hashed += 1;
276 } else if let Some(prev) = existing_by_path.get(path) {
277 records.push(AssetRecord {
280 path: path.clone(),
281 sha256: prev.sha256.clone(),
282 bytes: prev.bytes,
283 media_type: media_type_for(path),
284 wrappers,
285 required,
286 });
287 preserved += 1;
288 } else {
289 warnings.push(format!(
290 "{path}: declared but absent and never cataloged; cannot hash (skipped)"
291 ));
292 }
293 }
294 records.sort_by(|a, b| a.path.cmp(&b.path));
295
296 let bytes: u64 = records.iter().map(|r| r.bytes).sum();
297 let cataloged = records.len();
298
299 let untracked_list = if untracked {
300 find_untracked(store, &declared_paths)?
301 } else {
302 Vec::new()
303 };
304
305 let mut wrote = false;
307 if !dry_run {
308 let current = read_manifest(store).unwrap_or_default();
309 if current != records {
310 write_manifest(store, &records)?;
311 wrote = true;
312 }
313 }
314
315 Ok(ScanReport {
316 manifest: MANIFEST_FILE.to_string(),
317 cataloged,
318 hashed,
319 preserved,
320 bytes,
321 wrote,
322 dry_run,
323 warnings,
324 untracked: untracked_list,
325 })
326}
327
328pub fn verify(store: &Store, include_optional: bool, quick: bool) -> crate::Result<VerifyReport> {
338 let records = read_manifest(store)?;
339 let mut missing = Vec::new();
340 let mut corrupt = Vec::new();
341 let mut checked = 0usize;
342
343 for rec in &records {
344 if !rec.required && !include_optional {
345 continue;
346 }
347 checked += 1;
348 let abs = match store::ensure_path_within_store(&store.root, &store.root.join(&rec.path)) {
349 Ok(p) => p,
350 Err(_) => {
351 corrupt.push(rec.path.clone());
353 continue;
354 }
355 };
356 if !abs.is_file() {
357 missing.push(rec.path.clone());
358 continue;
359 }
360 if quick {
361 let len = std::fs::metadata(&abs)?.len();
362 if len != rec.bytes {
363 corrupt.push(rec.path.clone());
364 }
365 } else {
366 let (sha, bytes) = sha256_file(&abs)?;
367 if sha != rec.sha256 || bytes != rec.bytes {
368 corrupt.push(rec.path.clone());
369 }
370 }
371 }
372
373 let ok = checked - missing.len() - corrupt.len();
374 let complete = missing.is_empty() && corrupt.is_empty();
375 Ok(VerifyReport {
376 mode: if quick { "quick" } else { "deep" }.to_string(),
377 checked,
378 ok,
379 missing,
380 corrupt,
381 complete,
382 })
383}
384
385pub fn status(store: &Store) -> crate::Result<StatusReport> {
393 let records = read_manifest(store)?;
394 let mut present = 0usize;
395 let mut missing = 0usize;
396 let mut required_missing = 0usize;
397 let mut optional_missing = 0usize;
398 let mut bytes_total = 0u64;
399 let mut bytes_missing = 0u64;
400 let mut assets = Vec::with_capacity(records.len());
401
402 for rec in &records {
403 bytes_total += rec.bytes;
404 let is_present = store::ensure_path_within_store(&store.root, &store.root.join(&rec.path))
412 .map(|p| p.is_file())
413 .unwrap_or(false);
414 let state = if is_present {
415 present += 1;
416 "present"
417 } else {
418 missing += 1;
419 bytes_missing += rec.bytes;
420 if rec.required {
421 required_missing += 1;
422 } else {
423 optional_missing += 1;
424 }
425 "missing"
426 };
427 assets.push(AssetState {
428 path: rec.path.clone(),
429 sha256: rec.sha256.clone(),
430 bytes: rec.bytes,
431 required: rec.required,
432 state: state.to_string(),
433 });
434 }
435
436 Ok(StatusReport {
437 total: records.len(),
438 present,
439 missing,
440 required_missing,
441 optional_missing,
442 bytes_total,
443 bytes_missing,
444 assets,
445 })
446}
447
448pub fn paths(store: &Store) -> crate::Result<Vec<String>> {
456 Ok(read_manifest(store)?.into_iter().map(|r| r.path).collect())
457}
458
459pub fn declared_assets(fm: &parser::Frontmatter) -> Vec<Declaration> {
469 let mut out = Vec::new();
470 if let Some(v) = fm.get("asset") {
471 collect_declarations(&v, &mut out);
472 }
473 if let Some(v) = fm.get("assets") {
474 collect_declarations(&v, &mut out);
475 }
476 out
477}
478
479pub fn declarations_from_yaml_map(map: &BTreeMap<String, Value>) -> Vec<Declaration> {
483 let mut out = Vec::new();
484 if let Some(v) = map.get("asset") {
485 collect_declarations(v, &mut out);
486 }
487 if let Some(v) = map.get("assets") {
488 collect_declarations(v, &mut out);
489 }
490 out
491}
492
493fn collect_declarations(v: &Value, out: &mut Vec<Declaration>) {
494 match v {
495 Value::String(s) => out.push(Declaration {
496 path: s.clone(),
497 required: true,
498 }),
499 Value::Sequence(items) => {
500 for item in items {
501 match item {
502 Value::String(s) => out.push(Declaration {
503 path: s.clone(),
504 required: true,
505 }),
506 Value::Mapping(m) => {
507 let path = m
508 .get(Value::String("path".to_string()))
509 .and_then(|x| x.as_str())
510 .map(|s| s.to_string());
511 if let Some(path) = path {
512 let required = m
513 .get(Value::String("required".to_string()))
514 .and_then(|x| x.as_bool())
515 .unwrap_or(true);
516 out.push(Declaration { path, required });
517 }
518 }
519 _ => {}
520 }
521 }
522 }
523 _ => {}
524 }
525}
526
/// Normalizes a declared asset path into a relative, forward-slash path.
///
/// Surrounding whitespace is trimmed, `.` components are dropped, and both `/` and `\` are
/// treated as separators on every platform.
///
/// # Errors
///
/// Returns a human-readable message when the path is empty, absolute, contains `..`,
/// carries a root or drive prefix, or names no file at all (for example `.`).
pub fn normalize_asset_path(raw: &str) -> Result<String, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("empty asset path".to_string());
    }

    // Unify separators BEFORE splitting into components. On Unix `\` is an ordinary file-name
    // character, so `..\x` would otherwise parse as a single normal component and only turn
    // into `../x` in the output, slipping past the traversal and absolute-path checks below.
    let unified = trimmed.replace('\\', "/");
    let p = Path::new(&unified);
    if p.is_absolute() {
        return Err(format!("absolute asset path not allowed: {raw}"));
    }

    let mut normal: Vec<&std::ffi::OsStr> = Vec::new();
    for c in p.components() {
        match c {
            Component::ParentDir => return Err(format!("`..` not allowed in asset path: {raw}")),
            Component::Prefix(_) | Component::RootDir => {
                return Err(format!("asset path escapes the store: {raw}"))
            }
            Component::CurDir => {}
            Component::Normal(seg) => normal.push(seg),
        }
    }
    if normal.is_empty() {
        return Err(format!("asset path names no file: {raw}"));
    }

    let joined: PathBuf = normal.into_iter().collect();
    // `PathBuf` joins with the platform separator; emit `/` regardless of platform.
    Ok(joined.to_string_lossy().replace('\\', "/"))
}
571
/// Returns `true` when the path's extension is `md`, compared case-insensitively.
fn is_markdown(path: &str) -> bool {
    let ext = Path::new(path).extension().and_then(|e| e.to_str());
    matches!(ext, Some(ext) if ext.eq_ignore_ascii_case("md"))
}
579
/// Renders a path as text (lossily for non-UTF-8 names) with every `\` turned into `/`.
fn rel_to_string(p: &Path) -> String {
    let text = p.to_string_lossy();
    text.split('\\').collect::<Vec<_>>().join("/")
}
583
584fn sha256_file(abs: &Path) -> std::io::Result<(String, u64)> {
587 let mut f = std::fs::File::open(abs)?;
588 let mut hasher = Sha256::new();
589 let mut buf = [0u8; 65536];
590 let mut total: u64 = 0;
591 loop {
592 let n = f.read(&mut buf)?;
593 if n == 0 {
594 break;
595 }
596 hasher.update(&buf[..n]);
597 total += n as u64;
598 }
599 let digest = hasher.finalize();
600 let mut hex = String::with_capacity(64);
601 for b in digest.iter() {
602 let _ = write!(hex, "{b:02x}");
603 }
604 Ok((hex, total))
605}
606
/// Maps a path's extension (case-insensitively) to a media type.
///
/// Paths without an extension, and extensions not listed below, yield
/// `application/octet-stream`.
fn media_type_for(path: &str) -> String {
    // Every extension here is lowercase ASCII, so a case-insensitive comparison against the
    // raw extension is the same as lowercasing the extension first.
    const BY_EXTENSION: &[(&str, &str)] = &[
        ("pdf", "application/pdf"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
        ("tiff", "image/tiff"),
        ("tif", "image/tiff"),
        ("mp4", "video/mp4"),
        ("mov", "video/quicktime"),
        ("webm", "video/webm"),
        ("mkv", "video/x-matroska"),
        ("mp3", "audio/mpeg"),
        ("wav", "audio/wav"),
        ("m4a", "audio/mp4"),
        ("flac", "audio/flac"),
        ("zip", "application/zip"),
        ("gz", "application/gzip"),
        ("tgz", "application/gzip"),
        ("tar", "application/x-tar"),
        ("csv", "text/csv"),
        ("tsv", "text/tab-separated-values"),
        ("json", "application/json"),
        ("xml", "application/xml"),
        ("txt", "text/plain"),
        ("vtt", "text/vtt"),
        ("srt", "application/x-subrip"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("epub", "application/epub+zip"),
        (
            "docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        ("doc", "application/msword"),
        ("xls", "application/vnd.ms-excel"),
        ("ppt", "application/vnd.ms-powerpoint"),
    ];

    let ext = Path::new(path)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("");

    BY_EXTENSION
        .iter()
        .find(|&&(known, _)| ext.eq_ignore_ascii_case(known))
        .map(|&(_, media_type)| media_type)
        .unwrap_or("application/octet-stream")
        .to_string()
}
654
655fn find_untracked(store: &Store, declared: &BTreeSet<String>) -> crate::Result<Vec<String>> {
659 let sources = store.root.join("sources");
660 if !sources.is_dir() {
661 return Ok(Vec::new());
662 }
663 let mut out = Vec::new();
664 for entry in walkdir::WalkDir::new(&sources)
665 .into_iter()
666 .filter_entry(|e| !is_hidden(e.file_name().to_str().unwrap_or("")))
667 {
668 let entry = match entry {
669 Ok(e) => e,
670 Err(_) => continue,
671 };
672 if !entry.file_type().is_file() {
673 continue;
674 }
675 let name = entry.file_name().to_str().unwrap_or("");
676 if is_markdown(name) || name == "index.jsonl" {
677 continue;
678 }
679 let rel = match entry.path().strip_prefix(&store.root) {
680 Ok(r) => rel_to_string(r),
681 Err(_) => continue,
682 };
683 if !declared.contains(&rel) {
684 out.push(rel);
685 }
686 }
687 out.sort();
688 Ok(out)
689}
690
/// Returns `true` for dot-prefixed names, except the special entries `.` and `..`.
fn is_hidden(name: &str) -> bool {
    match name {
        "." | ".." => false,
        other => other.starts_with('.'),
    }
}
694
#[cfg(test)]
mod tests {
    use super::*;

    /// Equivalent spellings collapse to one canonical path; anything that could leave the
    /// store, or that names no file, is rejected.
    #[test]
    fn normalize_asset_path_folds_curdir_and_rejects_traversal() {
        let accepted = [
            "./sources/x.pdf",
            "sources/x.pdf",
            "sources/./x.pdf",
            "sources/x.pdf/",
        ];
        for raw in accepted {
            assert_eq!(
                normalize_asset_path(raw).unwrap(),
                "sources/x.pdf",
                "input: {raw:?}"
            );
        }

        let rejected = [
            "../outside.txt",
            "sources/../../etc/passwd",
            "/abs/x.pdf",
            ".",
            "./",
            "",
        ];
        for raw in rejected {
            assert!(normalize_asset_path(raw).is_err(), "input: {raw:?}");
        }
    }
}