use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write as _;
use std::io::Read as _;
use std::path::{Component, Path, PathBuf};
use serde::{Deserialize, Serialize};
use serde_norway::Value;
use sha2::{Digest, Sha256};
use crate::parser;
use crate::store::{self, Store};
use crate::write_atomic;
/// File name of the asset manifest. It is joined onto the store root and holds
/// one JSON-encoded [`AssetRecord`] per line.
pub const MANIFEST_FILE: &str = "assets.jsonl";
/// One manifest entry describing a single cataloged asset file.
///
/// Records are serialized one per line into [`MANIFEST_FILE`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AssetRecord {
/// Asset path as written by `scan`: the normalized, `/`-separated path that is
/// joined onto the store root to locate the file.
pub path: String,
/// Lowercase hexadecimal SHA-256 digest of the file contents.
pub sha256: String,
/// Size of the file in bytes at the time it was hashed.
pub bytes: u64,
/// Media type guessed from the file extension.
pub media_type: String,
/// Paths of the content files that declare this asset.
pub wrappers: Vec<String>,
/// `true` when at least one declaration marks the asset as required.
pub required: bool,
}
/// A single asset reference extracted from a content file's frontmatter.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Declaration {
/// Asset path exactly as written in the frontmatter (not yet normalized).
pub path: String,
/// Whether the asset is required; defaults to `true` when the declaration
/// does not say otherwise.
pub required: bool,
}
/// Summary returned by [`scan`].
#[derive(Debug, Serialize)]
pub struct ScanReport {
/// Name of the manifest file (always [`MANIFEST_FILE`]).
pub manifest: String,
/// Number of records in the resulting manifest.
pub cataloged: usize,
/// Number of assets found on disk and hashed during this scan.
pub hashed: usize,
/// Number of assets absent from disk whose previous record was carried over.
pub preserved: usize,
/// Sum of the `bytes` field over all resulting records.
pub bytes: u64,
/// `true` when the manifest on disk was rewritten.
pub wrote: bool,
/// Echo of the `dry_run` argument passed to `scan`.
pub dry_run: bool,
/// Human-readable notes about declarations that were skipped.
pub warnings: Vec<String>,
/// Files under `sources/` that no declaration references; empty unless
/// untracked reporting was requested.
pub untracked: Vec<String>,
}
/// Per-asset entry of a [`StatusReport`].
#[derive(Debug, Serialize)]
pub struct AssetState {
/// Asset path copied from the manifest record.
pub path: String,
/// SHA-256 digest copied from the manifest record (not recomputed).
pub sha256: String,
/// Size in bytes copied from the manifest record.
pub bytes: u64,
/// Whether the manifest marks the asset as required.
pub required: bool,
/// Either `"present"` or `"missing"`, as set by `status`.
pub state: String,
}
/// Aggregate presence information returned by [`status`].
#[derive(Debug, Serialize)]
pub struct StatusReport {
/// Number of records in the manifest.
pub total: usize,
/// Number of records whose file exists inside the store.
pub present: usize,
/// Number of records whose file is absent (or resolves outside the store).
pub missing: usize,
/// Subset of `missing` that is marked required.
pub required_missing: usize,
/// Subset of `missing` that is not marked required.
pub optional_missing: usize,
/// Sum of recorded sizes over all records.
pub bytes_total: u64,
/// Sum of recorded sizes over the missing records.
pub bytes_missing: u64,
/// One entry per manifest record, in manifest order.
pub assets: Vec<AssetState>,
}
/// Result of an integrity check returned by [`verify`].
#[derive(Debug, Serialize)]
pub struct VerifyReport {
/// `"quick"` (size comparison only) or `"deep"` (hash and size comparison).
pub mode: String,
/// Number of records that were examined.
pub checked: usize,
/// Number of examined records that were neither missing nor corrupt.
pub ok: usize,
/// Paths of examined records whose file is not present.
pub missing: Vec<String>,
/// Paths of examined records that failed the comparison or resolve outside
/// the store.
pub corrupt: Vec<String>,
/// `true` when both `missing` and `corrupt` are empty.
pub complete: bool,
}
/// Loads the asset manifest from the store root.
///
/// Blank lines are ignored. When the same path appears on several lines the
/// last occurrence wins. The returned records are ordered by path.
///
/// Returns an empty vector when the manifest file does not exist.
///
/// # Errors
///
/// Fails when the file cannot be read, or with an `InvalidData` I/O error
/// naming the 1-based line number when a line is not a valid record.
pub fn read_manifest(store: &Store) -> crate::Result<Vec<AssetRecord>> {
    let manifest_path = store.root.join(MANIFEST_FILE);
    if !manifest_path.exists() {
        return Ok(Vec::new());
    }

    let contents = std::fs::read_to_string(&manifest_path)?;
    let mut latest: BTreeMap<String, AssetRecord> = BTreeMap::new();

    let meaningful = contents
        .lines()
        .enumerate()
        .filter(|(_, raw)| !raw.trim().is_empty());
    for (idx, raw) in meaningful {
        let record: AssetRecord = serde_json::from_str(raw).map_err(|e| {
            let detail = format!("{MANIFEST_FILE} line {}: {e}", idx + 1);
            std::io::Error::new(std::io::ErrorKind::InvalidData, detail)
        })?;
        // Later lines replace earlier ones for the same path.
        latest.insert(record.path.clone(), record);
    }

    Ok(latest.into_values().collect())
}
/// Persists `records` as the store's asset manifest.
///
/// Records are written sorted by path, one JSON object per line, each line
/// terminated by `\n`. An empty `records` slice removes an existing manifest
/// file instead of writing an empty one.
///
/// # Errors
///
/// Propagates failures from removing the old file or from the atomic write.
pub fn write_manifest(store: &Store, records: &[AssetRecord]) -> crate::Result<()> {
    let target = store.root.join(MANIFEST_FILE);

    if !records.is_empty() {
        // Order references rather than the caller's slice; the sort is stable,
        // so records with equal paths keep their relative order.
        let mut ordered: Vec<&AssetRecord> = records.iter().collect();
        ordered.sort_by(|lhs, rhs| lhs.path.cmp(&rhs.path));

        let mut payload = String::new();
        for rec in ordered {
            payload.push_str(&serde_json::to_string(rec).expect("AssetRecord serializes"));
            payload.push('\n');
        }
        write_atomic(&target, payload.as_bytes())?;
        return Ok(());
    }

    // Nothing to catalog: make sure no stale manifest is left behind.
    if target.exists() {
        std::fs::remove_file(&target)?;
    }
    Ok(())
}
/// Rebuilds the asset manifest from the declarations found in the store's
/// content files.
///
/// Every file yielded by `store.walk()` is parsed; files that fail to parse
/// are skipped. Each declared asset path is normalized, and markdown paths are
/// rejected with a warning. For every remaining path:
///
/// * if the file exists it is hashed and a fresh record is produced;
/// * if it is absent but the previous manifest has a record, that record's
///   hash and size are preserved (media type, wrappers and `required` are
///   refreshed);
/// * otherwise a warning is emitted and the path is left out.
///
/// An asset is `required` when at least one of its declarations says so.
///
/// # Parameters
///
/// * `dry_run` - when `true`, the manifest on disk is never modified.
/// * `untracked` - when `true`, the report also lists files under `sources/`
///   that no declaration references.
///
/// # Errors
///
/// Propagates failures from walking the store, hashing a file, searching for
/// untracked files, and writing the manifest. A previous manifest that cannot
/// be read is NOT an error: the scan proceeds as if it were empty and records
/// a warning.
pub fn scan(store: &Store, dry_run: bool, untracked: bool) -> crate::Result<ScanReport> {
    let mut warnings: Vec<String> = Vec::new();

    // Best-effort read: a broken manifest must not block a rescan, but the
    // caller needs to know that hashes of absent assets cannot be carried over.
    let previous: Vec<AssetRecord> = match read_manifest(store) {
        Ok(found) => found,
        Err(_) => {
            warnings.push(format!(
                "{MANIFEST_FILE}: existing manifest could not be read; previously cataloged hashes cannot be preserved"
            ));
            Vec::new()
        }
    };
    let existing_by_path: BTreeMap<&str, &AssetRecord> =
        previous.iter().map(|r| (r.path.as_str(), r)).collect();

    let mut wrappers_by_path: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
    let mut required_by_path: BTreeMap<String, bool> = BTreeMap::new();
    let mut declared_paths: BTreeSet<String> = BTreeSet::new();

    // Pass 1: gather declarations from every parseable content file.
    for rel in store.walk()? {
        let abs = store.abs_path(&rel);
        let (fm, _body) = match parser::read_file(&abs) {
            Ok(v) => v,
            Err(_) => continue,
        };
        let wrapper = rel_to_string(&rel);
        for decl in declared_assets(&fm) {
            let norm = match normalize_asset_path(&decl.path) {
                Ok(n) => n,
                Err(e) => {
                    warnings.push(format!("{wrapper}: {e}"));
                    continue;
                }
            };
            if is_markdown(&norm) {
                warnings.push(format!(
                    "{wrapper}: asset path points at a markdown content file ({norm}); skipped"
                ));
                continue;
            }
            wrappers_by_path
                .entry(norm.clone())
                .or_default()
                .insert(wrapper.clone());
            // Required wins: one required declaration makes the asset required.
            let req = required_by_path.entry(norm.clone()).or_insert(false);
            *req = *req || decl.required;
            declared_paths.insert(norm);
        }
    }

    // Pass 2: turn each declared path into a manifest record.
    let mut records: Vec<AssetRecord> = Vec::new();
    let mut hashed = 0usize;
    let mut preserved = 0usize;
    for (path, wrappers) in &wrappers_by_path {
        let required = *required_by_path.get(path).unwrap_or(&true);
        let wrappers: Vec<String> = wrappers.iter().cloned().collect();
        let abs = match store::ensure_path_within_store(&store.root, &store.root.join(path)) {
            Ok(p) => p,
            Err(_) => {
                warnings.push(format!("{path}: escapes the store root; skipped"));
                continue;
            }
        };
        if abs.is_dir() {
            warnings.push(format!("{path}: is a directory, not a file; skipped"));
            continue;
        }
        if abs.is_file() {
            let (sha256, bytes) = sha256_file(&abs)?;
            records.push(AssetRecord {
                path: path.clone(),
                sha256,
                bytes,
                media_type: media_type_for(path),
                wrappers,
                required,
            });
            hashed += 1;
        } else if let Some(prev) = existing_by_path.get(path.as_str()) {
            // File is not available locally: keep the last known hash and size.
            records.push(AssetRecord {
                path: path.clone(),
                sha256: prev.sha256.clone(),
                bytes: prev.bytes,
                media_type: media_type_for(path),
                wrappers,
                required,
            });
            preserved += 1;
        } else {
            warnings.push(format!(
                "{path}: declared but absent and never cataloged; cannot hash (skipped)"
            ));
        }
    }
    records.sort_by(|a, b| a.path.cmp(&b.path));

    let bytes: u64 = records.iter().map(|r| r.bytes).sum();
    let cataloged = records.len();
    let untracked_list = if untracked {
        find_untracked(store, &declared_paths)?
    } else {
        Vec::new()
    };

    // Only touch the file when the content would actually change. `previous`
    // is already path-ordered, exactly like `records`.
    let mut wrote = false;
    if !dry_run && previous != records {
        write_manifest(store, &records)?;
        wrote = true;
    }

    Ok(ScanReport {
        manifest: MANIFEST_FILE.to_string(),
        cataloged,
        hashed,
        preserved,
        bytes,
        wrote,
        dry_run,
        warnings,
        untracked: untracked_list,
    })
}
/// Checks the files on disk against the manifest.
///
/// # Parameters
///
/// * `include_optional` - when `false`, records that are not `required` are
///   skipped entirely and do not count as checked.
/// * `quick` - when `true`, only the file size is compared with the record;
///   otherwise the file is hashed and both digest and size must match.
///
/// A record whose path resolves outside the store is reported as corrupt; a
/// record whose file does not exist is reported as missing.
///
/// # Errors
///
/// Propagates failures from reading the manifest, reading file metadata, or
/// hashing a file.
pub fn verify(store: &Store, include_optional: bool, quick: bool) -> crate::Result<VerifyReport> {
    let records = read_manifest(store)?;
    let mut missing = Vec::new();
    let mut corrupt = Vec::new();
    let mut checked = 0usize;

    for rec in records.iter().filter(|r| r.required || include_optional) {
        checked += 1;

        let resolved = store::ensure_path_within_store(&store.root, &store.root.join(&rec.path));
        let target = match resolved {
            Err(_) => {
                corrupt.push(rec.path.clone());
                continue;
            }
            Ok(p) => p,
        };
        if !target.is_file() {
            missing.push(rec.path.clone());
            continue;
        }

        let intact = if quick {
            std::fs::metadata(&target)?.len() == rec.bytes
        } else {
            let (digest, size) = sha256_file(&target)?;
            digest == rec.sha256 && size == rec.bytes
        };
        if !intact {
            corrupt.push(rec.path.clone());
        }
    }

    // Every checked record lands in at most one of the two failure lists.
    let ok = checked - missing.len() - corrupt.len();
    let complete = missing.is_empty() && corrupt.is_empty();
    let mode = if quick { "quick" } else { "deep" };

    Ok(VerifyReport {
        mode: mode.to_string(),
        checked,
        ok,
        missing,
        corrupt,
        complete,
    })
}
/// Reports which cataloged assets are present on disk, without hashing them.
///
/// An asset counts as present when its path resolves inside the store and
/// points at a regular file; anything else is counted as missing and its
/// recorded size is added to `bytes_missing`.
///
/// # Errors
///
/// Propagates failures from reading the manifest.
pub fn status(store: &Store) -> crate::Result<StatusReport> {
    let records = read_manifest(store)?;

    // Accumulate straight into the report that will be returned.
    let mut report = StatusReport {
        total: records.len(),
        present: 0,
        missing: 0,
        required_missing: 0,
        optional_missing: 0,
        bytes_total: 0,
        bytes_missing: 0,
        assets: Vec::with_capacity(records.len()),
    };

    for rec in &records {
        report.bytes_total += rec.bytes;

        let on_disk =
            match store::ensure_path_within_store(&store.root, &store.root.join(&rec.path)) {
                Ok(p) => p.is_file(),
                Err(_) => false,
            };

        let label = if on_disk {
            report.present += 1;
            "present"
        } else {
            report.missing += 1;
            report.bytes_missing += rec.bytes;
            if rec.required {
                report.required_missing += 1;
            } else {
                report.optional_missing += 1;
            }
            "missing"
        };

        report.assets.push(AssetState {
            path: rec.path.clone(),
            sha256: rec.sha256.clone(),
            bytes: rec.bytes,
            required: rec.required,
            state: label.to_string(),
        });
    }

    Ok(report)
}
/// Returns the path of every record in the manifest, in manifest order.
///
/// # Errors
///
/// Propagates failures from reading the manifest.
pub fn paths(store: &Store) -> crate::Result<Vec<String>> {
    let records = read_manifest(store)?;
    let mut listed = Vec::with_capacity(records.len());
    for record in records {
        listed.push(record.path);
    }
    Ok(listed)
}
/// Extracts asset declarations from parsed frontmatter.
///
/// Reads the `asset` key first and the `assets` key second; declarations are
/// returned in that order, without normalization or de-duplication.
pub fn declared_assets(fm: &parser::Frontmatter) -> Vec<Declaration> {
    let mut found = Vec::new();
    for key in ["asset", "assets"] {
        if let Some(v) = fm.get(key) {
            collect_declarations(&v, &mut found);
        }
    }
    found
}
/// Extracts asset declarations from a plain YAML key/value map.
///
/// Reads the `asset` key first and the `assets` key second; declarations are
/// returned in that order, without normalization or de-duplication.
pub fn declarations_from_yaml_map(map: &BTreeMap<String, Value>) -> Vec<Declaration> {
    let mut found = Vec::new();
    for value in ["asset", "assets"].iter().filter_map(|key| map.get(*key)) {
        collect_declarations(value, &mut found);
    }
    found
}
/// Appends the declarations encoded by one frontmatter value to `out`.
///
/// Accepted shapes:
/// * a string: one required asset;
/// * a sequence whose items are strings (required assets) or mappings with a
///   string `path` and an optional boolean `required` (default `true`).
///
/// Any other value, any other sequence item, and any mapping without a string
/// `path` is ignored.
fn collect_declarations(v: &Value, out: &mut Vec<Declaration>) {
    // Scalar form: `asset: some/file.pdf`.
    if let Value::String(single) = v {
        out.push(Declaration {
            path: single.clone(),
            required: true,
        });
        return;
    }

    let entries = match v {
        Value::Sequence(items) => items,
        _ => return,
    };

    for entry in entries {
        let parsed = match entry {
            Value::String(s) => Some(Declaration {
                path: s.clone(),
                required: true,
            }),
            Value::Mapping(fields) => fields
                .get(Value::String("path".to_string()))
                .and_then(|p| p.as_str())
                .map(|p| Declaration {
                    path: p.to_string(),
                    required: fields
                        .get(Value::String("required".to_string()))
                        .and_then(|r| r.as_bool())
                        .unwrap_or(true),
                }),
            _ => None,
        };
        // `Option` iterates over zero or one item.
        out.extend(parsed);
    }
}
/// Normalizes a declared asset path into a store-relative, `/`-separated form.
///
/// Leading and trailing whitespace is trimmed, `.` segments and trailing
/// separators are dropped, and both `/` and `\` are accepted as separators on
/// every platform.
///
/// # Errors
///
/// Returns a human-readable message when the path is empty, absolute, has a
/// drive or root prefix, contains a `..` segment, or names no file (for
/// example `.` or `./`).
pub fn normalize_asset_path(raw: &str) -> Result<String, String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err("empty asset path".to_string());
    }

    // Unify separators BEFORE splitting into components. On Unix a backslash
    // is an ordinary filename character, so `..\x` would otherwise be a single
    // normal component and only become `../x` when the result is rendered,
    // bypassing the traversal and absolute-path checks below.
    let unified = trimmed.replace('\\', "/");
    let p = Path::new(&unified);
    if p.is_absolute() {
        return Err(format!("absolute asset path not allowed: {raw}"));
    }

    let mut normal: Vec<&std::ffi::OsStr> = Vec::new();
    for c in p.components() {
        match c {
            Component::ParentDir => return Err(format!("`..` not allowed in asset path: {raw}")),
            Component::Prefix(_) | Component::RootDir => {
                return Err(format!("asset path escapes the store: {raw}"))
            }
            Component::CurDir => {}
            Component::Normal(seg) => normal.push(seg),
        }
    }
    if normal.is_empty() {
        return Err(format!("asset path names no file: {raw}"));
    }

    let joined: PathBuf = normal.into_iter().collect();
    // `PathBuf` joins with the platform separator; render it with `/`.
    Ok(joined.to_string_lossy().replace('\\', "/"))
}
/// Returns `true` when `path` has an `md` extension, compared ASCII
/// case-insensitively.
fn is_markdown(path: &str) -> bool {
    match Path::new(path).extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("md"),
        None => false,
    }
}
/// Renders a path as text with every backslash replaced by `/`.
///
/// Non-UTF-8 sequences are converted lossily.
fn rel_to_string(p: &Path) -> String {
    let text = p.to_string_lossy();
    text.chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
/// Streams the file at `abs` through SHA-256.
///
/// Returns the lowercase hexadecimal digest together with the number of bytes
/// read.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read.
/// Reads that fail with `ErrorKind::Interrupted` are retried rather than
/// reported.
fn sha256_file(abs: &Path) -> std::io::Result<(String, u64)> {
    let mut f = std::fs::File::open(abs)?;
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 65536];
    let mut total: u64 = 0;
    loop {
        let n = match f.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => n,
            // An interrupted read is documented as non-fatal; try again
            // instead of failing the whole hash.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        hasher.update(&buf[..n]);
        total += n as u64;
    }
    let digest = hasher.finalize();
    let mut hex = String::with_capacity(64);
    for b in digest.iter() {
        // Writing into a `String` cannot fail.
        let _ = write!(hex, "{b:02x}");
    }
    Ok((hex, total))
}
/// Guesses a media type from the extension of `path`.
///
/// The extension is matched ASCII case-insensitively. Paths without an
/// extension, or with one that is not in the table, yield
/// `application/octet-stream`.
fn media_type_for(path: &str) -> String {
    // Lowercase extension -> media type.
    const KNOWN: &[(&str, &str)] = &[
        ("pdf", "application/pdf"),
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
        ("tiff", "image/tiff"),
        ("tif", "image/tiff"),
        ("mp4", "video/mp4"),
        ("mov", "video/quicktime"),
        ("webm", "video/webm"),
        ("mkv", "video/x-matroska"),
        ("mp3", "audio/mpeg"),
        ("wav", "audio/wav"),
        ("m4a", "audio/mp4"),
        ("flac", "audio/flac"),
        ("zip", "application/zip"),
        ("gz", "application/gzip"),
        ("tgz", "application/gzip"),
        ("tar", "application/x-tar"),
        ("csv", "text/csv"),
        ("tsv", "text/tab-separated-values"),
        ("json", "application/json"),
        ("xml", "application/xml"),
        ("txt", "text/plain"),
        ("vtt", "text/vtt"),
        ("srt", "application/x-subrip"),
        ("html", "text/html"),
        ("htm", "text/html"),
        ("epub", "application/epub+zip"),
        (
            "docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        ("doc", "application/msword"),
        ("xls", "application/vnd.ms-excel"),
        ("ppt", "application/vnd.ms-powerpoint"),
    ];

    let suffix = match Path::new(path).extension().and_then(|e| e.to_str()) {
        Some(e) => e.to_ascii_lowercase(),
        None => String::new(),
    };

    let media_type = KNOWN
        .iter()
        .find(|&&(ext, _)| ext == suffix.as_str())
        .map(|&(_, mt)| mt)
        .unwrap_or("application/octet-stream");
    media_type.to_string()
}
fn find_untracked(store: &Store, declared: &BTreeSet<String>) -> crate::Result<Vec<String>> {
let sources = store.root.join("sources");
if !sources.is_dir() {
return Ok(Vec::new());
}
let mut out = Vec::new();
for entry in walkdir::WalkDir::new(&sources)
.into_iter()
.filter_entry(|e| !is_hidden(e.file_name().to_str().unwrap_or("")))
{
let entry = match entry {
Ok(e) => e,
Err(_) => continue,
};
if !entry.file_type().is_file() {
continue;
}
let name = entry.file_name().to_str().unwrap_or("");
if is_markdown(name) || name == "index.jsonl" {
continue;
}
let rel = match entry.path().strip_prefix(&store.root) {
Ok(r) => rel_to_string(r),
Err(_) => continue,
};
if !declared.contains(&rel) {
out.push(rel);
}
}
out.sort();
Ok(out)
}
/// Returns `true` for names that start with a dot, except the special
/// directory entries `.` and `..`.
fn is_hidden(name: &str) -> bool {
    match name {
        "." | ".." => false,
        other => other.starts_with('.'),
    }
}
// Unit tests for the path-normalization contract.
#[cfg(test)]
mod tests {
use super::*;
/// `normalize_asset_path` drops `.` segments and trailing separators, and
/// rejects traversal, absolute paths, and paths that name no file.
#[test]
fn normalize_asset_path_folds_curdir_and_rejects_traversal() {
// A leading `./` is removed.
assert_eq!(
normalize_asset_path("./sources/x.pdf").unwrap(),
"sources/x.pdf"
);
// An already-normal path is returned unchanged.
assert_eq!(
normalize_asset_path("sources/x.pdf").unwrap(),
"sources/x.pdf"
);
// An interior `.` segment is removed.
assert_eq!(
normalize_asset_path("sources/./x.pdf").unwrap(),
"sources/x.pdf"
);
// A trailing separator is removed.
assert_eq!(
normalize_asset_path("sources/x.pdf/").unwrap(),
"sources/x.pdf"
);
// Any `..` segment is an error, wherever it appears.
assert!(normalize_asset_path("../outside.txt").is_err());
assert!(normalize_asset_path("sources/../../etc/passwd").is_err());
// Absolute paths are rejected.
assert!(normalize_asset_path("/abs/x.pdf").is_err());
// Inputs that name no file are rejected.
assert!(normalize_asset_path(".").is_err());
assert!(normalize_asset_path("./").is_err());
assert!(normalize_asset_path("").is_err());
}
}