1use crate::replay::manifest::ReplayManifest;
8use anyhow::{Context, Result};
9use flate2::read::GzDecoder;
10use flate2::Compression;
11use flate2::GzBuilder;
12use serde_json;
13use sha2::{Digest, Sha256};
14use std::collections::BTreeMap;
15use std::io::{Read, Write};
16use std::path::Path;
17use tar::{Archive, Builder, Header};
18
/// Well-known entry names and path prefixes used inside a bundle archive.
pub mod paths {
    /// Name of the manifest entry; the writer emits it as the first tar entry.
    pub const MANIFEST: &str = "manifest.json";
    /// Accepted prefix for entries stored under `files/` (e.g. `files/trace.jsonl`).
    pub const FILES_PREFIX: &str = "files/";
    /// Accepted prefix for entries stored under `outputs/` (e.g. `outputs/run.json`).
    pub const OUTPUTS_PREFIX: &str = "outputs/";
    /// Accepted prefix for entries stored under `cassettes/`.
    pub const CASSETTES_PREFIX: &str = "cassettes/";
}
30
/// A single payload entry to be stored in a bundle archive.
#[derive(Debug, Clone)]
pub struct BundleEntry {
    /// Archive-relative path. It is validated and normalized before use
    /// (backslashes become `/`, leading `/` is stripped) and must start with
    /// one of the prefixes declared in [`paths`].
    pub path: String,
    /// Raw bytes stored for this entry.
    pub data: Vec<u8>,
}
39
40pub fn write_bundle_tar_gz<W: Write>(
43 w: W,
44 manifest: &ReplayManifest,
45 entries: &[BundleEntry],
46) -> Result<()> {
47 let manifest_json = serde_json::to_vec(manifest).context("serialize manifest")?;
48
49 let gz = GzBuilder::new().mtime(0).write(w, Compression::default());
50 let mut tar = Builder::new(gz);
51 tar.mode(tar::HeaderMode::Deterministic);
52
53 write_tar_entry(&mut tar, paths::MANIFEST, &manifest_json)?;
54
55 let mut sorted: Vec<_> = entries.iter().collect();
56 sorted.sort_by(|a, b| a.path.as_str().cmp(b.path.as_str()));
57
58 for e in &sorted {
59 normalize_path_and_append(&mut tar, &e.path, &e.data)?;
60 }
61
62 let gz = tar.into_inner().context("finalize tar")?;
63 gz.finish().context("finish gzip")?;
64 Ok(())
65}
66
67pub fn bundle_digest(manifest: &ReplayManifest, entries: &[BundleEntry]) -> Result<String> {
70 let mut buf = Vec::new();
71 write_bundle_tar_gz(&mut buf, manifest, entries)?;
72 let hash = Sha256::digest(&buf);
73 Ok(hex::encode(hash))
74}
75
76fn write_tar_entry<T: Write>(tar: &mut Builder<T>, path: &str, data: &[u8]) -> Result<()> {
77 let mut header = Header::new_gnu();
78 header.set_path(path).context("set_path")?;
79 header.set_size(data.len() as u64);
80 header.set_mode(0o644);
81 header.set_uid(0);
82 header.set_gid(0);
83 header.set_mtime(0);
84 header.set_cksum();
85 tar.append(&header, data).context("append entry")?;
86 Ok(())
87}
88
89fn validate_entry_path(path: &str) -> Result<String> {
98 let normalized = path.replace('\\', "/").trim_start_matches('/').to_string();
99 if normalized.is_empty() {
100 anyhow::bail!("invalid bundle path: empty path");
101 }
102 let segments: Vec<&str> = normalized.split('/').collect();
103 if segments[0].contains(':') {
104 anyhow::bail!(
105 "invalid bundle path: drive-letter or ':' in first segment (path: {})",
106 path
107 );
108 }
109 for seg in &segments {
110 if seg.is_empty() {
111 anyhow::bail!("invalid bundle path: empty segment (path: {})", path);
112 }
113 if *seg == "." || *seg == ".." {
114 anyhow::bail!(
115 "invalid bundle path: traversal segment '.' or '..' (path: {})",
116 path
117 );
118 }
119 }
120 let has_canonical_prefix = normalized.starts_with(paths::FILES_PREFIX)
121 || normalized.starts_with(paths::OUTPUTS_PREFIX)
122 || normalized.starts_with(paths::CASSETTES_PREFIX);
123 if !has_canonical_prefix {
124 anyhow::bail!(
125 "invalid bundle path prefix: must be files/, outputs/, or cassettes/ (path: {})",
126 path
127 );
128 }
129 Ok(normalized)
130}
131
132fn normalize_path_and_append<T: Write>(
134 tar: &mut Builder<T>,
135 path: &str,
136 data: &[u8],
137) -> Result<()> {
138 let normalized = validate_entry_path(path)?;
139 write_tar_entry(tar, &normalized, data)
140}
141
142pub fn build_file_manifest(
145 entries: &[BundleEntry],
146) -> Result<BTreeMap<String, crate::replay::manifest::FileManifestEntry>> {
147 let mut out = BTreeMap::new();
148 for e in entries {
149 let path = validate_entry_path(&e.path)?;
150 let hash = Sha256::digest(&e.data);
151 out.insert(
152 path.clone(),
153 crate::replay::manifest::FileManifestEntry {
154 sha256: format!("sha256:{}", hex::encode(hash)),
155 size: e.data.len() as u64,
156 mode: Some(0o644),
157 content_type: content_type_hint(Path::new(&path)),
158 },
159 );
160 }
161 Ok(out)
162}
163
/// Result of reading a bundle archive with [`read_bundle_tar_gz`].
#[derive(Debug)]
pub struct ReadBundle {
    /// Manifest parsed from the archive's `manifest.json` entry.
    pub manifest: ReplayManifest,
    /// `(path, contents)` pairs for every non-manifest entry, in ascending
    /// path order.
    pub entries: Vec<(String, Vec<u8>)>,
}
171
172pub fn read_bundle_tar_gz<R: Read>(r: R) -> Result<ReadBundle> {
177 let dec = GzDecoder::new(r);
178 let mut ar = Archive::new(dec);
179 let mut manifest_data: Option<Vec<u8>> = None;
180 let mut seen = BTreeMap::new();
181 for entry in ar.entries().context("list tar entries")? {
182 let mut e = entry.context("read tar entry")?;
183 let path = e.path().context("entry path")?;
184 let path_str = path.to_string_lossy().replace('\\', "/");
185 if path_str == paths::MANIFEST {
186 let mut data = Vec::new();
187 e.read_to_end(&mut data).context("read manifest body")?;
188 manifest_data = Some(data);
189 continue;
190 }
191 validate_entry_path(&path_str)?;
192 let mut data = Vec::new();
193 e.read_to_end(&mut data).context("read entry body")?;
194 if seen.insert(path_str.clone(), data).is_some() {
195 anyhow::bail!("duplicate path in bundle: {}", path_str);
196 }
197 }
198 let manifest_json = manifest_data.context("manifest.json missing in bundle")?;
199 let manifest: ReplayManifest =
200 serde_json::from_slice(&manifest_json).context("parse manifest.json")?;
201 let entries = seen.into_iter().collect();
202 Ok(ReadBundle { manifest, entries })
203}
204
/// Guesses a MIME type from the extension of `path`.
///
/// Recognizes `json`, `jsonl`, `xml`, `yaml` and `yml` (case-sensitive).
/// Returns `None` when there is no extension, when it is not valid UTF-8, or
/// when it is not one of the recognized values.
fn content_type_hint(path: &Path) -> Option<String> {
    let mime = match path.extension().and_then(|ext| ext.to_str())? {
        "json" => "application/json",
        "jsonl" => "application/x-ndjson",
        "xml" => "application/xml",
        "yaml" | "yml" => "application/x-yaml",
        _ => return None,
    };
    Some(mime.to_owned())
}
215
/// Unit tests for bundle writing, reading, digesting and path validation.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::replay::manifest::{
        ReplayCoverage, ReplayManifest, ReplayOutputs, ReplaySeeds, ScrubPolicy,
    };
    use std::collections::BTreeMap;

    /// Writing a minimal bundle yields bytes, and the digest is 64 hex characters.
    #[test]
    fn write_bundle_minimal_roundtrip() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        let entries = vec![BundleEntry {
            path: "outputs/summary.json".into(),
            data: br#"{"schema_version":1}"#.to_vec(),
        }];
        let mut buf = Vec::new();
        write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap();
        assert!(!buf.is_empty());
        let digest = bundle_digest(&manifest, &entries).unwrap();
        assert_eq!(digest.len(), 64);
    }

    /// A written bundle reads back with the same manifest versions and entry data.
    #[test]
    fn read_bundle_roundtrip() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        let entries = vec![
            BundleEntry {
                path: "files/trace.jsonl".into(),
                data: b"[]".to_vec(),
            },
            BundleEntry {
                path: "outputs/summary.json".into(),
                data: br#"{"schema_version":1}"#.to_vec(),
            },
        ];
        let mut buf = Vec::new();
        write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap();
        let read = read_bundle_tar_gz(std::io::Cursor::new(&buf)).unwrap();
        assert_eq!(read.manifest.schema_version, manifest.schema_version);
        assert_eq!(read.manifest.assay_version, manifest.assay_version);
        let paths: std::collections::BTreeSet<_> =
            read.entries.iter().map(|(p, _)| p.as_str()).collect();
        assert!(paths.contains("files/trace.jsonl"));
        assert!(paths.contains("outputs/summary.json"));
        let data: std::collections::BTreeMap<_, _> = read.entries.into_iter().collect();
        assert_eq!(data.get("files/trace.jsonl").unwrap(), &b"[]"[..]);
    }

    /// An archive without `manifest.json` is rejected by the reader.
    #[test]
    fn read_bundle_fails_manifest_missing() {
        // Hand-build an archive that contains only `files/x`.
        let mut buf = Vec::new();
        let gz = GzBuilder::new()
            .mtime(0)
            .write(&mut buf, flate2::Compression::default());
        let mut tar = Builder::new(gz);
        let mut header = Header::new_gnu();
        header.set_path("files/x").unwrap();
        header.set_size(0);
        header.set_mode(0o644);
        header.set_cksum();
        tar.append(&header, &[] as &[u8]).unwrap();
        let gz = tar.into_inner().unwrap();
        gz.finish().unwrap();
        let err = read_bundle_tar_gz(std::io::Cursor::new(&buf)).unwrap_err();
        assert!(err.to_string().contains("manifest.json missing"), "{}", err);
    }

    /// An archive containing the same entry path twice is rejected by the reader.
    #[test]
    fn read_bundle_fails_duplicate_path() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        let manifest_json = serde_json::to_vec(&manifest).unwrap();
        let mut buf = Vec::new();
        let gz = GzBuilder::new()
            .mtime(0)
            .write(&mut buf, flate2::Compression::default());
        let mut tar = Builder::new(gz);
        tar.mode(tar::HeaderMode::Deterministic);
        let mut h = Header::new_gnu();
        h.set_path(paths::MANIFEST).unwrap();
        h.set_size(manifest_json.len() as u64);
        h.set_mode(0o644);
        h.set_cksum();
        tar.append(&h, &manifest_json[..]).unwrap();
        // Append `files/x` twice to trigger the duplicate check.
        for _ in 0..2 {
            let mut h2 = Header::new_gnu();
            h2.set_path("files/x").unwrap();
            h2.set_size(1);
            h2.set_mode(0o644);
            h2.set_cksum();
            tar.append(&h2, &b"x"[..]).unwrap();
        }
        let gz = tar.into_inner().unwrap();
        gz.finish().unwrap();
        let err = read_bundle_tar_gz(std::io::Cursor::new(&buf)).unwrap_err();
        assert!(err.to_string().contains("duplicate path"), "{}", err);
    }

    /// The file manifest is keyed by path and records size and a prefixed hash.
    #[test]
    fn build_file_manifest_normalizes_paths() {
        let entries = vec![BundleEntry {
            path: "files/trace.jsonl".into(),
            data: vec![1, 2, 3],
        }];
        let manifest_map = build_file_manifest(&entries).unwrap();
        assert_eq!(manifest_map.len(), 1);
        let entry = manifest_map.get("files/trace.jsonl").unwrap();
        assert_eq!(entry.size, 3);
        assert!(entry.sha256.starts_with("sha256:"));
    }

    /// `..` inside a file name is allowed; only a whole `..` segment is traversal.
    #[test]
    fn path_segment_dotdot_allows_literal_dotdot_in_filename() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        let entries = vec![BundleEntry {
            path: "files/a..b.txt".into(),
            data: b"ok".to_vec(),
        }];
        let mut buf = Vec::new();
        write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap();
        let names = list_tar_gz_paths(&buf);
        assert!(names.contains(&"files/a..b.txt".to_string()));
    }

    /// Paths outside `files/`, `outputs/` and `cassettes/` are rejected by the writer.
    #[test]
    fn path_must_have_canonical_prefix() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        for bad in ["evil.txt", "x/y/z", "output/run.json"] {
            let entries = vec![BundleEntry {
                path: bad.to_string(),
                data: vec![],
            }];
            let err = write_bundle_tar_gz(&mut Vec::new(), &manifest, &entries).unwrap_err();
            assert!(
                err.to_string().contains("invalid bundle path prefix"),
                "{}",
                bad
            );
        }
    }

    /// A doubled `/` produces an empty segment and is rejected by the writer.
    #[test]
    fn path_rejects_empty_segment() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        let entries = vec![BundleEntry {
            path: "files//x.json".into(),
            data: vec![],
        }];
        let err = write_bundle_tar_gz(&mut Vec::new(), &manifest, &entries).unwrap_err();
        assert!(err.to_string().contains("empty segment"), "files//x");
    }

    /// Drive-letter style paths (a `:` in the first segment) are rejected by the writer.
    #[test]
    fn path_rejects_drive_letter() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        for bad in ["C:/foo", "C:\\foo", "D:bar"] {
            let entries = vec![BundleEntry {
                path: bad.to_string(),
                data: vec![],
            }];
            let err = write_bundle_tar_gz(&mut Vec::new(), &manifest, &entries).unwrap_err();
            assert!(
                err.to_string().contains("drive-letter")
                    || err.to_string().contains("first segment"),
                "{}",
                bad
            );
        }
    }

    /// One invalid path makes `build_file_manifest` fail as a whole.
    #[test]
    fn build_file_manifest_fail_closed_on_invalid_path() {
        let entries = vec![
            BundleEntry {
                path: "files/ok.json".into(),
                data: vec![],
            },
            BundleEntry {
                path: "../secrets.txt".into(),
                data: vec![],
            },
        ];
        let err = build_file_manifest(&entries).unwrap_err();
        assert!(err.to_string().contains("invalid bundle path"));
    }

    /// `bundle_digest` equals the SHA-256 of the bytes the writer emits.
    #[test]
    fn bundle_digest_equals_sha256_of_written_bytes() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        let entries = vec![
            BundleEntry {
                path: "files/trace.jsonl".into(),
                data: b"[]".to_vec(),
            },
            BundleEntry {
                path: "outputs/summary.json".into(),
                data: b"{}".to_vec(),
            },
        ];
        let mut buf = Vec::new();
        write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap();
        let digest_from_fn = bundle_digest(&manifest, &entries).unwrap();
        let hash_of_bytes = hex::encode(Sha256::digest(&buf));
        assert_eq!(
            digest_from_fn, hash_of_bytes,
            "bundle_digest must equal sha256(written bytes)"
        );
    }

    /// Traversal and empty paths are rejected, and a valid bundle contains no
    /// `..` or absolute paths.
    #[test]
    fn path_traversal_rejected_and_output_has_no_traversal() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        for bad_path in [
            "../secrets.txt",
            "files/../../etc/passwd",
            "outputs/../leak",
            "",
        ] {
            let entries = vec![BundleEntry {
                path: bad_path.to_string(),
                data: vec![],
            }];
            let mut buf = Vec::new();
            let err = write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap_err();
            assert!(
                err.to_string().contains("invalid bundle path"),
                "{}",
                bad_path
            );
        }
        // Positive case: inspect the names actually stored in the archive.
        let entries = vec![
            BundleEntry {
                path: "files/trace.jsonl".into(),
                data: b"[]".to_vec(),
            },
            BundleEntry {
                path: "outputs/run.json".into(),
                data: b"{}".to_vec(),
            },
        ];
        let mut buf = Vec::new();
        write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap();
        let names = list_tar_gz_paths(&buf);
        for name in &names {
            assert!(!name.contains(".."), "no .. in archive path: {}", name);
            assert!(
                !name.starts_with('/'),
                "no leading / in archive path: {}",
                name
            );
        }
        assert!(names.iter().any(|s| s == "manifest.json"));
        assert!(names.iter().any(|s| s.starts_with("files/")));
        assert!(names.iter().any(|s| s.starts_with("outputs/")));
    }

    /// A fully populated manifest still produces the canonical archive layout.
    #[test]
    fn audit_full_manifest_and_canonical_layout() {
        let mut reason = BTreeMap::new();
        reason.insert(
            "test_b".to_string(),
            "judge response not cached".to_string(),
        );
        let manifest = ReplayManifest {
            schema_version: 1,
            assay_version: "2.15.0".to_string(),
            created_at: Some("2025-01-27T12:00:00Z".to_string()),
            source_run_path: Some(".assay/run_abc123".to_string()),
            selection_method: Some("run-id".to_string()),
            git_sha: Some("a1b2c3d4e5f6".to_string()),
            git_dirty: Some(false),
            workflow_run_id: None,
            config_digest: None,
            policy_digest: None,
            baseline_digest: None,
            trace_digest: None,
            trace_path: Some("files/trace.jsonl".to_string()),
            outputs: Some(ReplayOutputs {
                run: Some("outputs/run.json".to_string()),
                summary: Some("outputs/summary.json".to_string()),
                junit: None,
                sarif: None,
            }),
            toolchain: None,
            seeds: Some(ReplaySeeds {
                seed_version: Some(1),
                order_seed: Some("42".to_string()),
                judge_seed: None,
            }),
            replay_coverage: Some(ReplayCoverage {
                complete_tests: vec!["test_a".to_string()],
                incomplete_tests: vec!["test_b".to_string()],
                reason: Some(reason),
            }),
            scrub_policy: Some(ScrubPolicy::default()),
            files: None,
            env: None,
        };
        let entries = vec![
            BundleEntry {
                path: "files/trace.jsonl".into(),
                data: b"[]".to_vec(),
            },
            BundleEntry {
                path: "outputs/run.json".into(),
                data: b"{}".to_vec(),
            },
            BundleEntry {
                path: "outputs/summary.json".into(),
                data: b"{}".to_vec(),
            },
            BundleEntry {
                path: "cassettes/.gitkeep".into(),
                data: vec![],
            },
        ];
        let mut buf = Vec::new();
        write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap();
        let names = list_tar_gz_paths(&buf);
        assert!(
            names.contains(&"manifest.json".to_string()),
            "canonical: manifest at root"
        );
        assert!(names
            .iter()
            .all(|p| !p.contains("..") && !p.starts_with('/')));
        assert!(names.contains(&"manifest.json".to_string()));
        assert!(names.iter().any(|p| p.starts_with("files/")));
        assert!(names.iter().any(|p| p.starts_with("outputs/")));
        assert!(names.iter().any(|p| p.starts_with("cassettes/")));
    }

    /// Pins the digest of a fixed bundle so that any change to the emitted
    /// bytes is noticed.
    #[test]
    fn golden_digest_snapshot() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        let entries = vec![BundleEntry {
            path: "files/trace.jsonl".into(),
            data: b"[]".to_vec(),
        }];
        let digest = bundle_digest(&manifest, &entries).unwrap();
        assert_eq!(
            digest, "e982d2dd1d7cf56df6b417c7af1bc3f7f334ecfc47298bf5d240f4485f3b7a7c",
            "Golden digest changed — if intentional, update this value after verifying \
             that the new output is still deterministic across platforms"
        );
    }

    /// Test helper: lists the entry paths of a tar.gz archive in stored order,
    /// with backslashes replaced by `/`.
    fn list_tar_gz_paths(gz: &[u8]) -> Vec<String> {
        let dec = flate2::read::GzDecoder::new(gz);
        let mut ar = tar::Archive::new(dec);
        let mut names = Vec::new();
        for e in ar.entries().unwrap() {
            let e = e.unwrap();
            let path = e.path().unwrap();
            names.push(path.to_string_lossy().replace('\\', "/"));
        }
        names
    }

    /// The manifest is stored first and the remaining entries are sorted by
    /// path, regardless of input order.
    #[test]
    fn entries_written_in_sorted_order() {
        let manifest = ReplayManifest::minimal("2.15.0".into());
        // Deliberately unsorted input.
        let entries = vec![
            BundleEntry {
                path: "outputs/z.json".into(),
                data: b"{}".to_vec(),
            },
            BundleEntry {
                path: "files/a.jsonl".into(),
                data: b"[]".to_vec(),
            },
            BundleEntry {
                path: "cassettes/m.json".into(),
                data: b"{}".to_vec(),
            },
        ];
        let mut buf = Vec::new();
        write_bundle_tar_gz(&mut buf, &manifest, &entries).unwrap();
        let names = list_tar_gz_paths(&buf);
        assert_eq!(names[0], "manifest.json", "manifest must be first");
        let data_entries: Vec<_> = names[1..].to_vec();
        let mut expected = data_entries.clone();
        expected.sort();
        assert_eq!(
            data_entries, expected,
            "entries after manifest must be in sorted order"
        );
    }

    /// Already canonical paths validate and are returned unchanged.
    #[test]
    fn validate_entry_path_accepts_valid_paths() {
        for good in [
            "files/trace.jsonl",
            "outputs/run.json",
            "cassettes/openai/embed.json",
            "files/a..b.txt",
            "files/deep/nested/dir/file.json",
        ] {
            let result = validate_entry_path(good);
            assert!(result.is_ok(), "should accept: {}", good);
            assert_eq!(result.unwrap(), good, "valid path returned unchanged");
        }
    }

    /// Backslashes become `/` and leading separators are stripped.
    #[test]
    fn validate_entry_path_normalizes_backslash_and_leading_slash() {
        assert_eq!(
            validate_entry_path("files\\trace.jsonl").unwrap(),
            "files/trace.jsonl"
        );
        assert_eq!(
            validate_entry_path("/files/trace.jsonl").unwrap(),
            "files/trace.jsonl"
        );
        assert_eq!(
            validate_entry_path("\\files\\trace.jsonl").unwrap(),
            "files/trace.jsonl"
        );
    }

    /// The empty path is rejected with the "empty path" message.
    #[test]
    fn validate_entry_path_rejects_empty() {
        let err = validate_entry_path("").unwrap_err();
        assert!(err.to_string().contains("empty path"));
    }

    /// A doubled `/` is rejected with the "empty segment" message.
    #[test]
    fn validate_entry_path_rejects_empty_segment() {
        let err = validate_entry_path("files//x.json").unwrap_err();
        assert!(err.to_string().contains("empty segment"));
    }

    /// Whole `.` and `..` segments are rejected as traversal.
    #[test]
    fn validate_entry_path_rejects_dot_segments() {
        for bad in ["files/./x.json", "files/../x.json", "outputs/.."] {
            let err = validate_entry_path(bad).unwrap_err();
            assert!(
                err.to_string().contains("traversal segment"),
                "should reject: {}",
                bad
            );
        }
    }

    /// A `:` in the first segment is rejected with the drive-letter message.
    #[test]
    fn validate_entry_path_rejects_drive_letter() {
        for bad in ["C:/foo", "D:bar"] {
            let err = validate_entry_path(bad).unwrap_err();
            assert!(
                err.to_string().contains("drive-letter"),
                "should reject: {}",
                bad
            );
        }
    }

    /// Near-miss prefixes such as `output/` and `file/` are rejected.
    #[test]
    fn validate_entry_path_rejects_non_canonical_prefix() {
        for bad in ["evil.txt", "x/y/z", "output/run.json", "file/x.json"] {
            let err = validate_entry_path(bad).unwrap_err();
            assert!(
                err.to_string().contains("invalid bundle path prefix"),
                "should reject: {}",
                bad
            );
        }
    }
}