1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
//! FJ-1346: FAR (Forjar ARchive) binary format — encode and decode.
//!
//! Layout: magic → manifest_len → zstd(manifest_yaml) → chunk_count
//! → chunk_table(hash+offset+len) → zstd(chunks) → sig_len → sig
use serde::{Deserialize, Serialize};
use std::io::{Read, Write};
/// 12-byte magic identifying a FAR archive.
///
/// The ASCII text `FORJAR-FAR` followed by the two bytes `0x00 0x01`.
/// `encode_far` writes it as the first bytes of every archive and
/// `decode_far_manifest` rejects any input that does not start with it.
// NOTE(review): the trailing `\x00\x01` looks like a format version — confirm.
pub const FAR_MAGIC: &[u8; 12] = b"FORJAR-FAR\x00\x01";
/// Upper bound on the compressed manifest length declared in the header.
///
/// The manifest is a small zstd-compressed YAML blob; a real one is a few KB.
/// We cap at 16 MiB so a corrupt/malicious header claiming a huge `manifest_len`
/// cannot drive an unbounded allocation (OOM abort / capacity-overflow panic).
/// The cap applies to the compressed bytes as stored in the archive.
const MAX_MANIFEST_LEN: usize = 16 * 1024 * 1024;
/// On-disk size in bytes of one serialized chunk-table entry: hash(32) + offset(8) + length(8).
///
/// `decode_far_manifest` divides `u64::MAX` by this value to reject chunk counts
/// whose total table size could not be expressed in a `u64`.
const CHUNK_ENTRY_BYTES: u64 = 32 + 8 + 8;
/// A single chunk entry in the chunk table.
///
/// On disk an entry is 48 bytes: the 32-byte `hash`, then `offset` and `length`
/// each stored as a little-endian `u64`. `decode_far_manifest` returns one
/// `ChunkEntry` per table row, in archive order.
#[derive(Debug, Clone, PartialEq)]
pub struct ChunkEntry {
/// BLAKE3 hash of the chunk content.
///
/// `encode_far` copies the caller-supplied hash verbatim and the decoder does
/// not recompute it, so nothing in this file verifies the value.
// NOTE(review): presumably the hash of the uncompressed chunk — confirm with the producer.
pub hash: [u8; 32],
/// Byte offset within the archive data section.
///
/// `encode_far` starts at 0 for the first chunk and advances by each chunk's
/// compressed length, so the offset is relative to the first byte of chunk data.
pub offset: u64,
/// Compressed length in bytes.
pub length: u64,
}
/// Manifest embedded in a FAR archive.
///
/// `encode_far` serializes this struct to YAML and zstd-compresses it;
/// `decode_far_manifest` reverses both steps. Fields are serialized in
/// declaration order, so reordering them changes the bytes written.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FarManifest {
/// Package or artifact name.
pub name: String,
/// Version string.
pub version: String,
/// Target architecture (e.g., "x86_64").
pub arch: String,
/// Content-addressed store hash.
pub store_hash: String,
/// Merkle tree hash for streaming verification.
pub tree_hash: String,
/// Number of files in the archive.
// NOTE(review): not cross-checked against `files.len()` anywhere in this file.
pub file_count: u64,
/// Total uncompressed size in bytes.
pub total_size: u64,
/// Per-file entries with path, size, and hash.
pub files: Vec<FarFileEntry>,
/// Build provenance metadata.
pub provenance: FarProvenance,
/// Optional kernel contract metadata (for ML models).
///
/// Omitted from the YAML when `None`; a manifest without the key
/// deserializes to `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub kernel_contracts: Option<KernelContractInfo>,
}
/// Kernel contract metadata embedded in a FAR manifest.
///
/// Carried in the optional `kernel_contracts` field of `FarManifest`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct KernelContractInfo {
/// Model type identifier (e.g., "llama", "qwen2").
pub model_type: String,
/// Required kernel operations.
pub required_ops: Vec<String>,
/// Contract coverage percentage.
// NOTE(review): scale (0–100 vs 0.0–1.0) is not established in this file — confirm.
pub coverage_pct: f64,
}
/// A file entry within the FAR manifest.
///
/// One element of `FarManifest::files`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FarFileEntry {
/// Relative file path within the archive.
pub path: String,
/// File size in bytes.
pub size: u64,
/// BLAKE3 hash of the file content.
// NOTE(review): stored as text; presumably hex-encoded — confirm with the producer.
pub blake3: String,
}
/// Provenance metadata for the FAR archive.
///
/// Stored in the `provenance` field of `FarManifest`. The two optional fields
/// are left out of the serialized YAML when `None` and default to `None` when
/// the key is missing on read.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FarProvenance {
/// Provider that produced this archive (e.g., "apt", "conda").
pub origin_provider: String,
/// Upstream reference for traceability.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub origin_ref: Option<String>,
/// Upstream content hash at build time.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub origin_hash: Option<String>,
/// ISO 8601 creation timestamp.
// NOTE(review): kept as a plain string; the format is not validated in this file.
pub created_at: String,
/// Generator string (e.g., "forjar 1.0.0").
pub generator: String,
}
/// Serialize `manifest` and `chunks` as a FAR archive into `writer`.
///
/// Sections are emitted in this order, every integer as a little-endian `u64`:
/// magic → manifest_len → zstd(manifest_yaml) → chunk_count → chunk_table
/// → zstd(chunks) → signature_len (always 0, meaning "unsigned").
///
/// Each element of `chunks` is a `(hash, payload)` pair. The hash is copied
/// verbatim into the chunk table; the payload is zstd-compressed (level 3) on
/// its own. Table offsets are running totals of the compressed lengths, so they
/// are relative to the first byte of the chunk data section.
///
/// # Errors
///
/// Returns a `String` naming the failed step when the manifest cannot be
/// serialized to YAML, zstd compression fails, or a write/flush on `writer`
/// fails. Bytes already written before the failure are not rolled back.
pub fn encode_far<W: Write>(
    manifest: &FarManifest,
    chunks: &[([u8; 32], Vec<u8>)],
    mut writer: W,
) -> Result<(), String> {
    // Write `bytes` to `sink`, labelling any I/O failure with the section name.
    fn put<S: Write>(sink: &mut S, bytes: &[u8], what: &str) -> Result<(), String> {
        sink.write_all(bytes)
            .map_err(|e| format!("write {what}: {e}"))
    }

    // Header: magic bytes first.
    put(&mut writer, FAR_MAGIC, "magic")?;

    // Manifest section: YAML text, zstd-compressed, prefixed with its length.
    let manifest_yaml =
        serde_yaml_ng::to_string(manifest).map_err(|e| format!("serialize manifest: {e}"))?;
    let manifest_zst = zstd::encode_all(manifest_yaml.as_bytes(), 3)
        .map_err(|e| format!("zstd manifest: {e}"))?;
    put(
        &mut writer,
        &(manifest_zst.len() as u64).to_le_bytes(),
        "manifest_len",
    )?;
    put(&mut writer, &manifest_zst, "manifest")?;

    // Number of chunk-table rows that follow.
    put(
        &mut writer,
        &(chunks.len() as u64).to_le_bytes(),
        "chunk_count",
    )?;

    // The table precedes the data, so every chunk must be compressed up front
    // to learn its length. Stops at the first compression failure.
    let packed = chunks
        .iter()
        .map(|(_, payload)| {
            zstd::encode_all(payload.as_slice(), 3).map_err(|e| format!("zstd chunk: {e}"))
        })
        .collect::<Result<Vec<Vec<u8>>, String>>()?;

    // Chunk table: one (hash, offset, length) row per chunk.
    let mut cursor: u64 = 0;
    for ((digest, _), blob) in chunks.iter().zip(&packed) {
        let blob_len = blob.len() as u64;
        put(&mut writer, digest, "chunk hash")?;
        put(&mut writer, &cursor.to_le_bytes(), "chunk offset")?;
        put(&mut writer, &blob_len.to_le_bytes(), "chunk length")?;
        cursor += blob_len;
    }

    // Chunk data, back to back in table order.
    for blob in &packed {
        put(&mut writer, blob, "chunk data")?;
    }

    // Trailer: a zero signature length marks the archive as unsigned.
    put(&mut writer, &0u64.to_le_bytes(), "sig_len")?;

    writer.flush().map_err(|e| format!("flush: {e}"))
}
/// Decode the manifest and chunk table from a FAR archive (streaming — no full load).
///
/// Reads, in order: the 12-byte magic, the little-endian `u64` manifest length,
/// the zstd-compressed YAML manifest, the little-endian `u64` chunk count, and
/// one 48-byte table row (hash, offset, length) per chunk. The chunk data and
/// signature that follow the table are left unread in `reader`.
///
/// All header fields are treated as untrusted: the compressed manifest is capped
/// at `MAX_MANIFEST_LEN`, the decompressed manifest is capped at 256 MiB, and the
/// chunk table is grown one row at a time rather than pre-sized from the header.
///
/// # Errors
///
/// Returns a descriptive `String` when the magic does not match, a declared or
/// decompressed manifest size exceeds its cap, the input ends early, zstd
/// decompression fails, the YAML does not parse as a `FarManifest`, or the
/// declared chunk count is too large for its table size to fit in a `u64`.
pub fn decode_far_manifest<R: Read>(
    mut reader: R,
) -> Result<(FarManifest, Vec<ChunkEntry>), String> {
    // Cap on the decompressed manifest YAML. zstd can expand a few MiB into
    // many GiB, so the compressed-size cap alone does not bound memory use.
    const MAX_MANIFEST_YAML_LEN: u64 = 256 * 1024 * 1024;

    // Magic
    let mut magic = [0u8; 12];
    reader
        .read_exact(&mut magic)
        .map_err(|e| format!("read magic: {e}"))?;
    if magic != *FAR_MAGIC {
        return Err("invalid FAR magic".to_string());
    }

    // Manifest length
    let mut len_buf = [0u8; 8];
    reader
        .read_exact(&mut len_buf)
        .map_err(|e| format!("read manifest_len: {e}"))?;
    let declared_len = u64::from_le_bytes(len_buf);
    // #17/#24: the declared length is attacker-controlled. Compare it as a u64
    // BEFORE narrowing: casting first would truncate on 32-bit targets and let
    // an oversized value slip under the limit.
    if declared_len > MAX_MANIFEST_LEN as u64 {
        return Err(format!(
            "manifest too large: {declared_len} bytes exceeds {MAX_MANIFEST_LEN} limit"
        ));
    }
    // Lossless: declared_len <= MAX_MANIFEST_LEN, which is itself a usize.
    let manifest_len = declared_len as usize;

    // Compressed manifest. Read through a length-limited reader so the buffer
    // grows only as bytes actually arrive (a short file fails gracefully).
    let mut compressed = Vec::new();
    let read = (&mut reader)
        .take(declared_len)
        .read_to_end(&mut compressed)
        .map_err(|e| format!("read manifest: {e}"))?;
    if read != manifest_len {
        return Err(format!(
            "read manifest: expected {manifest_len} bytes, got {read}"
        ));
    }

    // Decompress with a hard output bound. Reading one byte past the cap lets
    // us tell "exactly at the limit" apart from "over the limit".
    let decoder = zstd::stream::read::Decoder::new(compressed.as_slice())
        .map_err(|e| format!("zstd decompress: {e}"))?;
    let mut yaml_bytes = Vec::new();
    decoder
        .take(MAX_MANIFEST_YAML_LEN + 1)
        .read_to_end(&mut yaml_bytes)
        .map_err(|e| format!("zstd decompress: {e}"))?;
    if yaml_bytes.len() as u64 > MAX_MANIFEST_YAML_LEN {
        return Err(format!(
            "manifest too large: decompressed size exceeds {MAX_MANIFEST_YAML_LEN} bytes limit"
        ));
    }
    let manifest: FarManifest =
        serde_yaml_ng::from_slice(&yaml_bytes).map_err(|e| format!("parse manifest: {e}"))?;

    // Chunk count
    reader
        .read_exact(&mut len_buf)
        .map_err(|e| format!("read chunk_count: {e}"))?;
    let chunk_count = u64::from_le_bytes(len_buf);

    // Chunk table. #18/#25: chunk_count is attacker-controlled, so we do NOT
    // pre-size the Vec from it (Vec::with_capacity(chunk_count * 48) overflows
    // isize / OOMs for huge counts). Instead grow on each successful read — a
    // truncated/corrupt archive then fails via read_exact's Err. As a fast
    // bail-out we also reject counts whose table size (count * 48 bytes) could
    // not be represented in a u64 at all.
    let max_chunks = u64::MAX / CHUNK_ENTRY_BYTES;
    if chunk_count > max_chunks {
        return Err(format!("chunk_count too large: {chunk_count}"));
    }
    let mut entries: Vec<ChunkEntry> = Vec::new();
    for _ in 0..chunk_count {
        let mut hash = [0u8; 32];
        reader
            .read_exact(&mut hash)
            .map_err(|e| format!("read chunk hash: {e}"))?;
        reader
            .read_exact(&mut len_buf)
            .map_err(|e| format!("read chunk offset: {e}"))?;
        let offset = u64::from_le_bytes(len_buf);
        reader
            .read_exact(&mut len_buf)
            .map_err(|e| format!("read chunk length: {e}"))?;
        let length = u64::from_le_bytes(len_buf);
        entries.push(ChunkEntry {
            hash,
            offset,
            length,
        });
    }
    Ok((manifest, entries))
}