llmtxt_core/export_archive.rs
1//! Export archive primitives for GDPR data portability (T094).
2//!
3//! This module defines the canonical [`ExportArchive`] schema and two pure
4//! functions:
5//!
6//! * [`serialize_export_archive`] — produce a deterministic, versioned JSON
7//! representation of a user export archive. The output is byte-identical
8//! across every platform (native Rust, WASM) for the same input.
9//!
10//! * [`deserialize_export_archive`] — parse the JSON representation back into
11//! an [`ExportArchive`] and verify the embedded [`ExportArchive::content_hash`]
12//! field so consumers can detect tampering or truncation.
13//!
14//! # Versioning
15//!
16//! The `archive_version` field MUST be incremented whenever the schema changes
17//! in a backwards-incompatible way. Deserializers SHOULD reject archives whose
18//! `archive_version` is greater than the version they understand.
19//!
20//! # Security
21//!
22//! The `content_hash` is the SHA-256 hex digest of the canonical payload bytes
23//! (the UTF-8 serialisation of the archive with `content_hash` set to an empty
24//! string placeholder). This means the hash covers all user data, preventing
25//! partial-data delivery without detection.
26
27use serde::{Deserialize, Serialize};
28use sha2::{Digest, Sha256};
29
30#[cfg(feature = "wasm")]
31use wasm_bindgen::prelude::*;
32
/// Highest archive schema version this module writes and accepts.
///
/// [`deserialize_export_archive`] rejects any archive whose `archive_version`
/// is greater than this value.
pub const ARCHIVE_VERSION: u32 = 1;
35
36// ── Schema ──────────────────────────────────────────────────────────────────
37
38/// Canonical schema for a GDPR user-data export archive (T094).
39///
40/// All fields are serialised deterministically: `serde_json` with the default
41/// feature set preserves insertion order for struct fields, so the output is
42/// byte-identical on every platform for the same `ExportArchive` value.
43#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
44#[serde(rename_all = "camelCase")]
45pub struct ExportArchive {
46 /// Format version. Consumers MUST reject archives with a higher version.
47 pub archive_version: u32,
48
49 /// ISO 8601 timestamp at which the archive was generated (e.g.
50 /// `"2026-04-18T00:00:00Z"`). Injected by the backend at export time.
51 pub exported_at: String,
52
53 // ── User profile ──────────────────────────────────────────────────────
54 /// Internal user ID (opaque string).
55 pub user_id: String,
56 /// Display name (may be empty for anonymous users).
57 pub user_name: String,
58 /// Email address. Empty string for anonymous / email-less accounts.
59 pub user_email: String,
60 /// ISO 8601 account creation timestamp.
61 pub user_created_at: String,
62
63 // ── Documents ─────────────────────────────────────────────────────────
64 /// All documents owned by the user at export time.
65 pub documents: Vec<ExportDocument>,
66
67 // ── API keys ─────────────────────────────────────────────────────────
68 /// SHA-256 hashes of all API keys (raw key values are never stored).
69 pub api_key_hashes: Vec<ExportApiKey>,
70
71 // ── Audit log ────────────────────────────────────────────────────────
72 /// The user's audit-log slice (actions performed by or on their behalf).
73 /// Entries are pseudonymised — no raw IP addresses are included.
74 pub audit_log: Vec<ExportAuditEntry>,
75
76 // ── Webhooks ─────────────────────────────────────────────────────────
77 /// Webhook registrations owned by the user. Signing secrets are NOT
78 /// exported for security reasons.
79 pub webhooks: Vec<ExportWebhook>,
80
81 // ── Integrity ────────────────────────────────────────────────────────
82 /// SHA-256 hex digest of the canonical archive bytes (computed with this
83 /// field set to `""`). Verified by [`deserialize_export_archive`].
84 pub content_hash: String,
85}
86
87/// A single owned document in the export archive.
88#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
89#[serde(rename_all = "camelCase")]
90pub struct ExportDocument {
91 /// Internal document ID.
92 pub id: String,
93 /// Short URL slug (8 chars).
94 pub slug: String,
95 /// Document title / first heading.
96 pub title: Option<String>,
97 /// Lifecycle state at export time: `DRAFT | REVIEW | LOCKED | ARCHIVED`.
98 pub state: String,
99 /// Document format: `json | text | markdown`.
100 pub format: String,
101 /// ISO 8601 creation timestamp.
102 pub created_at: String,
103 /// ISO 8601 last-updated timestamp.
104 pub updated_at: Option<String>,
105 /// Current document content (UTF-8 text; decompressed inline).
106 pub content: String,
107 /// All version snapshots.
108 pub versions: Vec<ExportVersion>,
109}
110
111/// A single version snapshot in the export archive.
112#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
113#[serde(rename_all = "camelCase")]
114pub struct ExportVersion {
115 /// Monotonically increasing version number (starts at 0).
116 pub version_number: u32,
117 /// SHA-256 hex of the uncompressed content at this version.
118 pub content_hash: String,
119 /// ISO 8601 creation timestamp.
120 pub created_at: String,
121 /// Agent / user that authored this version.
122 pub created_by: Option<String>,
123 /// Human-readable changelog entry for this version.
124 pub changelog: Option<String>,
125}
126
127/// An API key entry — raw key values are never exported, only metadata.
128#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
129#[serde(rename_all = "camelCase")]
130pub struct ExportApiKey {
131 /// Internal API key ID.
132 pub id: String,
133 /// Human-readable key name (e.g. `"CI Bot"`).
134 pub name: String,
135 /// Display prefix visible in the dashboard (e.g. `"llmtxt_abcd1234"`).
136 pub key_prefix: String,
137 /// SHA-256 hex digest of the raw key. Allows users to identify keys.
138 pub key_hash: String,
139 /// ISO 8601 creation timestamp.
140 pub created_at: String,
141 /// ISO 8601 expiry timestamp, or `null` for no-expiry keys.
142 pub expires_at: Option<String>,
143 /// Whether the key was revoked at export time.
144 pub revoked: bool,
145}
146
147/// A single audit log entry in the export archive.
148///
149/// IP addresses are NOT included to minimise personal data in the export.
150#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
151#[serde(rename_all = "camelCase")]
152pub struct ExportAuditEntry {
153 /// Opaque audit log entry ID.
154 pub id: String,
155 /// Structured action name (e.g. `"document.create"`).
156 pub action: String,
157 /// Resource type (e.g. `"document"`).
158 pub resource_type: String,
159 /// Resource ID/slug.
160 pub resource_id: Option<String>,
161 /// Unix millisecond timestamp.
162 pub timestamp: i64,
163}
164
165/// A webhook registration in the export archive.
166#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
167#[serde(rename_all = "camelCase")]
168pub struct ExportWebhook {
169 /// Internal webhook ID.
170 pub id: String,
171 /// Target callback URL.
172 pub url: String,
173 /// JSON array of subscribed event types (e.g. `["version.created"]`).
174 pub events: String,
175 /// Optional document slug scope (`null` = all documents).
176 pub document_slug: Option<String>,
177 /// Whether the webhook is active at export time.
178 pub active: bool,
179 /// ISO 8601 creation timestamp.
180 pub created_at: String,
181}
182
183// ── Serialisation ────────────────────────────────────────────────────────────
184
185/// Compute the content hash of an [`ExportArchive`].
186///
187/// The hash covers the canonical JSON bytes with `content_hash` set to `""`
188/// (empty string), so the hash field itself is not part of the payload.
189fn compute_content_hash(archive: &ExportArchive) -> String {
190 let mut scratch = archive.clone();
191 scratch.content_hash = String::new();
192 // serde_json produces deterministic output for structs (field order = declaration order).
193 #[allow(clippy::expect_used)]
194 let canonical = serde_json::to_vec(&scratch)
195 .expect("ExportArchive serialisation must not fail — no unbounded types");
196 let mut hasher = Sha256::new();
197 hasher.update(&canonical);
198 hex::encode(hasher.finalize())
199}
200
201/// Serialise an [`ExportArchive`] to a JSON string.
202///
203/// Computes and embeds the `content_hash` before serialisation, so the
204/// returned JSON string contains a valid integrity field.
205///
206/// # Panics
207///
208/// Never panics — all types in [`ExportArchive`] are serialisable.
209///
210/// # WASM
211///
212/// Exposed as `serialize_export_archive(json: String) -> String` via
213/// `wasm_bindgen`. The input JSON is first deserialised into an
214/// [`ExportArchive`] (so the hash is recomputed from the actual data),
215/// then re-serialised with the hash embedded.
216pub fn serialize_export_archive(archive: &ExportArchive) -> String {
217 let mut stamped = archive.clone();
218 stamped.content_hash = compute_content_hash(archive);
219 #[allow(clippy::expect_used)]
220 serde_json::to_string(&stamped).expect("ExportArchive serialisation must not fail")
221}
222
223/// Deserialise an [`ExportArchive`] from a JSON string and verify its integrity.
224///
225/// Returns `Err(String)` if:
226/// - The JSON is malformed.
227/// - The `archive_version` is greater than [`ARCHIVE_VERSION`].
228/// - The `content_hash` does not match the recomputed hash.
229pub fn deserialize_export_archive(json: &str) -> Result<ExportArchive, String> {
230 let archive: ExportArchive = serde_json::from_str(json)
231 .map_err(|e| format!("deserialize_export_archive: JSON parse error: {e}"))?;
232
233 if archive.archive_version > ARCHIVE_VERSION {
234 return Err(format!(
235 "deserialize_export_archive: unsupported archive_version {}; max supported: {}",
236 archive.archive_version, ARCHIVE_VERSION
237 ));
238 }
239
240 let expected_hash = compute_content_hash(&archive);
241 if archive.content_hash != expected_hash {
242 return Err(format!(
243 "deserialize_export_archive: content_hash mismatch — \
244 expected {expected_hash}, got {}",
245 archive.content_hash
246 ));
247 }
248
249 Ok(archive)
250}
251
252// ── WASM shims ───────────────────────────────────────────────────────────────
253
254/// WASM binding for [`serialize_export_archive`].
255///
256/// Accepts a JSON string representing an [`ExportArchive`] *without*
257/// a valid `content_hash`, computes the hash, and returns the JSON
258/// string with the hash embedded.
259///
260/// Returns `{"error":"..."}` on parse failure.
261#[cfg_attr(feature = "wasm", wasm_bindgen)]
262pub fn serialize_export_archive_wasm(archive_json: &str) -> String {
263 let archive: ExportArchive = match serde_json::from_str(archive_json) {
264 Ok(a) => a,
265 Err(e) => {
266 return format!(r#"{{"error":"serialize_export_archive_wasm parse error: {e}"}}"#);
267 }
268 };
269 serialize_export_archive(&archive)
270}
271
272/// WASM binding for [`deserialize_export_archive`].
273///
274/// Returns the verified archive JSON on success, or `{"error":"..."}` on
275/// any failure (parse error, version mismatch, hash mismatch).
276#[cfg_attr(feature = "wasm", wasm_bindgen)]
277pub fn deserialize_export_archive_wasm(archive_json: &str) -> String {
278 match deserialize_export_archive(archive_json) {
279 Ok(archive) => serde_json::to_string(&archive)
280 .unwrap_or_else(|e| format!(r#"{{"error":"re-serialise failed: {e}"}}"#)),
281 Err(e) => format!(r#"{{"error":{}}}"#, serde_json::json!(e)),
282 }
283}
284
285// ── Retention policy DSL ─────────────────────────────────────────────────────
286
287/// Retention policy configuration (T186).
288///
289/// Describes per-resource-type retention windows in days. A value of `0`
290/// means "retain indefinitely". The backend enforces these policies via a
291/// nightly background job.
292#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
293#[serde(rename_all = "camelCase")]
294pub struct RetentionPolicy {
295 /// Policy format version (incremented on incompatible schema changes).
296 pub policy_version: u32,
297 /// Audit log entries: days to retain in the hot database.
298 /// Entries older than this are moved to cold archive (S3).
299 /// 0 = keep forever in hot DB.
300 pub audit_log_hot_days: u32,
301 /// Audit log entries: total retention in days (hot + cold combined).
302 /// 0 = keep forever. MUST be >= `audit_log_hot_days`.
303 pub audit_log_total_days: u32,
304 /// Soft-deleted documents: days before hard deletion.
305 /// 0 = hard-delete immediately (not recommended).
306 pub soft_deleted_docs_days: u32,
307 /// Expired (anonymous) document TTL in days.
308 /// 0 = purge immediately when `expires_at` passes.
309 pub anonymous_doc_days: u32,
310 /// Expired API keys: days before hard purge of revoked rows.
311 /// 0 = purge immediately.
312 pub revoked_api_key_days: u32,
313 /// Agent inbox messages: days before hard purge.
314 pub agent_inbox_days: u32,
315}
316
317impl Default for RetentionPolicy {
318 fn default() -> Self {
319 Self {
320 policy_version: 1,
321 // Audit: 90 days hot, 7 years total (2555 days ≈ 7 * 365).
322 audit_log_hot_days: 90,
323 audit_log_total_days: 2555,
324 // Soft-deleted documents: 30-day grace period (T187).
325 soft_deleted_docs_days: 30,
326 // Anonymous docs: purge 1 day after expiry.
327 anonymous_doc_days: 1,
328 // Revoked API keys: keep 90 days for audit reference.
329 revoked_api_key_days: 90,
330 // Agent inbox: 2-day TTL (matches schema).
331 agent_inbox_days: 2,
332 }
333 }
334}
335
336/// Serialise a [`RetentionPolicy`] to a JSON string.
337pub fn serialize_retention_policy(policy: &RetentionPolicy) -> String {
338 #[allow(clippy::expect_used)]
339 serde_json::to_string(policy).expect("RetentionPolicy serialisation must not fail")
340}
341
342/// Deserialise a [`RetentionPolicy`] from a JSON string.
343pub fn deserialize_retention_policy(json: &str) -> Result<RetentionPolicy, String> {
344 serde_json::from_str(json).map_err(|e| format!("deserialize_retention_policy: {e}"))
345}
346
347// ── Tests ────────────────────────────────────────────────────────────────────
348
#[cfg(test)]
mod tests {
    use super::*;

    /// Timestamp shared by most fixture fields.
    const T0: &str = "2026-01-01T00:00:00Z";

    /// Fixture: one document with one version, one API key, one audit entry
    /// and one webhook. `content_hash` is left empty on purpose;
    /// `serialize_export_archive` fills it in.
    fn sample_archive() -> ExportArchive {
        let first_version = ExportVersion {
            version_number: 0,
            content_hash: String::from("abc123"),
            created_at: String::from(T0),
            created_by: Some(String::from("agent_1")),
            changelog: None,
        };
        let document = ExportDocument {
            id: String::from("doc_1"),
            slug: String::from("abcd1234"),
            title: Some(String::from("My Document")),
            state: String::from("DRAFT"),
            format: String::from("markdown"),
            created_at: String::from(T0),
            updated_at: Some(String::from("2026-01-02T00:00:00Z")),
            content: String::from("# My Document\n\nHello world."),
            versions: vec![first_version],
        };
        let api_key = ExportApiKey {
            id: String::from("key_1"),
            name: String::from("CI Bot"),
            key_prefix: String::from("llmtxt_abc"),
            key_hash: String::from("deadbeef"),
            created_at: String::from(T0),
            expires_at: None,
            revoked: false,
        };
        let audit_entry = ExportAuditEntry {
            id: String::from("evt_1"),
            action: String::from("document.create"),
            resource_type: String::from("document"),
            resource_id: Some(String::from("abcd1234")),
            timestamp: 1_700_000_000_000,
        };
        let webhook = ExportWebhook {
            id: String::from("wh_1"),
            url: String::from("https://example.com/hook"),
            events: String::from(r#"["version.created"]"#),
            document_slug: None,
            active: true,
            created_at: String::from(T0),
        };

        ExportArchive {
            archive_version: ARCHIVE_VERSION,
            exported_at: String::from("2026-04-18T00:00:00Z"),
            user_id: String::from("usr_test"),
            user_name: String::from("Test User"),
            user_email: String::from("test@example.com"),
            user_created_at: String::from(T0),
            documents: vec![document],
            api_key_hashes: vec![api_key],
            audit_log: vec![audit_entry],
            webhooks: vec![webhook],
            content_hash: String::new(),
        }
    }

    #[test]
    fn test_serialize_sets_content_hash() {
        let json = serialize_export_archive(&sample_archive());
        // Parse without verification: only the embedded field is of interest.
        let parsed: ExportArchive = serde_json::from_str(&json).unwrap();
        let hash = &parsed.content_hash;
        assert!(!hash.is_empty(), "content_hash should be set");
        assert_eq!(hash.len(), 64, "SHA-256 hex is 64 chars");
    }

    #[test]
    fn test_deserialize_verifies_hash() {
        let json = serialize_export_archive(&sample_archive());
        let outcome = deserialize_export_archive(&json);
        assert!(
            outcome.is_ok(),
            "valid archive should deserialise: {outcome:?}"
        );
    }

    #[test]
    fn test_tampered_archive_rejected() {
        // Change document content after the hash has been embedded.
        let tampered = serialize_export_archive(&sample_archive())
            .replace("Hello world.", "Hello tamper.");
        let outcome = deserialize_export_archive(&tampered);
        assert!(outcome.is_err(), "tampered archive must be rejected");
        let message = outcome.unwrap_err();
        assert!(
            message.contains("content_hash mismatch"),
            "error should mention hash mismatch"
        );
    }

    #[test]
    fn test_roundtrip_byte_identical() {
        let first = serialize_export_archive(&sample_archive());
        // Serialise -> deserialise -> serialise must reproduce the same bytes.
        let recovered = deserialize_export_archive(&first).unwrap();
        let second = serialize_export_archive(&recovered);
        assert_eq!(first, second, "round-trip must be byte-identical");
    }

    #[test]
    fn test_unsupported_version_rejected() {
        let mut future = ExportArchive {
            archive_version: ARCHIVE_VERSION + 1,
            content_hash: String::new(),
            ..sample_archive()
        };
        // Embed a correct hash by hand so that only the version can be the
        // reason for rejection.
        let canonical = serde_json::to_vec(&future).unwrap();
        future.content_hash = hex::encode(sha2::Sha256::digest(&canonical));

        let json = serde_json::to_string(&future).unwrap();
        let outcome = deserialize_export_archive(&json);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err();
        assert!(
            message.contains("unsupported archive_version"),
            "error should mention unsupported version"
        );
    }

    #[test]
    fn test_retention_policy_default_roundtrip() {
        let original = RetentionPolicy::default();
        let encoded = serialize_retention_policy(&original);
        let decoded = deserialize_retention_policy(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    #[test]
    fn test_retention_policy_defaults() {
        let RetentionPolicy {
            audit_log_hot_days,
            audit_log_total_days,
            soft_deleted_docs_days,
            agent_inbox_days,
            ..
        } = RetentionPolicy::default();
        assert_eq!(audit_log_hot_days, 90);
        assert_eq!(audit_log_total_days, 2555);
        assert_eq!(soft_deleted_docs_days, 30);
        assert_eq!(agent_inbox_days, 2);
    }
}