Skip to main content

llmtxt_core/
export_archive.rs

1//! Export archive primitives for GDPR data portability (T094).
2//!
3//! This module defines the canonical [`ExportArchive`] schema and two pure
4//! functions:
5//!
6//! * [`serialize_export_archive`] — produce a deterministic, versioned JSON
7//!   representation of a user export archive.  The output is byte-identical
8//!   across every platform (native Rust, WASM) for the same input.
9//!
10//! * [`deserialize_export_archive`] — parse the JSON representation back into
11//!   an [`ExportArchive`] and verify the embedded [`ExportArchive::content_hash`]
12//!   field so consumers can detect tampering or truncation.
13//!
14//! # Versioning
15//!
16//! The `archive_version` field MUST be incremented whenever the schema changes
17//! in a backwards-incompatible way.  Deserializers SHOULD reject archives whose
18//! `archive_version` is greater than the version they understand.
19//!
20//! # Security
21//!
//! The `content_hash` is the SHA-256 hex digest of the canonical payload bytes
//! (the UTF-8 serialisation of the archive with `content_hash` set to an empty
//! string placeholder).  Because the hash covers all user data, a truncated or
//! partially delivered archive fails verification.  The digest is unkeyed, so
//! it detects corruption but does not by itself authenticate the archive's origin.
26
27use serde::{Deserialize, Serialize};
28use sha2::{Digest, Sha256};
29
30#[cfg(feature = "wasm")]
31use wasm_bindgen::prelude::*;
32
/// Current archive format version.
///
/// [`deserialize_export_archive`] rejects any archive whose `archive_version`
/// is greater than this value.
pub const ARCHIVE_VERSION: u32 = 1;
35
36// ── Schema ──────────────────────────────────────────────────────────────────
37
/// Canonical schema for a GDPR user-data export archive (T094).
///
/// JSON keys are camelCase (`archiveVersion`, `exportedAt`, …) because of
/// `#[serde(rename_all = "camelCase")]`.
///
/// Field declaration order is part of the format: `compute_content_hash`
/// hashes the `serde_json` output of this struct, and serde emits struct
/// fields in declaration order.  Reordering, renaming, adding or removing a
/// field therefore changes the hash of every archive and is a schema change
/// in the sense of the module-level "Versioning" section.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ExportArchive {
    /// Format version.  Consumers MUST reject archives with a higher version
    /// than they understand (see [`ARCHIVE_VERSION`]).
    pub archive_version: u32,

    /// ISO 8601 timestamp at which the archive was generated (e.g.
    /// `"2026-04-18T00:00:00Z"`).  Injected by the backend at export time;
    /// carried as a plain string and not parsed by this module.
    pub exported_at: String,

    // ── User profile ──────────────────────────────────────────────────────
    /// Internal user ID (opaque string).
    pub user_id: String,
    /// Display name (may be empty for anonymous users).
    pub user_name: String,
    /// Email address.  Empty string for anonymous / email-less accounts.
    pub user_email: String,
    /// ISO 8601 account creation timestamp.
    pub user_created_at: String,

    // ── Documents ─────────────────────────────────────────────────────────
    /// All documents owned by the user at export time.
    pub documents: Vec<ExportDocument>,

    // ── API keys ─────────────────────────────────────────────────────────
    /// Metadata and SHA-256 hashes for the user's API keys.  Raw key values
    /// are never part of the archive (see [`ExportApiKey`]).
    pub api_key_hashes: Vec<ExportApiKey>,

    // ── Audit log ────────────────────────────────────────────────────────
    /// The user's audit-log slice (actions performed by or on their behalf).
    /// Entries are pseudonymised — [`ExportAuditEntry`] has no IP address field.
    pub audit_log: Vec<ExportAuditEntry>,

    // ── Webhooks ─────────────────────────────────────────────────────────
    /// Webhook registrations owned by the user.  Signing secrets are NOT
    /// exported for security reasons ([`ExportWebhook`] has no secret field).
    pub webhooks: Vec<ExportWebhook>,

    // ── Integrity ────────────────────────────────────────────────────────
    /// SHA-256 hex digest of the canonical archive bytes (computed with this
    /// field set to `""`).  Filled in by [`serialize_export_archive`] and
    /// verified by [`deserialize_export_archive`].
    pub content_hash: String,
}
86
/// A single owned document in the export archive.
///
/// Serialised with camelCase keys.  Field order is load-bearing because the
/// enclosing [`ExportArchive`] is hashed from its serialised form.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ExportDocument {
    /// Internal document ID.
    pub id: String,
    /// Short URL slug (8 chars).  The length is not enforced by this type.
    pub slug: String,
    /// Document title / first heading; `None` serialises as JSON `null`.
    pub title: Option<String>,
    /// Lifecycle state at export time: `DRAFT | REVIEW | LOCKED | ARCHIVED`.
    /// Carried as a plain string; this type does not restrict the value.
    pub state: String,
    /// Document format: `json | text | markdown`.  Carried as a plain string.
    pub format: String,
    /// ISO 8601 creation timestamp.
    pub created_at: String,
    /// ISO 8601 last-updated timestamp, if any.
    pub updated_at: Option<String>,
    /// Current document content (UTF-8 text; decompressed inline).
    pub content: String,
    /// All version snapshots.
    pub versions: Vec<ExportVersion>,
}
110
/// A single version snapshot in the export archive.
///
/// Only metadata is carried here; the struct has no field for the snapshot's
/// content itself.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ExportVersion {
    /// Monotonically increasing version number (starts at 0).
    pub version_number: u32,
    /// SHA-256 hex of the uncompressed content at this version.  Distinct
    /// from [`ExportArchive::content_hash`], which covers the whole archive.
    pub content_hash: String,
    /// ISO 8601 creation timestamp.
    pub created_at: String,
    /// Agent / user that authored this version, if recorded.
    pub created_by: Option<String>,
    /// Human-readable changelog entry for this version, if any.
    pub changelog: Option<String>,
}
126
/// An API key entry — raw key values are never exported, only metadata.
///
/// Serialised with camelCase keys (`keyPrefix`, `keyHash`, …).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ExportApiKey {
    /// Internal API key ID.
    pub id: String,
    /// Human-readable key name (e.g. `"CI Bot"`).
    pub name: String,
    /// Display prefix visible in the dashboard (e.g. `"llmtxt_abcd1234"`).
    pub key_prefix: String,
    /// SHA-256 hex digest of the raw key.  Allows users to identify keys
    /// without the archive ever containing the key itself.
    pub key_hash: String,
    /// ISO 8601 creation timestamp.
    pub created_at: String,
    /// ISO 8601 expiry timestamp, or `null` (`None`) for no-expiry keys.
    pub expires_at: Option<String>,
    /// Whether the key was revoked at export time.
    pub revoked: bool,
}
146
/// A single audit log entry in the export archive.
///
/// IP addresses are NOT included to minimise personal data in the export;
/// the struct deliberately has no field for them.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ExportAuditEntry {
    /// Opaque audit log entry ID.
    pub id: String,
    /// Structured action name (e.g. `"document.create"`).
    pub action: String,
    /// Resource type (e.g. `"document"`).
    pub resource_type: String,
    /// Resource ID/slug, when the action targets a specific resource.
    pub resource_id: Option<String>,
    /// Unix millisecond timestamp.  Note this is numeric, unlike the ISO 8601
    /// string timestamps used by the other export types.
    pub timestamp: i64,
}
164
/// A webhook registration in the export archive.
///
/// Signing secrets are not represented: the struct has no field for them.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct ExportWebhook {
    /// Internal webhook ID.
    pub id: String,
    /// Target callback URL.
    pub url: String,
    /// Subscribed event types as a JSON-encoded array held in a string
    /// (e.g. the text `["version.created"]`).  In the archive JSON this is a
    /// string value, not a nested array, and it is not parsed by this module.
    pub events: String,
    /// Optional document slug scope (`null` = all documents).
    pub document_slug: Option<String>,
    /// Whether the webhook is active at export time.
    pub active: bool,
    /// ISO 8601 creation timestamp.
    pub created_at: String,
}
182
183// ── Serialisation ────────────────────────────────────────────────────────────
184
185/// Compute the content hash of an [`ExportArchive`].
186///
187/// The hash covers the canonical JSON bytes with `content_hash` set to `""`
188/// (empty string), so the hash field itself is not part of the payload.
189fn compute_content_hash(archive: &ExportArchive) -> String {
190    let mut scratch = archive.clone();
191    scratch.content_hash = String::new();
192    // serde_json produces deterministic output for structs (field order = declaration order).
193    #[allow(clippy::expect_used)]
194    let canonical = serde_json::to_vec(&scratch)
195        .expect("ExportArchive serialisation must not fail — no unbounded types");
196    let mut hasher = Sha256::new();
197    hasher.update(&canonical);
198    hex::encode(hasher.finalize())
199}
200
201/// Serialise an [`ExportArchive`] to a JSON string.
202///
203/// Computes and embeds the `content_hash` before serialisation, so the
204/// returned JSON string contains a valid integrity field.
205///
206/// # Panics
207///
208/// Never panics — all types in [`ExportArchive`] are serialisable.
209///
210/// # WASM
211///
212/// Exposed as `serialize_export_archive(json: String) -> String` via
213/// `wasm_bindgen`.  The input JSON is first deserialised into an
214/// [`ExportArchive`] (so the hash is recomputed from the actual data),
215/// then re-serialised with the hash embedded.
216pub fn serialize_export_archive(archive: &ExportArchive) -> String {
217    let mut stamped = archive.clone();
218    stamped.content_hash = compute_content_hash(archive);
219    #[allow(clippy::expect_used)]
220    serde_json::to_string(&stamped).expect("ExportArchive serialisation must not fail")
221}
222
223/// Deserialise an [`ExportArchive`] from a JSON string and verify its integrity.
224///
225/// Returns `Err(String)` if:
226/// - The JSON is malformed.
227/// - The `archive_version` is greater than [`ARCHIVE_VERSION`].
228/// - The `content_hash` does not match the recomputed hash.
229pub fn deserialize_export_archive(json: &str) -> Result<ExportArchive, String> {
230    let archive: ExportArchive = serde_json::from_str(json)
231        .map_err(|e| format!("deserialize_export_archive: JSON parse error: {e}"))?;
232
233    if archive.archive_version > ARCHIVE_VERSION {
234        return Err(format!(
235            "deserialize_export_archive: unsupported archive_version {}; max supported: {}",
236            archive.archive_version, ARCHIVE_VERSION
237        ));
238    }
239
240    let expected_hash = compute_content_hash(&archive);
241    if archive.content_hash != expected_hash {
242        return Err(format!(
243            "deserialize_export_archive: content_hash mismatch — \
244             expected {expected_hash}, got {}",
245            archive.content_hash
246        ));
247    }
248
249    Ok(archive)
250}
251
252// ── WASM shims ───────────────────────────────────────────────────────────────
253
254/// WASM binding for [`serialize_export_archive`].
255///
256/// Accepts a JSON string representing an [`ExportArchive`] *without*
257/// a valid `content_hash`, computes the hash, and returns the JSON
258/// string with the hash embedded.
259///
260/// Returns `{"error":"..."}` on parse failure.
261#[cfg_attr(feature = "wasm", wasm_bindgen)]
262pub fn serialize_export_archive_wasm(archive_json: &str) -> String {
263    let archive: ExportArchive = match serde_json::from_str(archive_json) {
264        Ok(a) => a,
265        Err(e) => {
266            return format!(r#"{{"error":"serialize_export_archive_wasm parse error: {e}"}}"#);
267        }
268    };
269    serialize_export_archive(&archive)
270}
271
272/// WASM binding for [`deserialize_export_archive`].
273///
274/// Returns the verified archive JSON on success, or `{"error":"..."}` on
275/// any failure (parse error, version mismatch, hash mismatch).
276#[cfg_attr(feature = "wasm", wasm_bindgen)]
277pub fn deserialize_export_archive_wasm(archive_json: &str) -> String {
278    match deserialize_export_archive(archive_json) {
279        Ok(archive) => serde_json::to_string(&archive)
280            .unwrap_or_else(|e| format!(r#"{{"error":"re-serialise failed: {e}"}}"#)),
281        Err(e) => format!(r#"{{"error":{}}}"#, serde_json::json!(e)),
282    }
283}
284
285// ── Retention policy DSL ─────────────────────────────────────────────────────
286
/// Retention policy configuration (T186).
///
/// Describes per-resource-type retention windows in days.  The meaning of `0`
/// is field-specific: for the audit-log fields it means "keep forever", while
/// for the purge-oriented fields it means "purge immediately" — see each
/// field.  The backend enforces these policies via a nightly background job.
///
/// Serialised with camelCase keys (`policyVersion`, `auditLogHotDays`, …).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct RetentionPolicy {
    /// Policy format version (incremented on incompatible schema changes).
    pub policy_version: u32,
    /// Audit log entries: days to retain in the hot database.
    /// Entries older than this are moved to cold archive (S3).
    /// 0 = keep forever in hot DB.
    pub audit_log_hot_days: u32,
    /// Audit log entries: total retention in days (hot + cold combined).
    /// 0 = keep forever.  MUST be >= `audit_log_hot_days`; note that
    /// `deserialize_retention_policy` only parses and does not check this.
    pub audit_log_total_days: u32,
    /// Soft-deleted documents: days before hard deletion.
    /// 0 = hard-delete immediately (not recommended).
    pub soft_deleted_docs_days: u32,
    /// Expired (anonymous) document TTL in days.
    /// 0 = purge immediately when `expires_at` passes.
    pub anonymous_doc_days: u32,
    /// Revoked API keys: days before hard purge of revoked rows.
    /// 0 = purge immediately.
    pub revoked_api_key_days: u32,
    /// Agent inbox messages: days before hard purge.
    /// NOTE(review): the meaning of `0` is not stated for this field — confirm.
    pub agent_inbox_days: u32,
}
316
317impl Default for RetentionPolicy {
318    fn default() -> Self {
319        Self {
320            policy_version: 1,
321            // Audit: 90 days hot, 7 years total (2555 days ≈ 7 * 365).
322            audit_log_hot_days: 90,
323            audit_log_total_days: 2555,
324            // Soft-deleted documents: 30-day grace period (T187).
325            soft_deleted_docs_days: 30,
326            // Anonymous docs: purge 1 day after expiry.
327            anonymous_doc_days: 1,
328            // Revoked API keys: keep 90 days for audit reference.
329            revoked_api_key_days: 90,
330            // Agent inbox: 2-day TTL (matches schema).
331            agent_inbox_days: 2,
332        }
333    }
334}
335
336/// Serialise a [`RetentionPolicy`] to a JSON string.
337pub fn serialize_retention_policy(policy: &RetentionPolicy) -> String {
338    #[allow(clippy::expect_used)]
339    serde_json::to_string(policy).expect("RetentionPolicy serialisation must not fail")
340}
341
342/// Deserialise a [`RetentionPolicy`] from a JSON string.
343pub fn deserialize_retention_policy(json: &str) -> Result<RetentionPolicy, String> {
344    serde_json::from_str(json).map_err(|e| format!("deserialize_retention_policy: {e}"))
345}
346
347// ── Tests ────────────────────────────────────────────────────────────────────
348
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a fully populated archive with one entry in every collection.
    /// `content_hash` is left empty; `serialize_export_archive` fills it in.
    fn sample_archive() -> ExportArchive {
        ExportArchive {
            archive_version: ARCHIVE_VERSION,
            exported_at: "2026-04-18T00:00:00Z".to_string(),
            user_id: "usr_test".to_string(),
            user_name: "Test User".to_string(),
            user_email: "test@example.com".to_string(),
            user_created_at: "2026-01-01T00:00:00Z".to_string(),
            documents: vec![ExportDocument {
                id: "doc_1".to_string(),
                slug: "abcd1234".to_string(),
                title: Some("My Document".to_string()),
                state: "DRAFT".to_string(),
                format: "markdown".to_string(),
                created_at: "2026-01-01T00:00:00Z".to_string(),
                updated_at: Some("2026-01-02T00:00:00Z".to_string()),
                content: "# My Document\n\nHello world.".to_string(),
                versions: vec![ExportVersion {
                    version_number: 0,
                    content_hash: "abc123".to_string(),
                    created_at: "2026-01-01T00:00:00Z".to_string(),
                    created_by: Some("agent_1".to_string()),
                    changelog: None,
                }],
            }],
            api_key_hashes: vec![ExportApiKey {
                id: "key_1".to_string(),
                name: "CI Bot".to_string(),
                key_prefix: "llmtxt_abc".to_string(),
                key_hash: "deadbeef".to_string(),
                created_at: "2026-01-01T00:00:00Z".to_string(),
                expires_at: None,
                revoked: false,
            }],
            audit_log: vec![ExportAuditEntry {
                id: "evt_1".to_string(),
                action: "document.create".to_string(),
                resource_type: "document".to_string(),
                resource_id: Some("abcd1234".to_string()),
                timestamp: 1_700_000_000_000,
            }],
            webhooks: vec![ExportWebhook {
                id: "wh_1".to_string(),
                url: "https://example.com/hook".to_string(),
                events: r#"["version.created"]"#.to_string(),
                document_slug: None,
                active: true,
                created_at: "2026-01-01T00:00:00Z".to_string(),
            }],
            content_hash: String::new(), // will be filled by serialize
        }
    }

    #[test]
    fn test_serialize_sets_content_hash() {
        let archive = sample_archive();
        let json = serialize_export_archive(&archive);
        let parsed: ExportArchive = serde_json::from_str(&json).unwrap();
        assert!(
            !parsed.content_hash.is_empty(),
            "content_hash should be set"
        );
        assert_eq!(parsed.content_hash.len(), 64, "SHA-256 hex is 64 chars");
    }

    #[test]
    fn test_deserialize_verifies_hash() {
        let json = serialize_export_archive(&sample_archive());
        let result = deserialize_export_archive(&json);
        assert!(
            result.is_ok(),
            "valid archive should deserialise: {:?}",
            result
        );
    }

    #[test]
    fn test_tampered_archive_rejected() {
        let json = serialize_export_archive(&sample_archive());
        // Tamper with the payload while leaving the embedded hash untouched.
        let tampered = json.replace("Hello world.", "Hello tamper.");
        assert_ne!(json, tampered, "tampering must actually change the payload");
        let result = deserialize_export_archive(&tampered);
        assert!(result.is_err(), "tampered archive must be rejected");
        assert!(
            result.unwrap_err().contains("content_hash mismatch"),
            "error should mention hash mismatch"
        );
    }

    #[test]
    fn test_tampered_hash_rejected() {
        // Keep the payload intact but swap the embedded hash for a bogus one.
        let json = serialize_export_archive(&sample_archive());
        let mut archive: ExportArchive = serde_json::from_str(&json).unwrap();
        archive.content_hash = "0".repeat(64);
        let forged = serde_json::to_string(&archive).unwrap();
        let err = deserialize_export_archive(&forged).unwrap_err();
        assert!(
            err.contains("content_hash mismatch"),
            "error should mention hash mismatch, got: {err}"
        );
    }

    #[test]
    fn test_malformed_json_rejected() {
        let err = deserialize_export_archive("{not json").unwrap_err();
        assert!(
            err.contains("JSON parse error"),
            "error should mention the parse failure, got: {err}"
        );
    }

    #[test]
    fn test_roundtrip_byte_identical() {
        let archive = sample_archive();
        let json1 = serialize_export_archive(&archive);
        // Deserialise then re-serialise — must be byte-identical.
        let recovered = deserialize_export_archive(&json1).unwrap();
        let json2 = serialize_export_archive(&recovered);
        assert_eq!(json1, json2, "round-trip must be byte-identical");
    }

    #[test]
    fn test_unsupported_version_rejected() {
        let mut archive = sample_archive();
        archive.archive_version = ARCHIVE_VERSION + 1;
        // Embed a valid hash so the version is the only possible reason for
        // rejection, regardless of the order in which the checks run.
        archive.content_hash = compute_content_hash(&archive);
        let json = serde_json::to_string(&archive).unwrap();
        let result = deserialize_export_archive(&json);
        assert!(result.is_err());
        assert!(
            result.unwrap_err().contains("unsupported archive_version"),
            "error should mention unsupported version"
        );
    }

    #[test]
    fn test_wasm_serialize_matches_native() {
        let archive = sample_archive();
        let input = serde_json::to_string(&archive).unwrap();
        assert_eq!(
            serialize_export_archive_wasm(&input),
            serialize_export_archive(&archive),
            "WASM shim must produce the same JSON as the native function"
        );
    }

    #[test]
    fn test_wasm_deserialize_error_is_valid_json() {
        // `{}` is missing every required field, so parsing must fail.
        let out = deserialize_export_archive_wasm("{}");
        let value: serde_json::Value = serde_json::from_str(&out).unwrap();
        assert!(
            value["error"].is_string(),
            "error payload must be a JSON object with a string `error`: {out}"
        );
    }

    #[test]
    fn test_retention_policy_default_roundtrip() {
        let policy = RetentionPolicy::default();
        let json = serialize_retention_policy(&policy);
        let recovered = deserialize_retention_policy(&json).unwrap();
        assert_eq!(policy, recovered);
    }

    #[test]
    fn test_retention_policy_defaults() {
        let p = RetentionPolicy::default();
        assert_eq!(p.policy_version, 1);
        assert_eq!(p.audit_log_hot_days, 90);
        assert_eq!(p.audit_log_total_days, 2555);
        assert_eq!(p.soft_deleted_docs_days, 30);
        assert_eq!(p.anonymous_doc_days, 1);
        assert_eq!(p.revoked_api_key_days, 90);
        assert_eq!(p.agent_inbox_days, 2);
        // Documented invariant: total retention covers the hot window.
        assert!(p.audit_log_total_days >= p.audit_log_hot_days);
    }

    #[test]
    fn test_retention_policy_malformed_json_rejected() {
        let err = deserialize_retention_policy("[]").unwrap_err();
        assert!(
            err.starts_with("deserialize_retention_policy: "),
            "error should carry the function prefix, got: {err}"
        );
    }
}