// infiniloom_engine/embedding/audit.rs

//! Tamper-evident audit logging with hash chain
//!
//! This module provides cryptographic audit logging for embedding operations.
//! Each log entry is linked to the previous via a hash chain, making tampering
//! detectable.
//!
//! # Features
//!
//! - **Hash chain**: Each entry's hash includes the previous entry's hash
//! - **Tamper detection**: Any modification breaks the chain
//! - **Append-only**: Logs can only be extended, not modified
//! - **Portable**: JSON-based format with optional binary storage
//!
//! # Example
//!
//! ```rust,ignore
//! use std::path::Path;
//!
//! use infiniloom_engine::embedding::{AuditLog, AuditOperation};
//!
//! // Create a new audit log
//! let mut log = AuditLog::new();
//!
//! // Record an embedding operation
//! log.record(AuditOperation::EmbedStart {
//!     repo_path: "/path/to/repo".to_string(),
//!     settings_hash: "abc123".to_string(),
//! });
//!
//! // Record completion
//! log.record(AuditOperation::EmbedComplete {
//!     chunks_count: 150,
//!     total_tokens: 75000,
//!     manifest_hash: "def456".to_string(),
//! });
//!
//! // Verify integrity
//! assert!(log.verify_integrity());
//!
//! // Save to file
//! log.save(Path::new("audit.log"))?;
//! ```

use std::path::Path;
use std::time::{SystemTime, UNIX_EPOCH};

use blake3::Hasher;
use serde::{Deserialize, Serialize};

use super::error::EmbedError;

/// A single audit log entry with hash chain link
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditEntry {
    /// Sequential entry number (0-indexed)
    pub sequence: u64,

    /// Unix timestamp (seconds since epoch)
    pub timestamp: u64,

    /// Hash of the previous entry (empty string for first entry)
    pub prev_hash: String,

    /// Hash of this entry (includes prev_hash for chain)
    pub hash: String,

    /// The operation being logged
    pub operation: AuditOperation,
}

/// Operations that can be logged
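///
/// Variants serialize as internally tagged JSON via `#[serde(tag = "type")]`.
/// A minimal sketch of the wire shape:
///
/// ```rust,ignore
/// let op = AuditOperation::EmbedStart {
///     repo_path: "/path/to/repo".to_owned(),
///     settings_hash: "abc123".to_owned(),
/// };
/// let json = serde_json::to_string(&op)?;
/// // {"type":"EmbedStart","repo_path":"/path/to/repo","settings_hash":"abc123"}
/// assert!(json.contains("\"type\":\"EmbedStart\""));
/// ```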
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AuditOperation {
    /// Log file created
    LogCreated { version: u32, created_by: String },

    /// Embedding operation started
    EmbedStart { repo_path: String, settings_hash: String },

    /// Embedding operation completed successfully
    EmbedComplete { chunks_count: usize, total_tokens: u64, manifest_hash: String },

    /// Embedding operation failed
    EmbedFailed { error_code: String, error_message: String },

    /// Manifest loaded from disk
    ManifestLoaded { path: String, manifest_hash: String, chunks_count: usize },

    /// Manifest saved to disk
    ManifestSaved { path: String, manifest_hash: String },

    /// Diff computed between manifest and current state
    DiffComputed { added: usize, modified: usize, removed: usize },

    /// Batch embedding started
    BatchStart { repo_count: usize, total_settings_hash: String },

    /// Single repo in batch completed
    BatchRepoComplete { repo_index: usize, repo_path: String, chunks_count: usize, success: bool },

    /// Batch embedding completed
    BatchComplete { successful: usize, failed: usize, total_chunks: usize },

    /// Security scan performed
    SecurityScan { findings_count: usize, secrets_redacted: bool },

    /// Checkpoint created for resume
    CheckpointCreated { checkpoint_hash: String, files_processed: usize, chunks_generated: usize },

    /// Resume from checkpoint
    ResumeFromCheckpoint { checkpoint_hash: String, files_remaining: usize },

    /// Custom user-defined operation
    Custom { name: String, data: String },
}

/// Current audit log format version
pub const AUDIT_LOG_VERSION: u32 = 1;

/// Tamper-evident audit log with hash chain
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditLog {
    /// Log format version
    pub version: u32,

    /// All entries in chronological order
    pub entries: Vec<AuditEntry>,
}

impl Default for AuditLog {
    fn default() -> Self {
        Self::new()
    }
}

impl AuditLog {
    /// Create a new audit log seeded with an initial `LogCreated` entry
    pub fn new() -> Self {
        let mut log = Self { version: AUDIT_LOG_VERSION, entries: Vec::new() };

        // Add initial entry
        log.record(AuditOperation::LogCreated {
            version: AUDIT_LOG_VERSION,
            created_by: format!("infiniloom-engine/{}", env!("CARGO_PKG_VERSION")),
        });

        log
    }

    /// Record a new operation in the audit log
    ///
    /// Returns the hash of the new entry.
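    ///
    /// # Example
    ///
    /// A minimal sketch of chaining two records:
    ///
    /// ```rust,ignore
    /// let mut log = AuditLog::new();
    /// let first = log.record(AuditOperation::Custom {
    ///     name: "step".to_owned(),
    ///     data: "1".to_owned(),
    /// });
    /// // The returned hash becomes the next entry's `prev_hash`.
    /// log.record(AuditOperation::Custom {
    ///     name: "step".to_owned(),
    ///     data: "2".to_owned(),
    /// });
    /// assert_eq!(log.entries.last().unwrap().prev_hash, first);
    /// ```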
    pub fn record(&mut self, operation: AuditOperation) -> String {
        let sequence = self.entries.len() as u64;
        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();

        let prev_hash = self
            .entries
            .last()
            .map(|e| e.hash.clone())
            .unwrap_or_default();

        // Compute hash of this entry
        let hash = compute_entry_hash(sequence, timestamp, &prev_hash, &operation);

        let entry = AuditEntry { sequence, timestamp, prev_hash, hash: hash.clone(), operation };

        self.entries.push(entry);
        hash
    }

    /// Verify the integrity of the entire hash chain
    ///
    /// Returns `true` if all hashes are valid and the chain is intact.
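    ///
    /// # Example
    ///
    /// A sketch of tamper detection (mirrors the unit tests below):
    ///
    /// ```rust,ignore
    /// let mut log = AuditLog::new();
    /// log.record(AuditOperation::Custom { name: "op".to_owned(), data: "x".to_owned() });
    /// assert!(log.verify_integrity());
    ///
    /// // Any in-place edit invalidates the recomputed hash.
    /// log.entries[1].timestamp += 1;
    /// assert!(!log.verify_integrity());
    /// ```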
    pub fn verify_integrity(&self) -> bool {
        let mut prev_hash = String::new();

        for entry in &self.entries {
            // Check prev_hash link
            if entry.prev_hash != prev_hash {
                return false;
            }

            // Recompute and verify hash
            let expected_hash =
                compute_entry_hash(entry.sequence, entry.timestamp, &prev_hash, &entry.operation);

            if entry.hash != expected_hash {
                return false;
            }

            prev_hash = entry.hash.clone();
        }

        true
    }

    /// Verify integrity and return detailed results
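    ///
    /// A sketch of acting on the report (`IntegrityError` implements
    /// `Display`, so errors can be printed directly):
    ///
    /// ```rust,ignore
    /// let report = log.verify_integrity_detailed();
    /// if !report.is_valid {
    ///     for err in &report.errors {
    ///         eprintln!("audit log corrupted: {}", err);
    ///     }
    /// }
    /// ```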
    pub fn verify_integrity_detailed(&self) -> IntegrityReport {
        let mut errors = Vec::new();
        let mut prev_hash = String::new();

        for (index, entry) in self.entries.iter().enumerate() {
            // Check prev_hash link
            if entry.prev_hash != prev_hash {
                errors.push(IntegrityError::ChainBroken {
                    entry_index: index,
                    expected_prev: prev_hash.clone(),
                    actual_prev: entry.prev_hash.clone(),
                });
            }

            // Recompute and verify hash
            let expected_hash =
                compute_entry_hash(entry.sequence, entry.timestamp, &prev_hash, &entry.operation);

            if entry.hash != expected_hash {
                errors.push(IntegrityError::HashMismatch {
                    entry_index: index,
                    expected: expected_hash,
                    actual: entry.hash.clone(),
                });
            }

            prev_hash = entry.hash.clone();
        }

        IntegrityReport { is_valid: errors.is_empty(), entries_checked: self.entries.len(), errors }
    }

    /// Get the number of entries in the log
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Check if the log is empty (contains at most the initial `LogCreated` entry)
    pub fn is_empty(&self) -> bool {
        self.entries.len() <= 1
    }

    /// Get the hash of the latest entry (chain head)
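    ///
    /// A sketch of recording the chain head out of band (storing it
    /// externally is one way to detect truncation of the log's tail):
    ///
    /// ```rust,ignore
    /// if let Some(head) = log.head_hash() {
    ///     println!("audit chain head: {}", head);
    /// }
    /// ```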
    pub fn head_hash(&self) -> Option<&str> {
        self.entries.last().map(|e| e.hash.as_str())
    }

    /// Get entries whose operation matches a predicate
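    ///
    /// A sketch using a `matches!` predicate, as in the tests below:
    ///
    /// ```rust,ignore
    /// let failures =
    ///     log.filter_by_type(|op| matches!(op, AuditOperation::EmbedFailed { .. }));
    /// ```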
    pub fn filter_by_type<F>(&self, predicate: F) -> Vec<&AuditEntry>
    where
        F: Fn(&AuditOperation) -> bool,
    {
        self.entries
            .iter()
            .filter(|e| predicate(&e.operation))
            .collect()
    }

    /// Get entries in a time range (inclusive)
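    ///
    /// A sketch selecting roughly the last hour (assumes the system clock is
    /// past the Unix epoch):
    ///
    /// ```rust,ignore
    /// let now = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs();
    /// let recent = log.filter_by_time(now.saturating_sub(3600), now);
    /// ```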
    pub fn filter_by_time(&self, start: u64, end: u64) -> Vec<&AuditEntry> {
        self.entries
            .iter()
            .filter(|e| e.timestamp >= start && e.timestamp <= end)
            .collect()
    }

    /// Save the audit log to a file (JSON format)
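    ///
    /// A sketch (the `audit.json` path is illustrative):
    ///
    /// ```rust,ignore
    /// log.save(Path::new("audit.json"))?;
    /// ```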
    pub fn save(&self, path: &Path) -> Result<(), EmbedError> {
        let json =
            serde_json::to_string_pretty(self).map_err(|e| EmbedError::SerializationError {
                reason: format!("Failed to serialize audit log: {}", e),
            })?;

        std::fs::write(path, json)
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        Ok(())
    }

    /// Save as newline-delimited JSON (each entry on one line)
    ///
    /// This format is more suitable for streaming and appending.
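    ///
    /// A sketch; the output starts with a header object followed by one JSON
    /// entry per line:
    ///
    /// ```rust,ignore
    /// log.save_jsonl(Path::new("audit.jsonl"))?;
    /// ```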
    pub fn save_jsonl(&self, path: &Path) -> Result<(), EmbedError> {
        use std::io::Write;

        let file = std::fs::File::create(path)
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        let mut writer = std::io::BufWriter::new(file);

        // Write header line with version
        let header = serde_json::json!({
            "audit_log_version": self.version,
            "entry_count": self.entries.len()
        });
        writeln!(writer, "{}", header)
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        // Write each entry
        for entry in &self.entries {
            let line =
                serde_json::to_string(entry).map_err(|e| EmbedError::SerializationError {
                    reason: format!("Failed to serialize audit entry: {}", e),
                })?;
            writeln!(writer, "{}", line)
                .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
        }

        writer
            .flush()
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        Ok(())
    }

    /// Load an audit log from a file (JSON format)
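    ///
    /// Integrity is verified on load, so a tampered file is rejected. A
    /// minimal round-trip sketch:
    ///
    /// ```rust,ignore
    /// log.save(Path::new("audit.json"))?;
    /// let loaded = AuditLog::load(Path::new("audit.json"))?;
    /// assert_eq!(loaded.head_hash(), log.head_hash());
    /// ```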
    pub fn load(path: &Path) -> Result<Self, EmbedError> {
        let content = std::fs::read_to_string(path)
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        let log: Self =
            serde_json::from_str(&content).map_err(|e| EmbedError::DeserializationError {
                reason: format!("Failed to deserialize audit log: {}", e),
            })?;

        // Verify integrity on load
        if !log.verify_integrity() {
            return Err(EmbedError::ManifestCorrupted {
                path: path.to_path_buf(),
                expected: "valid hash chain".to_owned(),
                actual: "hash chain broken".to_owned(),
            });
        }

        Ok(log)
    }

    /// Append a single entry to an existing JSONL file
    ///
    /// This is more efficient than rewriting the entire file.
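    ///
    /// A sketch appending the entry just recorded (assumes the JSONL file was
    /// initialized earlier, e.g. via [`save_jsonl`](Self::save_jsonl)):
    ///
    /// ```rust,ignore
    /// log.record(AuditOperation::Custom { name: "op".to_owned(), data: "x".to_owned() });
    /// let entry = log.entries.last().unwrap();
    /// AuditLog::append_entry_to_file(Path::new("audit.jsonl"), entry)?;
    /// ```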
    pub fn append_entry_to_file(path: &Path, entry: &AuditEntry) -> Result<(), EmbedError> {
        use std::io::Write;

        let file = std::fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open(path)
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        let mut writer = std::io::BufWriter::new(file);
        let line = serde_json::to_string(entry).map_err(|e| EmbedError::SerializationError {
            reason: format!("Failed to serialize audit entry: {}", e),
        })?;
        writeln!(writer, "{}", line)
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        writer
            .flush()
            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;

        Ok(())
    }
}

/// Compute the hash of an entry for the chain
fn compute_entry_hash(
    sequence: u64,
    timestamp: u64,
    prev_hash: &str,
    operation: &AuditOperation,
) -> String {
    let mut hasher = Hasher::new();

    // Include sequence number
    hasher.update(&sequence.to_le_bytes());

    // Include timestamp
    hasher.update(&timestamp.to_le_bytes());

    // Include previous hash (chain link)
    hasher.update(prev_hash.as_bytes());

    // Include operation as JSON; these variants hold only plain scalar and
    // string fields, so serialization cannot fail and the fallback is defensive
    let op_json = serde_json::to_string(operation).unwrap_or_default();
    hasher.update(op_json.as_bytes());

    // Return hex-encoded hash
    hasher.finalize().to_hex().to_string()
}

/// Result of integrity verification
#[derive(Debug, Clone)]
pub struct IntegrityReport {
    /// Whether the entire log is valid
    pub is_valid: bool,

    /// Number of entries checked
    pub entries_checked: usize,

    /// List of errors found
    pub errors: Vec<IntegrityError>,
}

/// Types of integrity errors
#[derive(Debug, Clone)]
pub enum IntegrityError {
    /// Hash chain is broken (prev_hash doesn't match)
    ChainBroken { entry_index: usize, expected_prev: String, actual_prev: String },

    /// Entry hash doesn't match computed value
    HashMismatch { entry_index: usize, expected: String, actual: String },
}

impl std::fmt::Display for IntegrityError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ChainBroken { entry_index, expected_prev, actual_prev } => {
                write!(
                    f,
                    "Chain broken at entry {}: expected prev_hash '{}', got '{}'",
                    entry_index,
                    &expected_prev[..8.min(expected_prev.len())],
                    &actual_prev[..8.min(actual_prev.len())]
                )
            },
            Self::HashMismatch { entry_index, expected, actual } => {
                write!(
                    f,
                    "Hash mismatch at entry {}: expected '{}', got '{}'",
                    entry_index,
                    &expected[..8.min(expected.len())],
                    &actual[..8.min(actual.len())]
                )
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new_audit_log() {
        let log = AuditLog::new();
        assert_eq!(log.version, AUDIT_LOG_VERSION);
        assert_eq!(log.entries.len(), 1); // Initial LogCreated entry
        assert!(log.verify_integrity());
    }

    #[test]
    fn test_record_operations() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::EmbedStart {
            repo_path: "/test/repo".to_owned(),
            settings_hash: "abc123".to_owned(),
        });

        log.record(AuditOperation::EmbedComplete {
            chunks_count: 100,
            total_tokens: 50000,
            manifest_hash: "def456".to_owned(),
        });

        assert_eq!(log.entries.len(), 3);
        assert!(log.verify_integrity());
    }

    #[test]
    fn test_hash_chain_integrity() {
        let mut log = AuditLog::new();

        for i in 0..10 {
            log.record(AuditOperation::Custom {
                name: format!("test_{}", i),
                data: format!("data_{}", i),
            });
        }

        assert!(log.verify_integrity());

        // Tamper with an entry's data
        if let AuditOperation::Custom { ref mut data, .. } = log.entries[5].operation {
            *data = "tampered".to_owned();
        }

        // Integrity should now fail
        assert!(!log.verify_integrity());
    }

    #[test]
    fn test_verify_integrity_detailed() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::EmbedStart {
            repo_path: "/test".to_owned(),
            settings_hash: "hash".to_owned(),
        });

        let report = log.verify_integrity_detailed();
        assert!(report.is_valid);
        assert_eq!(report.entries_checked, 2);
        assert!(report.errors.is_empty());
    }

    #[test]
    fn test_tamper_detection_chain_broken() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::Custom { name: "op1".to_owned(), data: "data1".to_owned() });
        log.record(AuditOperation::Custom { name: "op2".to_owned(), data: "data2".to_owned() });

        // Break the chain by modifying prev_hash
        log.entries[2].prev_hash = "fake_hash".to_owned();

        let report = log.verify_integrity_detailed();
        assert!(!report.is_valid);
        assert!(!report.errors.is_empty());
        assert!(matches!(report.errors[0], IntegrityError::ChainBroken { .. }));
    }

    #[test]
    fn test_tamper_detection_hash_mismatch() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::Custom { name: "op1".to_owned(), data: "data1".to_owned() });

        // Modify the entry's own hash
        log.entries[1].hash = "fake_hash".to_owned();

        let report = log.verify_integrity_detailed();
        assert!(!report.is_valid);
        assert!(report
            .errors
            .iter()
            .any(|e| matches!(e, IntegrityError::HashMismatch { .. })));
    }

    #[test]
    fn test_head_hash() {
        let mut log = AuditLog::new();

        let initial_head = log.head_hash().map(String::from);
        assert!(initial_head.is_some());

        let new_hash =
            log.record(AuditOperation::Custom { name: "test".to_owned(), data: "data".to_owned() });

        assert_eq!(log.head_hash(), Some(new_hash.as_str()));
        assert_ne!(log.head_hash().map(String::from), initial_head);
    }

    #[test]
    fn test_filter_by_type() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::EmbedStart {
            repo_path: "/repo1".to_owned(),
            settings_hash: "h1".to_owned(),
        });
        log.record(AuditOperation::EmbedComplete {
            chunks_count: 100,
            total_tokens: 50000,
            manifest_hash: "m1".to_owned(),
        });
        log.record(AuditOperation::EmbedStart {
            repo_path: "/repo2".to_owned(),
            settings_hash: "h2".to_owned(),
        });

        let starts = log.filter_by_type(|op| matches!(op, AuditOperation::EmbedStart { .. }));
        assert_eq!(starts.len(), 2);

        let completes = log.filter_by_type(|op| matches!(op, AuditOperation::EmbedComplete { .. }));
        assert_eq!(completes.len(), 1);
    }

    #[test]
    fn test_filter_by_time() {
        let mut log = AuditLog::new();

        // All entries will have the same timestamp (or very close)
        log.record(AuditOperation::Custom { name: "test".to_owned(), data: "data".to_owned() });

        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs();

        let entries = log.filter_by_time(now - 60, now + 60);
        assert!(!entries.is_empty());
    }

    #[test]
    fn test_save_and_load() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let log_path = temp_dir.path().join("audit.json");

        let mut log = AuditLog::new();
        log.record(AuditOperation::EmbedStart {
            repo_path: "/test/repo".to_owned(),
            settings_hash: "abc123".to_owned(),
        });
        log.record(AuditOperation::EmbedComplete {
            chunks_count: 100,
            total_tokens: 50000,
            manifest_hash: "def456".to_owned(),
        });

        // Save
        log.save(&log_path).unwrap();

        // Load and verify
        let loaded = AuditLog::load(&log_path).unwrap();
        assert_eq!(loaded.entries.len(), log.entries.len());
        assert!(loaded.verify_integrity());

        // Compare hashes
        for (orig, loaded) in log.entries.iter().zip(loaded.entries.iter()) {
            assert_eq!(orig.hash, loaded.hash);
            assert_eq!(orig.prev_hash, loaded.prev_hash);
        }
    }

    #[test]
    fn test_save_jsonl() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let log_path = temp_dir.path().join("audit.jsonl");

        let mut log = AuditLog::new();
        log.record(AuditOperation::Custom { name: "test".to_owned(), data: "data".to_owned() });

        log.save_jsonl(&log_path).unwrap();

        // Verify file exists and has content
        let content = std::fs::read_to_string(&log_path).unwrap();
        assert!(!content.is_empty());

        // Count lines (header + entries)
        let lines: Vec<_> = content.lines().collect();
        assert_eq!(lines.len(), 3); // header + 2 entries (LogCreated + Custom)
    }

    #[test]
    fn test_security_scan_operation() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::SecurityScan { findings_count: 5, secrets_redacted: true });

        assert!(log.verify_integrity());

        let scans = log.filter_by_type(|op| matches!(op, AuditOperation::SecurityScan { .. }));
        assert_eq!(scans.len(), 1);

        if let AuditOperation::SecurityScan { findings_count, secrets_redacted } =
            &scans[0].operation
        {
            assert_eq!(*findings_count, 5);
            assert!(*secrets_redacted);
        }
    }

    #[test]
    fn test_batch_operations() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::BatchStart {
            repo_count: 3,
            total_settings_hash: "settings_hash".to_owned(),
        });

        for i in 0..3 {
            log.record(AuditOperation::BatchRepoComplete {
                repo_index: i,
                repo_path: format!("/repo{}", i),
                chunks_count: 100 * (i + 1),
                success: true,
            });
        }

        log.record(AuditOperation::BatchComplete { successful: 3, failed: 0, total_chunks: 600 });

        assert!(log.verify_integrity());
        assert_eq!(log.entries.len(), 6); // LogCreated + BatchStart + 3 repos + BatchComplete
    }

    #[test]
    fn test_checkpoint_operations() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::CheckpointCreated {
            checkpoint_hash: "ckpt_abc123".to_owned(),
            files_processed: 50,
            chunks_generated: 200,
        });

        log.record(AuditOperation::ResumeFromCheckpoint {
            checkpoint_hash: "ckpt_abc123".to_owned(),
            files_remaining: 100,
        });

        assert!(log.verify_integrity());
    }
}