infiniloom_engine/embedding/
audit.rs

1//! Tamper-evident audit logging with hash chain
2//!
3//! This module provides cryptographic audit logging for embedding operations.
4//! Each log entry is linked to the previous via a hash chain, making tampering
5//! detectable.
6//!
7//! # Features
8//!
9//! - **Hash chain**: Each entry's hash includes the previous entry's hash
10//! - **Tamper detection**: Any modification breaks the chain
11//! - **Append-only**: Logs can only be extended, not modified
12//! - **Portable**: JSON-based format with optional binary storage
13//!
14//! # Example
15//!
16//! ```rust,ignore
17//! use infiniloom_engine::embedding::{AuditLog, AuditOperation};
18//!
19//! // Create a new audit log
20//! let mut log = AuditLog::new();
21//!
22//! // Record an embedding operation
23//! log.record(AuditOperation::EmbedStart {
24//!     repo_path: "/path/to/repo".to_string(),
25//!     settings_hash: "abc123".to_string(),
26//! });
27//!
28//! // Record completion
29//! log.record(AuditOperation::EmbedComplete {
30//!     chunks_count: 150,
31//!     total_tokens: 75000,
32//!     manifest_hash: "def456".to_string(),
33//! });
34//!
35//! // Verify integrity
36//! assert!(log.verify_integrity());
37//!
38//! // Save to file
39//! log.save(Path::new("audit.log"))?;
40//! ```
41
42use std::path::Path;
43use std::time::{SystemTime, UNIX_EPOCH};
44
45use blake3::Hasher;
46use serde::{Deserialize, Serialize};
47
48use super::error::EmbedError;
49
/// A single audit log entry with hash chain link
///
/// Entries are effectively immutable once recorded: `hash` commits to all
/// other fields (see `compute_entry_hash`), and `prev_hash` links the entry
/// to its predecessor, so any later modification is detectable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditEntry {
    /// Sequential entry number (0-indexed; equals the entry's position in
    /// `AuditLog::entries` at the time it was recorded)
    pub sequence: u64,

    /// Unix timestamp (seconds since epoch; 0 if the system clock reads
    /// before `UNIX_EPOCH`)
    pub timestamp: u64,

    /// Hex-encoded hash of the previous entry (empty string for first entry)
    pub prev_hash: String,

    /// Hex-encoded BLAKE3 hash of this entry (includes prev_hash for chain)
    pub hash: String,

    /// The operation being logged
    pub operation: AuditOperation,
}
68
/// Operations that can be logged
///
/// Serialized as an internally tagged enum: the JSON representation carries
/// the variant name under a `"type"` key alongside the variant's fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AuditOperation {
    /// Log file created (the automatic first entry of every log)
    LogCreated {
        version: u32,
        created_by: String,
    },

    /// Embedding operation started
    EmbedStart {
        repo_path: String,
        settings_hash: String,
    },

    /// Embedding operation completed successfully
    EmbedComplete {
        chunks_count: usize,
        total_tokens: u64,
        manifest_hash: String,
    },

    /// Embedding operation failed
    EmbedFailed {
        error_code: String,
        error_message: String,
    },

    /// Manifest loaded from disk
    ManifestLoaded {
        path: String,
        manifest_hash: String,
        chunks_count: usize,
    },

    /// Manifest saved to disk
    ManifestSaved {
        path: String,
        manifest_hash: String,
    },

    /// Diff computed between manifest and current state
    DiffComputed {
        added: usize,
        modified: usize,
        removed: usize,
    },

    /// Batch embedding started
    BatchStart {
        repo_count: usize,
        total_settings_hash: String,
    },

    /// Single repo in batch completed
    BatchRepoComplete {
        repo_index: usize,
        repo_path: String,
        chunks_count: usize,
        success: bool,
    },

    /// Batch embedding completed
    BatchComplete {
        successful: usize,
        failed: usize,
        total_chunks: usize,
    },

    /// Security scan performed
    SecurityScan {
        findings_count: usize,
        secrets_redacted: bool,
    },

    /// Checkpoint created for resume
    CheckpointCreated {
        checkpoint_hash: String,
        files_processed: usize,
        chunks_generated: usize,
    },

    /// Resume from checkpoint
    ResumeFromCheckpoint {
        checkpoint_hash: String,
        files_remaining: usize,
    },

    /// Custom user-defined operation
    Custom {
        name: String,
        data: String,
    },
}
164
/// Current audit log format version
///
/// Stored in `AuditLog::version` and recorded in the initial `LogCreated`
/// entry; bump this when the on-disk format changes.
pub const AUDIT_LOG_VERSION: u32 = 1;
167
/// Tamper-evident audit log with hash chain
///
/// Invariant (checked by [`AuditLog::verify_integrity`]): entries are in
/// chronological order and each entry's `prev_hash` equals the previous
/// entry's `hash`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditLog {
    /// Log format version
    pub version: u32,

    /// All entries in chronological order
    pub entries: Vec<AuditEntry>,
}
177
impl Default for AuditLog {
    /// Equivalent to [`AuditLog::new`]: the default log already contains the
    /// automatic `LogCreated` entry.
    fn default() -> Self {
        Self::new()
    }
}
183
184impl AuditLog {
185    /// Create a new empty audit log
186    pub fn new() -> Self {
187        let mut log = Self {
188            version: AUDIT_LOG_VERSION,
189            entries: Vec::new(),
190        };
191
192        // Add initial entry
193        log.record(AuditOperation::LogCreated {
194            version: AUDIT_LOG_VERSION,
195            created_by: format!("infiniloom-engine/{}", env!("CARGO_PKG_VERSION")),
196        });
197
198        log
199    }
200
201    /// Record a new operation in the audit log
202    ///
203    /// Returns the hash of the new entry.
204    pub fn record(&mut self, operation: AuditOperation) -> String {
205        let sequence = self.entries.len() as u64;
206        let timestamp = SystemTime::now()
207            .duration_since(UNIX_EPOCH)
208            .unwrap_or_default()
209            .as_secs();
210
211        let prev_hash = self
212            .entries
213            .last()
214            .map(|e| e.hash.clone())
215            .unwrap_or_default();
216
217        // Compute hash of this entry
218        let hash = compute_entry_hash(sequence, timestamp, &prev_hash, &operation);
219
220        let entry = AuditEntry {
221            sequence,
222            timestamp,
223            prev_hash,
224            hash: hash.clone(),
225            operation,
226        };
227
228        self.entries.push(entry);
229        hash
230    }
231
232    /// Verify the integrity of the entire hash chain
233    ///
234    /// Returns `true` if all hashes are valid and the chain is intact.
235    pub fn verify_integrity(&self) -> bool {
236        let mut prev_hash = String::new();
237
238        for entry in &self.entries {
239            // Check prev_hash link
240            if entry.prev_hash != prev_hash {
241                return false;
242            }
243
244            // Recompute and verify hash
245            let expected_hash =
246                compute_entry_hash(entry.sequence, entry.timestamp, &prev_hash, &entry.operation);
247
248            if entry.hash != expected_hash {
249                return false;
250            }
251
252            prev_hash = entry.hash.clone();
253        }
254
255        true
256    }
257
258    /// Verify integrity and return detailed results
259    pub fn verify_integrity_detailed(&self) -> IntegrityReport {
260        let mut errors = Vec::new();
261        let mut prev_hash = String::new();
262
263        for (index, entry) in self.entries.iter().enumerate() {
264            // Check prev_hash link
265            if entry.prev_hash != prev_hash {
266                errors.push(IntegrityError::ChainBroken {
267                    entry_index: index,
268                    expected_prev: prev_hash.clone(),
269                    actual_prev: entry.prev_hash.clone(),
270                });
271            }
272
273            // Recompute and verify hash
274            let expected_hash =
275                compute_entry_hash(entry.sequence, entry.timestamp, &prev_hash, &entry.operation);
276
277            if entry.hash != expected_hash {
278                errors.push(IntegrityError::HashMismatch {
279                    entry_index: index,
280                    expected: expected_hash,
281                    actual: entry.hash.clone(),
282                });
283            }
284
285            prev_hash = entry.hash.clone();
286        }
287
288        IntegrityReport {
289            is_valid: errors.is_empty(),
290            entries_checked: self.entries.len(),
291            errors,
292        }
293    }
294
295    /// Get the number of entries in the log
296    pub fn len(&self) -> usize {
297        self.entries.len()
298    }
299
300    /// Check if the log is empty (has only the initial entry)
301    pub fn is_empty(&self) -> bool {
302        self.entries.len() <= 1
303    }
304
305    /// Get the hash of the latest entry (chain head)
306    pub fn head_hash(&self) -> Option<&str> {
307        self.entries.last().map(|e| e.hash.as_str())
308    }
309
310    /// Get entries filtered by operation type
311    pub fn filter_by_type<F>(&self, predicate: F) -> Vec<&AuditEntry>
312    where
313        F: Fn(&AuditOperation) -> bool,
314    {
315        self.entries
316            .iter()
317            .filter(|e| predicate(&e.operation))
318            .collect()
319    }
320
321    /// Get entries in a time range (inclusive)
322    pub fn filter_by_time(&self, start: u64, end: u64) -> Vec<&AuditEntry> {
323        self.entries
324            .iter()
325            .filter(|e| e.timestamp >= start && e.timestamp <= end)
326            .collect()
327    }
328
329    /// Save the audit log to a file (JSON format)
330    pub fn save(&self, path: &Path) -> Result<(), EmbedError> {
331        let json = serde_json::to_string_pretty(self).map_err(|e| EmbedError::SerializationError {
332            reason: format!("Failed to serialize audit log: {}", e),
333        })?;
334
335        std::fs::write(path, json).map_err(|e| EmbedError::IoError {
336            path: path.to_path_buf(),
337            source: e,
338        })?;
339
340        Ok(())
341    }
342
343    /// Save as newline-delimited JSON (each entry on one line)
344    ///
345    /// This format is more suitable for streaming and appending.
346    pub fn save_jsonl(&self, path: &Path) -> Result<(), EmbedError> {
347        use std::io::Write;
348
349        let file = std::fs::File::create(path).map_err(|e| EmbedError::IoError {
350            path: path.to_path_buf(),
351            source: e,
352        })?;
353
354        let mut writer = std::io::BufWriter::new(file);
355
356        // Write header line with version
357        let header = serde_json::json!({
358            "audit_log_version": self.version,
359            "entry_count": self.entries.len()
360        });
361        writeln!(writer, "{}", header).map_err(|e| EmbedError::IoError {
362            path: path.to_path_buf(),
363            source: e,
364        })?;
365
366        // Write each entry
367        for entry in &self.entries {
368            let line =
369                serde_json::to_string(entry).map_err(|e| EmbedError::SerializationError {
370                    reason: format!("Failed to serialize audit entry: {}", e),
371                })?;
372            writeln!(writer, "{}", line).map_err(|e| EmbedError::IoError {
373                path: path.to_path_buf(),
374                source: e,
375            })?;
376        }
377
378        writer.flush().map_err(|e| EmbedError::IoError {
379            path: path.to_path_buf(),
380            source: e,
381        })?;
382
383        Ok(())
384    }
385
386    /// Load an audit log from a file (JSON format)
387    pub fn load(path: &Path) -> Result<Self, EmbedError> {
388        let content = std::fs::read_to_string(path).map_err(|e| EmbedError::IoError {
389            path: path.to_path_buf(),
390            source: e,
391        })?;
392
393        let log: Self =
394            serde_json::from_str(&content).map_err(|e| EmbedError::DeserializationError {
395                reason: format!("Failed to deserialize audit log: {}", e),
396            })?;
397
398        // Verify integrity on load
399        if !log.verify_integrity() {
400            return Err(EmbedError::ManifestCorrupted {
401                path: path.to_path_buf(),
402                expected: "valid hash chain".to_string(),
403                actual: "hash chain broken".to_string(),
404            });
405        }
406
407        Ok(log)
408    }
409
410    /// Append a single entry to an existing JSONL file
411    ///
412    /// This is more efficient than rewriting the entire file.
413    pub fn append_entry_to_file(path: &Path, entry: &AuditEntry) -> Result<(), EmbedError> {
414        use std::io::Write;
415
416        let file = std::fs::OpenOptions::new()
417            .create(true)
418            .append(true)
419            .open(path)
420            .map_err(|e| EmbedError::IoError {
421                path: path.to_path_buf(),
422                source: e,
423            })?;
424
425        let mut writer = std::io::BufWriter::new(file);
426        let line = serde_json::to_string(entry).map_err(|e| EmbedError::SerializationError {
427            reason: format!("Failed to serialize audit entry: {}", e),
428        })?;
429        writeln!(writer, "{}", line).map_err(|e| EmbedError::IoError {
430            path: path.to_path_buf(),
431            source: e,
432        })?;
433
434        writer.flush().map_err(|e| EmbedError::IoError {
435            path: path.to_path_buf(),
436            source: e,
437        })?;
438
439        Ok(())
440    }
441}
442
443/// Compute the hash of an entry for the chain
444fn compute_entry_hash(
445    sequence: u64,
446    timestamp: u64,
447    prev_hash: &str,
448    operation: &AuditOperation,
449) -> String {
450    let mut hasher = Hasher::new();
451
452    // Include sequence number
453    hasher.update(&sequence.to_le_bytes());
454
455    // Include timestamp
456    hasher.update(&timestamp.to_le_bytes());
457
458    // Include previous hash (chain link)
459    hasher.update(prev_hash.as_bytes());
460
461    // Include operation as JSON
462    let op_json = serde_json::to_string(operation).unwrap_or_default();
463    hasher.update(op_json.as_bytes());
464
465    // Return hex-encoded hash
466    hasher.finalize().to_hex().to_string()
467}
468
/// Result of integrity verification
///
/// Produced by `AuditLog::verify_integrity_detailed`; `is_valid` is true
/// exactly when `errors` is empty.
#[derive(Debug, Clone)]
pub struct IntegrityReport {
    /// Whether the entire log is valid
    pub is_valid: bool,

    /// Number of entries checked
    pub entries_checked: usize,

    /// List of errors found
    pub errors: Vec<IntegrityError>,
}
481
/// Types of integrity errors
///
/// Each variant carries the 0-based index of the offending entry plus the
/// expected/actual hash values involved.
#[derive(Debug, Clone)]
pub enum IntegrityError {
    /// Hash chain is broken (prev_hash doesn't match)
    ChainBroken {
        entry_index: usize,
        expected_prev: String,
        actual_prev: String,
    },

    /// Entry hash doesn't match computed value
    HashMismatch {
        entry_index: usize,
        expected: String,
        actual: String,
    },
}
499
500impl std::fmt::Display for IntegrityError {
501    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
502        match self {
503            Self::ChainBroken {
504                entry_index,
505                expected_prev,
506                actual_prev,
507            } => {
508                write!(
509                    f,
510                    "Chain broken at entry {}: expected prev_hash '{}', got '{}'",
511                    entry_index,
512                    &expected_prev[..8.min(expected_prev.len())],
513                    &actual_prev[..8.min(actual_prev.len())]
514                )
515            }
516            Self::HashMismatch {
517                entry_index,
518                expected,
519                actual,
520            } => {
521                write!(
522                    f,
523                    "Hash mismatch at entry {}: expected '{}', got '{}'",
524                    entry_index,
525                    &expected[..8.min(expected.len())],
526                    &actual[..8.min(actual.len())]
527                )
528            }
529        }
530    }
531}
532
// Unit tests: cover chain construction, tamper detection (both broken links
// and mismatched hashes), filtering, and round-tripping through JSON/JSONL.
#[cfg(test)]
mod tests {
    use super::*;

    // A fresh log must contain exactly the automatic LogCreated entry and
    // already form a valid one-link chain.
    #[test]
    fn test_new_audit_log() {
        let log = AuditLog::new();
        assert_eq!(log.version, AUDIT_LOG_VERSION);
        assert_eq!(log.entries.len(), 1); // Initial LogCreated entry
        assert!(log.verify_integrity());
    }

    // Recording operations extends the chain and keeps it valid.
    #[test]
    fn test_record_operations() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::EmbedStart {
            repo_path: "/test/repo".to_string(),
            settings_hash: "abc123".to_string(),
        });

        log.record(AuditOperation::EmbedComplete {
            chunks_count: 100,
            total_tokens: 50000,
            manifest_hash: "def456".to_string(),
        });

        assert_eq!(log.entries.len(), 3);
        assert!(log.verify_integrity());
    }

    // Mutating any entry's payload must invalidate the whole chain.
    #[test]
    fn test_hash_chain_integrity() {
        let mut log = AuditLog::new();

        for i in 0..10 {
            log.record(AuditOperation::Custom {
                name: format!("test_{}", i),
                data: format!("data_{}", i),
            });
        }

        assert!(log.verify_integrity());

        // Tamper with an entry's data
        if let AuditOperation::Custom { ref mut data, .. } = log.entries[5].operation {
            *data = "tampered".to_string();
        }

        // Integrity should now fail
        assert!(!log.verify_integrity());
    }

    // Happy path for the detailed report: valid, all entries checked.
    #[test]
    fn test_verify_integrity_detailed() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::EmbedStart {
            repo_path: "/test".to_string(),
            settings_hash: "hash".to_string(),
        });

        let report = log.verify_integrity_detailed();
        assert!(report.is_valid);
        assert_eq!(report.entries_checked, 2);
        assert!(report.errors.is_empty());
    }

    // A forged prev_hash must be reported as ChainBroken.
    #[test]
    fn test_tamper_detection_chain_broken() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::Custom {
            name: "op1".to_string(),
            data: "data1".to_string(),
        });
        log.record(AuditOperation::Custom {
            name: "op2".to_string(),
            data: "data2".to_string(),
        });

        // Break the chain by modifying prev_hash
        log.entries[2].prev_hash = "fake_hash".to_string();

        let report = log.verify_integrity_detailed();
        assert!(!report.is_valid);
        assert!(!report.errors.is_empty());
        assert!(matches!(
            report.errors[0],
            IntegrityError::ChainBroken { .. }
        ));
    }

    // A forged stored hash must be reported as HashMismatch.
    #[test]
    fn test_tamper_detection_hash_mismatch() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::Custom {
            name: "op1".to_string(),
            data: "data1".to_string(),
        });

        // Modify the entry's own hash
        log.entries[1].hash = "fake_hash".to_string();

        let report = log.verify_integrity_detailed();
        assert!(!report.is_valid);
        assert!(report
            .errors
            .iter()
            .any(|e| matches!(e, IntegrityError::HashMismatch { .. })));
    }

    // head_hash tracks the most recent entry and record() returns that hash.
    #[test]
    fn test_head_hash() {
        let mut log = AuditLog::new();

        let initial_head = log.head_hash().map(String::from);
        assert!(initial_head.is_some());

        let new_hash = log.record(AuditOperation::Custom {
            name: "test".to_string(),
            data: "data".to_string(),
        });

        assert_eq!(log.head_hash(), Some(new_hash.as_str()));
        assert_ne!(log.head_hash().map(String::from), initial_head);
    }

    // filter_by_type selects entries by operation variant.
    #[test]
    fn test_filter_by_type() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::EmbedStart {
            repo_path: "/repo1".to_string(),
            settings_hash: "h1".to_string(),
        });
        log.record(AuditOperation::EmbedComplete {
            chunks_count: 100,
            total_tokens: 50000,
            manifest_hash: "m1".to_string(),
        });
        log.record(AuditOperation::EmbedStart {
            repo_path: "/repo2".to_string(),
            settings_hash: "h2".to_string(),
        });

        let starts = log.filter_by_type(|op| matches!(op, AuditOperation::EmbedStart { .. }));
        assert_eq!(starts.len(), 2);

        let completes = log.filter_by_type(|op| matches!(op, AuditOperation::EmbedComplete { .. }));
        assert_eq!(completes.len(), 1);
    }

    // filter_by_time with a generous window around "now" catches fresh entries.
    #[test]
    fn test_filter_by_time() {
        let mut log = AuditLog::new();

        // All entries will have the same timestamp (or very close)
        log.record(AuditOperation::Custom {
            name: "test".to_string(),
            data: "data".to_string(),
        });

        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs();

        let entries = log.filter_by_time(now - 60, now + 60);
        assert!(!entries.is_empty());
    }

    // JSON round trip preserves the chain, including every hash link.
    #[test]
    fn test_save_and_load() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let log_path = temp_dir.path().join("audit.json");

        let mut log = AuditLog::new();
        log.record(AuditOperation::EmbedStart {
            repo_path: "/test/repo".to_string(),
            settings_hash: "abc123".to_string(),
        });
        log.record(AuditOperation::EmbedComplete {
            chunks_count: 100,
            total_tokens: 50000,
            manifest_hash: "def456".to_string(),
        });

        // Save
        log.save(&log_path).unwrap();

        // Load and verify
        let loaded = AuditLog::load(&log_path).unwrap();
        assert_eq!(loaded.entries.len(), log.entries.len());
        assert!(loaded.verify_integrity());

        // Compare hashes
        for (orig, loaded) in log.entries.iter().zip(loaded.entries.iter()) {
            assert_eq!(orig.hash, loaded.hash);
            assert_eq!(orig.prev_hash, loaded.prev_hash);
        }
    }

    // JSONL output: one header line plus one line per entry.
    #[test]
    fn test_save_jsonl() {
        let temp_dir = tempfile::TempDir::new().unwrap();
        let log_path = temp_dir.path().join("audit.jsonl");

        let mut log = AuditLog::new();
        log.record(AuditOperation::Custom {
            name: "test".to_string(),
            data: "data".to_string(),
        });

        log.save_jsonl(&log_path).unwrap();

        // Verify file exists and has content
        let content = std::fs::read_to_string(&log_path).unwrap();
        assert!(!content.is_empty());

        // Count lines (header + entries)
        let lines: Vec<_> = content.lines().collect();
        assert_eq!(lines.len(), 3); // header + 2 entries
    }

    // SecurityScan entries round-trip through record + filter with their fields.
    #[test]
    fn test_security_scan_operation() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::SecurityScan {
            findings_count: 5,
            secrets_redacted: true,
        });

        assert!(log.verify_integrity());

        let scans = log.filter_by_type(|op| matches!(op, AuditOperation::SecurityScan { .. }));
        assert_eq!(scans.len(), 1);

        if let AuditOperation::SecurityScan {
            findings_count,
            secrets_redacted,
        } = &scans[0].operation
        {
            assert_eq!(*findings_count, 5);
            assert!(*secrets_redacted);
        }
    }

    // A full batch lifecycle (start, per-repo, complete) keeps the chain valid.
    #[test]
    fn test_batch_operations() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::BatchStart {
            repo_count: 3,
            total_settings_hash: "settings_hash".to_string(),
        });

        for i in 0..3 {
            log.record(AuditOperation::BatchRepoComplete {
                repo_index: i,
                repo_path: format!("/repo{}", i),
                chunks_count: 100 * (i + 1),
                success: true,
            });
        }

        log.record(AuditOperation::BatchComplete {
            successful: 3,
            failed: 0,
            total_chunks: 600,
        });

        assert!(log.verify_integrity());
        assert_eq!(log.entries.len(), 6); // LogCreated + BatchStart + 3 repos + BatchComplete
    }

    // Checkpoint create/resume operations record cleanly.
    #[test]
    fn test_checkpoint_operations() {
        let mut log = AuditLog::new();

        log.record(AuditOperation::CheckpointCreated {
            checkpoint_hash: "ckpt_abc123".to_string(),
            files_processed: 50,
            chunks_generated: 200,
        });

        log.record(AuditOperation::ResumeFromCheckpoint {
            checkpoint_hash: "ckpt_abc123".to_string(),
            files_remaining: 100,
        });

        assert!(log.verify_integrity());
    }
}