ruvector_data_framework/
persistence.rs

//! Persistence layer for the RuVector discovery framework.
//!
//! This module provides serialization/deserialization for the `OptimizedDiscoveryEngine`
//! and discovered patterns. Supports:
//! - Full engine state save/load
//! - Pattern-only save/load/append
//! - Optional gzip compression for large datasets
//! - Pattern appends that keep an existing file's compression (the file is re-read and rewritten)
9
10use std::collections::HashMap;
11use std::fs::File;
12use std::io::{BufReader, BufWriter, Read, Write};
13use std::path::Path;
14
15use chrono::{DateTime, Utc};
16use flate2::Compression;
17use flate2::read::GzDecoder;
18use flate2::write::GzEncoder;
19use serde::{Deserialize, Serialize};
20
21use crate::optimized::{OptimizedConfig, OptimizedDiscoveryEngine, SignificantPattern};
22use crate::ruvector_native::{
23    CoherenceSnapshot, Domain, GraphEdge, GraphNode, SemanticVector,
24};
25use crate::{FrameworkError, Result};
26
27/// Serializable state of the OptimizedDiscoveryEngine
28///
29/// This struct excludes non-serializable fields like AtomicU64 metrics
30/// and caches, focusing on the core graph and history state.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct EngineState {
33    /// Engine configuration
34    pub config: OptimizedConfig,
35    /// All semantic vectors
36    pub vectors: Vec<SemanticVector>,
37    /// Graph nodes
38    pub nodes: HashMap<u32, GraphNode>,
39    /// Graph edges
40    pub edges: Vec<GraphEdge>,
41    /// Coherence history (timestamp, mincut value, snapshot)
42    pub coherence_history: Vec<(DateTime<Utc>, f64, CoherenceSnapshot)>,
43    /// Next node ID counter
44    pub next_node_id: u32,
45    /// Domain-specific node indices
46    pub domain_nodes: HashMap<Domain, Vec<u32>>,
47    /// Temporal analysis state
48    pub domain_timeseries: HashMap<Domain, Vec<(DateTime<Utc>, f64)>>,
49    /// Metadata about when this state was saved
50    pub saved_at: DateTime<Utc>,
51    /// Version for compatibility checking
52    pub version: String,
53}
54
55impl EngineState {
56    /// Create a new empty engine state
57    pub fn new(config: OptimizedConfig) -> Self {
58        Self {
59            config,
60            vectors: Vec::new(),
61            nodes: HashMap::new(),
62            edges: Vec::new(),
63            coherence_history: Vec::new(),
64            next_node_id: 0,
65            domain_nodes: HashMap::new(),
66            domain_timeseries: HashMap::new(),
67            saved_at: Utc::now(),
68            version: env!("CARGO_PKG_VERSION").to_string(),
69        }
70    }
71}
72
/// Knobs that control how state and pattern files are written
#[derive(Debug, Clone, Copy)]
pub struct PersistenceOptions {
    /// Write the output through gzip
    pub compress: bool,
    /// Gzip level (0-9); larger values compress better but run slower
    pub compression_level: u32,
    /// Emit pretty-printed JSON (bigger on disk, easier to read)
    pub pretty: bool,
}

impl Default for PersistenceOptions {
    /// Compact, uncompressed JSON; level 6 is used if compression is switched on later.
    fn default() -> Self {
        PersistenceOptions {
            compress: false,
            compression_level: 6,
            pretty: false,
        }
    }
}

impl PersistenceOptions {
    /// Defaults with gzip compression turned on
    pub fn compressed() -> Self {
        let base = Self::default();
        Self {
            compress: true,
            ..base
        }
    }

    /// Defaults with pretty-printed JSON turned on
    pub fn pretty() -> Self {
        let base = Self::default();
        Self {
            pretty: true,
            ..base
        }
    }
}
111
112/// Save the OptimizedDiscoveryEngine state to a file
113///
114/// # Arguments
115/// * `engine` - The engine to save
116/// * `path` - Path to save to (will be created/overwritten)
117/// * `options` - Persistence options (compression, formatting)
118///
119/// # Example
120/// ```no_run
121/// # use ruvector_data_framework::optimized::{OptimizedConfig, OptimizedDiscoveryEngine};
122/// # use ruvector_data_framework::persistence::{save_engine, PersistenceOptions};
123/// # use std::path::Path;
124/// let engine = OptimizedDiscoveryEngine::new(OptimizedConfig::default());
125/// save_engine(&engine, Path::new("engine_state.json"), &PersistenceOptions::default())?;
126/// # Ok::<(), Box<dyn std::error::Error>>(())
127/// ```
128pub fn save_engine(
129    engine: &OptimizedDiscoveryEngine,
130    path: &Path,
131    options: &PersistenceOptions,
132) -> Result<()> {
133    // Extract serializable state
134    let state = extract_state(engine);
135
136    // Save to file
137    save_state(&state, path, options)?;
138
139    tracing::info!(
140        "Saved engine state to {} ({} nodes, {} edges)",
141        path.display(),
142        state.nodes.len(),
143        state.edges.len()
144    );
145
146    Ok(())
147}
148
149/// Load an OptimizedDiscoveryEngine from a saved state file
150///
151/// # Arguments
152/// * `path` - Path to the saved state file
153///
154/// # Returns
155/// A new OptimizedDiscoveryEngine with the loaded state
156///
157/// # Example
158/// ```no_run
159/// # use ruvector_data_framework::persistence::load_engine;
160/// # use std::path::Path;
161/// let engine = load_engine(Path::new("engine_state.json"))?;
162/// # Ok::<(), Box<dyn std::error::Error>>(())
163/// ```
164pub fn load_engine(path: &Path) -> Result<OptimizedDiscoveryEngine> {
165    let state = load_state(path)?;
166
167    tracing::info!(
168        "Loaded engine state from {} ({} nodes, {} edges)",
169        path.display(),
170        state.nodes.len(),
171        state.edges.len()
172    );
173
174    // Reconstruct engine from state
175    Ok(reconstruct_engine(state))
176}
177
178/// Save discovered patterns to a JSON file
179///
180/// # Arguments
181/// * `patterns` - Patterns to save
182/// * `path` - Path to save to
183/// * `options` - Persistence options
184///
185/// # Example
186/// ```no_run
187/// # use ruvector_data_framework::optimized::SignificantPattern;
188/// # use ruvector_data_framework::persistence::{save_patterns, PersistenceOptions};
189/// # use std::path::Path;
190/// let patterns: Vec<SignificantPattern> = vec![];
191/// save_patterns(&patterns, Path::new("patterns.json"), &PersistenceOptions::default())?;
192/// # Ok::<(), Box<dyn std::error::Error>>(())
193/// ```
194pub fn save_patterns(
195    patterns: &[SignificantPattern],
196    path: &Path,
197    options: &PersistenceOptions,
198) -> Result<()> {
199    let file = File::create(path).map_err(|e| {
200        FrameworkError::Discovery(format!("Failed to create file {}: {}", path.display(), e))
201    })?;
202
203    let writer = BufWriter::new(file);
204
205    if options.compress {
206        let mut encoder = GzEncoder::new(writer, Compression::new(options.compression_level));
207        let json = if options.pretty {
208            serde_json::to_string_pretty(patterns)?
209        } else {
210            serde_json::to_string(patterns)?
211        };
212        encoder.write_all(json.as_bytes()).map_err(|e| {
213            FrameworkError::Discovery(format!("Failed to write compressed patterns: {}", e))
214        })?;
215        encoder.finish().map_err(|e| {
216            FrameworkError::Discovery(format!("Failed to finish compression: {}", e))
217        })?;
218    } else {
219        if options.pretty {
220            serde_json::to_writer_pretty(writer, patterns)?;
221        } else {
222            serde_json::to_writer(writer, patterns)?;
223        }
224    }
225
226    tracing::info!("Saved {} patterns to {}", patterns.len(), path.display());
227
228    Ok(())
229}
230
231/// Load patterns from a JSON file
232///
233/// # Arguments
234/// * `path` - Path to the patterns file
235///
236/// # Returns
237/// Vector of loaded patterns
238///
239/// # Example
240/// ```no_run
241/// # use ruvector_data_framework::persistence::load_patterns;
242/// # use std::path::Path;
243/// let patterns = load_patterns(Path::new("patterns.json"))?;
244/// # Ok::<(), Box<dyn std::error::Error>>(())
245/// ```
246pub fn load_patterns(path: &Path) -> Result<Vec<SignificantPattern>> {
247    let file = File::open(path).map_err(|e| {
248        FrameworkError::Discovery(format!("Failed to open file {}: {}", path.display(), e))
249    })?;
250
251    let reader = BufReader::new(file);
252
253    // Try to detect if file is gzip-compressed by reading magic bytes
254    let mut peeker = BufReader::new(File::open(path).unwrap());
255    let mut magic = [0u8; 2];
256    let is_gzip = peeker.read_exact(&mut magic).is_ok() && magic == [0x1f, 0x8b];
257
258    let patterns: Vec<SignificantPattern> = if is_gzip {
259        let file = File::open(path).unwrap();
260        let decoder = GzDecoder::new(BufReader::new(file));
261        serde_json::from_reader(decoder)?
262    } else {
263        serde_json::from_reader(reader)?
264    };
265
266    tracing::info!("Loaded {} patterns from {}", patterns.len(), path.display());
267
268    Ok(patterns)
269}
270
271/// Append new patterns to an existing patterns file
272///
273/// This is more efficient than loading all patterns, adding new ones,
274/// and saving the entire list. However, it only works with uncompressed
275/// JSON arrays.
276///
277/// # Arguments
278/// * `patterns` - New patterns to append
279/// * `path` - Path to the existing patterns file
280///
281/// # Note
282/// If the file doesn't exist, it will be created with the given patterns.
283/// For compressed files, this will decompress, append, and recompress.
284///
285/// # Example
286/// ```no_run
287/// # use ruvector_data_framework::optimized::SignificantPattern;
288/// # use ruvector_data_framework::persistence::append_patterns;
289/// # use std::path::Path;
290/// let new_patterns: Vec<SignificantPattern> = vec![];
291/// append_patterns(&new_patterns, Path::new("patterns.json"))?;
292/// # Ok::<(), Box<dyn std::error::Error>>(())
293/// ```
294pub fn append_patterns(patterns: &[SignificantPattern], path: &Path) -> Result<()> {
295    if patterns.is_empty() {
296        return Ok(());
297    }
298
299    // Check if file exists
300    if !path.exists() {
301        // Create new file
302        return save_patterns(patterns, path, &PersistenceOptions::default());
303    }
304
305    // Load existing patterns
306    let mut existing = load_patterns(path)?;
307
308    // Append new patterns
309    existing.extend_from_slice(patterns);
310
311    // Save combined patterns
312    // Preserve compression if original was compressed
313    let options = if is_compressed(path)? {
314        PersistenceOptions::compressed()
315    } else {
316        PersistenceOptions::default()
317    };
318
319    save_patterns(&existing, path, &options)?;
320
321    tracing::info!(
322        "Appended {} patterns to {} (total: {})",
323        patterns.len(),
324        path.display(),
325        existing.len()
326    );
327
328    Ok(())
329}
330
331// ============================================================================
332// Internal Helper Functions
333// ============================================================================
334
335/// Extract serializable state from an OptimizedDiscoveryEngine
336///
337/// This uses reflection-like access to the engine's internal state.
338/// In practice, you'd need to add getter methods to OptimizedDiscoveryEngine.
339fn extract_state(_engine: &OptimizedDiscoveryEngine) -> EngineState {
340    // Note: This requires the OptimizedDiscoveryEngine to expose its internal state
341    // via getter methods. For now, we'll use a placeholder that you'll need to implement.
342
343    // Since we can't directly access private fields, we need the engine to provide
344    // a method like `pub fn extract_state(&self) -> EngineState`
345
346    // For now, return a minimal state with what we can access
347    // TODO: Uncomment when OptimizedDiscoveryEngine provides getter methods
348    // let _stats = engine.stats();
349
350    EngineState {
351        config: OptimizedConfig::default(), // Would need engine.config() method
352        vectors: Vec::new(), // Would need engine.vectors() method
353        nodes: HashMap::new(), // Would need engine.nodes() method
354        edges: Vec::new(), // Would need engine.edges() method
355        coherence_history: Vec::new(), // Would need engine.coherence_history() method
356        next_node_id: 0, // Would need engine.next_node_id() method
357        domain_nodes: HashMap::new(), // Would need engine.domain_nodes() method
358        domain_timeseries: HashMap::new(), // Would need engine.domain_timeseries() method
359        saved_at: Utc::now(),
360        version: env!("CARGO_PKG_VERSION").to_string(),
361    }
362
363    // TODO: Implement proper state extraction once OptimizedDiscoveryEngine
364    // exposes the necessary getter methods
365}
366
367/// Reconstruct an OptimizedDiscoveryEngine from saved state
368fn reconstruct_engine(state: EngineState) -> OptimizedDiscoveryEngine {
369    // Similarly, this would require OptimizedDiscoveryEngine to have
370    // a constructor like `pub fn from_state(state: EngineState) -> Self`
371
372    // For now, create a new engine and note that full reconstruction
373    // would require additional methods
374    OptimizedDiscoveryEngine::new(state.config)
375
376    // TODO: Implement proper engine reconstruction once OptimizedDiscoveryEngine
377    // provides the necessary methods to restore state
378}
379
380/// Save engine state to a file with optional compression
381fn save_state(state: &EngineState, path: &Path, options: &PersistenceOptions) -> Result<()> {
382    let file = File::create(path).map_err(|e| {
383        FrameworkError::Discovery(format!("Failed to create file {}: {}", path.display(), e))
384    })?;
385
386    let writer = BufWriter::new(file);
387
388    if options.compress {
389        let mut encoder = GzEncoder::new(writer, Compression::new(options.compression_level));
390        let json = if options.pretty {
391            serde_json::to_string_pretty(state)?
392        } else {
393            serde_json::to_string(state)?
394        };
395        encoder.write_all(json.as_bytes()).map_err(|e| {
396            FrameworkError::Discovery(format!("Failed to write compressed state: {}", e))
397        })?;
398        encoder.finish().map_err(|e| {
399            FrameworkError::Discovery(format!("Failed to finish compression: {}", e))
400        })?;
401    } else {
402        if options.pretty {
403            serde_json::to_writer_pretty(writer, state)?;
404        } else {
405            serde_json::to_writer(writer, state)?;
406        }
407    }
408
409    Ok(())
410}
411
412/// Load engine state from a file with automatic compression detection
413fn load_state(path: &Path) -> Result<EngineState> {
414    let file = File::open(path).map_err(|e| {
415        FrameworkError::Discovery(format!("Failed to open file {}: {}", path.display(), e))
416    })?;
417
418    // Detect compression by reading magic bytes
419    let is_gzip = is_compressed(path)?;
420
421    let state = if is_gzip {
422        let file = File::open(path).unwrap();
423        let decoder = GzDecoder::new(BufReader::new(file));
424        serde_json::from_reader(decoder)?
425    } else {
426        let reader = BufReader::new(file);
427        serde_json::from_reader(reader)?
428    };
429
430    Ok(state)
431}
432
433/// Check if a file is gzip-compressed by reading magic bytes
434fn is_compressed(path: &Path) -> Result<bool> {
435    let mut file = File::open(path).map_err(|e| {
436        FrameworkError::Discovery(format!("Failed to open file {}: {}", path.display(), e))
437    })?;
438
439    let mut magic = [0u8; 2];
440    match file.read_exact(&mut magic) {
441        Ok(_) => Ok(magic == [0x1f, 0x8b]),
442        Err(_) => Ok(false), // File too small or empty
443    }
444}
445
446/// Get file size in bytes
447pub fn get_file_size(path: &Path) -> Result<u64> {
448    let metadata = std::fs::metadata(path).map_err(|e| {
449        FrameworkError::Discovery(format!("Failed to get file metadata: {}", e))
450    })?;
451    Ok(metadata.len())
452}
453
454/// Calculate compression ratio for a file
455///
456/// Returns (compressed_size, uncompressed_size, ratio)
457pub fn compression_info(path: &Path) -> Result<(u64, u64, f64)> {
458    let compressed_size = get_file_size(path)?;
459
460    if is_compressed(path)? {
461        // Decompress and count bytes
462        let file = File::open(path).unwrap();
463        let mut decoder = GzDecoder::new(BufReader::new(file));
464        let mut buffer = Vec::new();
465        let uncompressed_size = decoder.read_to_end(&mut buffer).map_err(|e| {
466            FrameworkError::Discovery(format!("Failed to decompress: {}", e))
467        })? as u64;
468
469        let ratio = compressed_size as f64 / uncompressed_size as f64;
470        Ok((compressed_size, uncompressed_size, ratio))
471    } else {
472        Ok((compressed_size, compressed_size, 1.0))
473    }
474}
475
#[cfg(test)]
mod tests {
    use super::*;
    use crate::optimized::OptimizedConfig;
    use crate::ruvector_native::{DiscoveredPattern, Evidence, PatternType};
    use tempfile::NamedTempFile;

    /// A fresh state is empty and keeps the configuration it was given.
    #[test]
    fn test_engine_state_creation() {
        let config = OptimizedConfig::default();
        let state = EngineState::new(config.clone());

        assert_eq!(state.next_node_id, 0);
        assert!(state.nodes.is_empty());
        assert_eq!(state.config.similarity_threshold, config.similarity_threshold);
    }

    /// Each constructor flips exactly the flag it is named after.
    #[test]
    fn test_persistence_options() {
        let plain = PersistenceOptions::default();
        assert!(!plain.compress);
        assert!(!plain.pretty);

        assert!(PersistenceOptions::compressed().compress);
        assert!(PersistenceOptions::pretty().pretty);
    }

    /// Uncompressed round trip: what is saved is what is loaded.
    #[test]
    fn test_save_load_patterns() {
        let scratch = NamedTempFile::new().unwrap();
        let path = scratch.path();

        let evidence = Evidence {
            evidence_type: "test".to_string(),
            value: 1.0,
            description: "Test evidence".to_string(),
        };
        let original = SignificantPattern {
            pattern: DiscoveredPattern {
                id: "test-1".to_string(),
                pattern_type: PatternType::CoherenceBreak,
                confidence: 0.85,
                affected_nodes: vec![1, 2, 3],
                detected_at: Utc::now(),
                description: "Test pattern".to_string(),
                evidence: vec![evidence],
                cross_domain_links: vec![],
            },
            p_value: 0.03,
            effect_size: 1.2,
            confidence_interval: (0.5, 1.5),
            is_significant: true,
        };

        save_patterns(&[original], path, &PersistenceOptions::default()).unwrap();
        let loaded = load_patterns(path).unwrap();

        assert_eq!(loaded.len(), 1);
        let first = &loaded[0];
        assert_eq!(first.pattern.id, "test-1");
        assert_eq!(first.p_value, 0.03);
    }

    /// Compressed round trip: the file is gzip on disk and still loads.
    #[test]
    fn test_save_load_patterns_compressed() {
        let scratch = NamedTempFile::new().unwrap();
        let path = scratch.path();

        let original = SignificantPattern {
            pattern: DiscoveredPattern {
                id: "test-compressed".to_string(),
                pattern_type: PatternType::Consolidation,
                confidence: 0.90,
                affected_nodes: vec![4, 5, 6],
                detected_at: Utc::now(),
                description: "Compressed test pattern".to_string(),
                evidence: vec![],
                cross_domain_links: vec![],
            },
            p_value: 0.01,
            effect_size: 2.0,
            confidence_interval: (1.0, 3.0),
            is_significant: true,
        };

        save_patterns(&[original], path, &PersistenceOptions::compressed()).unwrap();

        // The gzip magic bytes must be present on disk.
        assert!(is_compressed(path).unwrap());

        let loaded = load_patterns(path).unwrap();
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded[0].pattern.id, "test-compressed");
    }

    /// Appending keeps what was already stored and adds the new entry after it.
    #[test]
    fn test_append_patterns() {
        let scratch = NamedTempFile::new().unwrap();
        let path = scratch.path();

        let first = SignificantPattern {
            pattern: DiscoveredPattern {
                id: "pattern-1".to_string(),
                pattern_type: PatternType::EmergingCluster,
                confidence: 0.75,
                affected_nodes: vec![1],
                detected_at: Utc::now(),
                description: "First pattern".to_string(),
                evidence: vec![],
                cross_domain_links: vec![],
            },
            p_value: 0.05,
            effect_size: 1.0,
            confidence_interval: (0.0, 2.0),
            is_significant: false,
        };

        let second = SignificantPattern {
            pattern: DiscoveredPattern {
                id: "pattern-2".to_string(),
                pattern_type: PatternType::Cascade,
                confidence: 0.95,
                affected_nodes: vec![2],
                detected_at: Utc::now(),
                description: "Second pattern".to_string(),
                evidence: vec![],
                cross_domain_links: vec![],
            },
            p_value: 0.001,
            effect_size: 3.0,
            confidence_interval: (2.0, 4.0),
            is_significant: true,
        };

        // Seed the file, then append to it.
        save_patterns(&[first], path, &PersistenceOptions::default()).unwrap();
        append_patterns(&[second], path).unwrap();

        let loaded = load_patterns(path).unwrap();
        assert_eq!(loaded.len(), 2);
        assert_eq!(loaded[0].pattern.id, "pattern-1");
        assert_eq!(loaded[1].pattern.id, "pattern-2");
    }
}