Skip to main content

trueno/tuner/data_collector/
persistence.rs

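//! Persistent training data storage (T-TUNER-003, GitHub #80).
//!
//! APR binary format: `MAGIC(4) + LEN(4) + JSON(n) + CRC32(4)`, where `MAGIC` is
//! `b"APR2"`, `LEN` is the JSON byte length as a little-endian `u32`, and `CRC32`
//! is the checksum of the JSON bytes, also stored little-endian.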
4
5use crate::tuner::error::TunerError;
6use crate::tuner::features::FeatureExtractor;
7use crate::tuner::helpers::crc32_hash;
8
9use super::types::TrainingSample;
10use super::TunerDataCollector;
11
12impl TunerDataCollector {
13    // ========================================================================
14    // T-TUNER-003: Persistent Training Data (GitHub #80)
15    // ========================================================================
16
17    /// Training data cache path
18    #[cfg(feature = "hardware-detect")]
19    pub fn cache_path() -> std::path::PathBuf {
20        let hw_id = Self::hardware_id();
21        dirs::cache_dir()
22            .unwrap_or_else(|| std::path::PathBuf::from(".cache"))
23            .join("trueno")
24            .join(format!("training_data_{}.apr", hw_id))
25    }
26
27    /// Generate hardware fingerprint for hardware-specific models
28    #[cfg(feature = "hardware-detect")]
29    pub fn hardware_id() -> String {
30        use crate::hardware::HardwareCapability;
31        let hw = HardwareCapability::detect();
32
33        // Create a stable fingerprint from hardware characteristics
34        let fingerprint = format!(
35            "{}-{:?}-{}-{}",
36            hw.cpu.cores,
37            hw.cpu.simd,
38            hw.gpu.as_ref().map(|g| g.model.as_str()).unwrap_or("none"),
39            hw.gpu.as_ref().map(|g| g.vram_gb as u32).unwrap_or(0),
40        );
41
42        // Hash to short hex string
43        let hash = crc32_hash(fingerprint.as_bytes());
44        format!("{:08x}", hash)
45    }
46
47    /// Load from cache or create empty
48    #[cfg(feature = "hardware-detect")]
49    pub fn load_or_create() -> Self {
50        let path = Self::cache_path();
51        if path.exists() {
52            if let Ok(collector) = Self::load_apr(&path) {
53                return collector;
54            }
55        }
56        Self::new()
57    }
58
59    /// Save training data to APR format
60    pub fn save_apr<P: AsRef<std::path::Path>>(&self, path: P) -> Result<(), TunerError> {
61        use std::io::Write;
62
63        // Ensure parent directory exists
64        if let Some(parent) = path.as_ref().parent() {
65            std::fs::create_dir_all(parent)
66                .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
67        }
68
69        // Serialize samples to JSON
70        let json = serde_json::to_string(&self.samples)
71            .map_err(|e| TunerError::Serialization(e.to_string()))?;
72        let json_bytes = json.as_bytes();
73
74        // Create APR format: MAGIC + LEN + JSON + CRC32
75        let mut file = std::fs::File::create(path.as_ref())
76            .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
77
78        // Write magic bytes: "APR2" (version 2 for training data)
79        file.write_all(b"APR2").map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
80
81        // Write length as u32 little-endian
82        let len = json_bytes.len() as u32;
83        file.write_all(&len.to_le_bytes())
84            .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
85
86        // Write JSON
87        file.write_all(json_bytes).map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
88
89        // Write CRC32 checksum
90        let checksum = crc32_hash(json_bytes);
91        file.write_all(&checksum.to_le_bytes())
92            .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
93
94        Ok(())
95    }
96
97    /// Load training data from APR format
98    pub fn load_apr<P: AsRef<std::path::Path>>(path: P) -> Result<Self, TunerError> {
99        use std::io::Read;
100
101        let mut file = std::fs::File::open(path.as_ref())
102            .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
103
104        // Read and verify magic
105        let mut magic = [0u8; 4];
106        file.read_exact(&mut magic).map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
107        if &magic != b"APR2" {
108            return Err(TunerError::InvalidFormat(format!("Expected APR2 magic, got {:?}", magic)));
109        }
110
111        // Read length
112        let mut len_bytes = [0u8; 4];
113        file.read_exact(&mut len_bytes)
114            .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
115        let len = u32::from_le_bytes(len_bytes) as usize;
116
117        // Read JSON
118        let mut json_bytes = vec![0u8; len];
119        file.read_exact(&mut json_bytes)
120            .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
121
122        // Read and verify CRC32
123        let mut crc_bytes = [0u8; 4];
124        file.read_exact(&mut crc_bytes)
125            .map_err(|e: std::io::Error| TunerError::Io(e.to_string()))?;
126        let stored_crc = u32::from_le_bytes(crc_bytes);
127        let computed_crc = crc32_hash(&json_bytes);
128
129        if stored_crc != computed_crc {
130            return Err(TunerError::InvalidFormat(format!(
131                "CRC mismatch: stored={:08x}, computed={:08x}",
132                stored_crc, computed_crc
133            )));
134        }
135
136        // Deserialize samples
137        let samples: Vec<TrainingSample> = serde_json::from_slice(&json_bytes)
138            .map_err(|e| TunerError::Serialization(e.to_string()))?;
139
140        Ok(Self {
141            samples,
142            extractor: FeatureExtractor::new(),
143            retrain_threshold: 100,
144            samples_at_last_train: 0,
145            feedback: Vec::new(),
146            online_learning_enabled: false,
147            error_window: Vec::new(),
148            error_window_size: Self::DEFAULT_ERROR_WINDOW_SIZE,
149        })
150    }
151
152    /// Append a sample to the cached training data
153    #[cfg(feature = "hardware-detect")]
154    pub fn record_and_persist(
155        &mut self,
156        profiler: &crate::brick::BrickProfiler,
157        config: &crate::tuner::features::RunConfig,
158        kernel: crate::tuner::types::KernelType,
159    ) -> Result<(), TunerError> {
160        // Record the sample
161        self.record(profiler, config, kernel);
162
163        // Append to cache file
164        let path = Self::cache_path();
165        self.save_apr(&path)?;
166
167        Ok(())
168    }
169
170    /// Import samples from JSON
171    pub fn from_json(json: &str) -> Result<Self, TunerError> {
172        let samples: Vec<TrainingSample> =
173            serde_json::from_str(json).map_err(|e| TunerError::Serialization(e.to_string()))?;
174
175        Ok(Self {
176            samples,
177            extractor: FeatureExtractor::new(),
178            retrain_threshold: 100,
179            samples_at_last_train: 0,
180            feedback: Vec::new(),
181            online_learning_enabled: false,
182            error_window: Vec::new(),
183            error_window_size: Self::DEFAULT_ERROR_WINDOW_SIZE,
184        })
185    }
186
187    /// Import samples from the Five-Whys archive (85 labeled iterations)
188    /// Bootstrap initial training data from historical analysis
189    pub fn bootstrap_from_five_whys() -> Self {
190        // Five-Whys archive has 85 labeled iterations from SHOWCASE-BRICK-001
191        // Each iteration has: features, throughput, kernel selection, bottleneck
192
193        // Returns empty collector -- data will be collected from real runs
194        // Five-Whys archive loading is not yet implemented
195        Self::new()
196    }
197}