// siftdb_core/bench.rs

1use crate::{SiftDB, ingest::{Ingester, IngestOptions}};
2use std::time::{Duration, Instant, SystemTime};
3use std::path::Path;
4use std::fs;
5use anyhow::Result;
6use serde::{Serialize, Deserialize};
7
/// Outcome of a single benchmark step.
///
/// The same record shape is used for the initialization, import, and search
/// benchmarks in this file, so several fields are only meaningful for some of
/// them (see the per-field notes).
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct BenchmarkResults {
    /// Human-readable benchmark name; printed as the section header and also
    /// used by the summary to pick out the "Import" and "Search" entries.
    pub name: String,
    /// Measured wall-clock time. For search benchmarks this is the average
    /// time per query iteration rather than the total.
    pub duration: Duration, 
    /// Files ingested (import benchmark). Search benchmarks store the hit
    /// count of a query here instead. Zero suppresses the file lines in `print`.
    pub files_processed: u64,
    /// Source bytes covered by the benchmark; zero when not applicable.
    pub bytes_processed: u64,
    /// Query rate; only populated by the search benchmarks.
    pub queries_per_second: Option<f64>,
    /// Throughput in units of 1024 * 1024 bytes per second; only populated by
    /// the import benchmark.
    pub throughput_mbps: Option<f64>,
}
17
/// Serializable record of one complete benchmark run, written to disk as JSON
/// by `SiftDBBenchmark::save_results`.
#[derive(Serialize, Deserialize)]
pub struct BenchmarkSuite {
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Seconds since the Unix epoch, rendered as a decimal string.
    pub timestamp: String,
    /// Short commit hash reported by `git rev-parse --short HEAD`, or `None`
    /// when git could not be run or reported failure.
    pub git_commit: Option<String>,
    /// Description of the machine the benchmarks ran on.
    pub test_environment: TestEnvironment,
    /// Individual benchmark results, in execution order.
    pub benchmarks: Vec<BenchmarkResults>,
}
26
/// Description of the host a benchmark suite was run on.
#[derive(Serialize, Deserialize)]
pub struct TestEnvironment {
    /// Operating system name; filled from `std::env::consts::OS`.
    pub os: String,
    /// CPU description; currently always written as the placeholder "unknown".
    pub cpu: String,
    /// Memory description; currently always written as the placeholder "unknown".
    pub memory: String,
}
33
34impl BenchmarkResults {
35    pub fn print(&self) {
36        println!("=== {} ===", self.name);
37        println!("Duration: {:.2}s", self.duration.as_secs_f64());
38        if self.files_processed > 0 {
39            println!("Files processed: {}", self.files_processed);
40            println!("Files/sec: {:.1}", self.files_processed as f64 / self.duration.as_secs_f64());
41        }
42        if self.bytes_processed > 0 {
43            let mb = self.bytes_processed as f64 / (1024.0 * 1024.0);
44            println!("Data processed: {:.2} MB", mb);
45            if let Some(throughput) = self.throughput_mbps {
46                println!("Throughput: {:.2} MB/s", throughput);
47            }
48        }
49        if let Some(qps) = self.queries_per_second {
50            println!("Queries/sec: {:.1}", qps);
51        }
52        println!();
53    }
54}
55
/// Benchmark driver that initializes a collection, imports a source tree into
/// it, and times a fixed set of search queries against the result.
pub struct SiftDBBenchmark {
    /// Location of the collection that is created and queried by the benchmarks.
    collection_path: std::path::PathBuf,
    /// Root of the directory tree that is imported into the collection.
    source_path: std::path::PathBuf,
}
60
61impl SiftDBBenchmark {
62    pub fn new<P1: AsRef<Path>, P2: AsRef<Path>>(collection_path: P1, source_path: P2) -> Self {
63        Self {
64            collection_path: collection_path.as_ref().to_path_buf(),
65            source_path: source_path.as_ref().to_path_buf(),
66        }
67    }
68
69    pub fn run_all(&mut self) -> Vec<BenchmarkResults> {
70        println!("🚀 SiftDB Performance Benchmark");
71        println!("================================");
72        println!();
73        
74        let mut results = Vec::new();
75        
76        // Initialize collection
77        results.push(self.bench_init());
78        
79        // Import benchmark
80        results.push(self.bench_import());
81        
82        // Search benchmarks
83        results.extend(self.bench_searches());
84        
85        self.print_summary(&results);
86        
87        // Save results to file
88        if let Err(e) = self.save_results(&results) {
89            eprintln!("Warning: Failed to save benchmark results: {}", e);
90        }
91        
92        results
93    }
94
95    fn bench_init(&self) -> BenchmarkResults {
96        let start = Instant::now();
97        
98        SiftDB::init(&self.collection_path).expect("Failed to initialize collection");
99        
100        let duration = start.elapsed();
101        
102        BenchmarkResults {
103            name: "Collection Initialization".to_string(),
104            duration,
105            files_processed: 0,
106            bytes_processed: 0,
107            queries_per_second: None,
108            throughput_mbps: None,
109        }
110    }
111
112    fn bench_import(&mut self) -> BenchmarkResults {
113        println!("📁 Starting import benchmark...");
114        
115        let start = Instant::now();
116        
117        let _db = SiftDB::open(&self.collection_path).expect("Failed to open collection");
118        
119        let mut options = IngestOptions::default();
120        options.include_patterns = vec!["**/*.rs".to_string(), "**/*.md".to_string(), "**/*.toml".to_string(), "**/*.json".to_string()];
121        
122        let mut ingester = Ingester::new(self.collection_path.clone(), options);
123        
124        // Count source files and bytes before import
125        let (source_files, source_bytes) = self.count_source_files();
126        println!("  Source files found: {}", source_files);  
127        println!("  Source data: {:.2} MB", source_bytes as f64 / (1024.0 * 1024.0));
128        
129        let stats = ingester.ingest_from_fs(&self.source_path).expect("Failed to ingest");
130        
131        let duration = start.elapsed();
132        
133        // Calculate storage efficiency
134        let storage_bytes = self.calculate_total_bytes();
135        let compression_ratio = if source_bytes > 0 {
136            storage_bytes as f64 / source_bytes as f64
137        } else {
138            1.0
139        };
140        
141        println!("  ✅ Import completed in {:.2}s", duration.as_secs_f64());
142        println!("  📊 Files ingested: {} ({} skipped, {} errors)", 
143                 stats.ingested, stats.skipped, stats.errors);
144        println!("  💾 Storage size: {:.2} MB (ratio: {:.2}x)", 
145                 storage_bytes as f64 / (1024.0 * 1024.0), compression_ratio);
146        
147        BenchmarkResults {
148            name: "File Import".to_string(),
149            duration,
150            files_processed: stats.ingested,
151            bytes_processed: source_bytes,
152            queries_per_second: None,
153            throughput_mbps: Some(source_bytes as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()),
154        }
155    }
156
157    fn bench_searches(&self) -> Vec<BenchmarkResults> {
158        let mut results = Vec::new();
159        
160        let db = SiftDB::open(&self.collection_path).expect("Failed to open collection");
161        let snapshot = db.snapshot().expect("Failed to create snapshot");
162        
163        // Common search patterns
164        let queries = vec![
165            ("fn", "Function definitions"),
166            ("println", "Print statements"),
167            ("use", "Import statements"),
168            ("struct", "Struct definitions"),
169            ("impl", "Implementation blocks"),
170            ("pub", "Public items"),
171            ("let", "Variable declarations"),
172            ("match", "Pattern matching"),
173            ("async", "Async code"),
174            ("Result", "Result types"),
175        ];
176
177        for (query, description) in queries {
178            results.push(self.bench_single_search(&snapshot, query, description));
179        }
180
181        results
182    }
183
184    fn bench_single_search(&self, snapshot: &crate::Snapshot, query: &str, description: &str) -> BenchmarkResults {
185        let iterations = 10;
186        let mut total_duration = Duration::new(0, 0);
187        let mut total_hits = 0;
188
189        // Warm up
190        snapshot.find(query, None, Some(1000)).ok();
191
192        // Run benchmark iterations
193        for _ in 0..iterations {
194            let start = Instant::now();
195            if let Ok(hits) = snapshot.find(query, None, Some(1000)) {
196                total_hits = hits.len();
197            }
198            total_duration += start.elapsed();
199        }
200
201        let avg_duration = total_duration / iterations as u32;
202        let qps = iterations as f64 / total_duration.as_secs_f64();
203
204        BenchmarkResults {
205            name: format!("Search: '{}' ({})", query, description),
206            duration: avg_duration,
207            files_processed: total_hits as u64,
208            bytes_processed: 0,
209            queries_per_second: Some(qps),
210            throughput_mbps: None,
211        }
212    }
213
214    fn count_source_files(&self) -> (u64, u64) {
215        let mut file_count = 0;
216        let mut byte_count = 0;
217        
218        let walker = ignore::WalkBuilder::new(&self.source_path)
219            .hidden(false)
220            .git_ignore(true)
221            .build();
222            
223        for entry in walker {
224            if let Ok(entry) = entry {
225                let path = entry.path();
226                if path.is_file() {
227                    // Check if file matches our patterns
228                    let path_str = path.to_string_lossy();
229                    if path_str.ends_with(".rs") || path_str.ends_with(".md") || 
230                       path_str.ends_with(".toml") || path_str.ends_with(".json") {
231                        file_count += 1;
232                        if let Ok(metadata) = path.metadata() {
233                            byte_count += metadata.len();
234                        }
235                    }
236                }
237            }
238        }
239        
240        (file_count, byte_count)
241    }
242
243    fn calculate_total_bytes(&self) -> u64 {
244        let mut total = 0;
245        
246        if let Ok(entries) = fs::read_dir(&self.collection_path.join("store")) {
247            for entry in entries.flatten() {
248                if let Ok(metadata) = entry.metadata() {
249                    total += metadata.len();
250                }
251            }
252        }
253        
254        total
255    }
256    
257    fn save_results(&self, results: &[BenchmarkResults]) -> Result<(), Box<dyn std::error::Error>> {
258        let suite = BenchmarkSuite {
259            version: env!("CARGO_PKG_VERSION").to_string(),
260            timestamp: SystemTime::now()
261                .duration_since(SystemTime::UNIX_EPOCH)?
262                .as_secs().to_string(),
263            git_commit: self.get_git_commit(),
264            test_environment: TestEnvironment {
265                os: std::env::consts::OS.to_string(),
266                cpu: "unknown".to_string(), // Could use sysinfo crate later
267                memory: "unknown".to_string(),
268            },
269            benchmarks: results.to_vec(),
270        };
271        
272        let benchmarks_dir = self.collection_path.parent()
273            .unwrap_or(&self.collection_path)
274            .join("benchmarks/results");
275            
276        std::fs::create_dir_all(&benchmarks_dir)?;
277        
278        let filename = format!("benchmark-{}.json", suite.timestamp);
279        let filepath = benchmarks_dir.join(filename);
280        
281        let json = serde_json::to_string_pretty(&suite)?;
282        std::fs::write(filepath, json)?;
283        
284        Ok(())
285    }
286    
287    fn get_git_commit(&self) -> Option<String> {
288        std::process::Command::new("git")
289            .arg("rev-parse")
290            .arg("--short")
291            .arg("HEAD")
292            .current_dir(self.collection_path.parent().unwrap_or(&self.collection_path))
293            .output()
294            .ok()
295            .and_then(|output| {
296                if output.status.success() {
297                    String::from_utf8(output.stdout).ok()
298                        .map(|s| s.trim().to_string())
299                } else {
300                    None
301                }
302            })
303    }
304
305    fn print_summary(&self, results: &[BenchmarkResults]) {
306        println!("📊 Benchmark Summary");
307        println!("===================");
308        
309        for result in results {
310            result.print();
311        }
312
313        // Overall stats
314        let import_result = results.iter().find(|r| r.name.contains("Import"));
315        
316        if let Some(import) = import_result {
317            println!("🎯 Key Performance Metrics:");
318            println!("- Import Rate: {:.0} files/sec", import.files_processed as f64 / import.duration.as_secs_f64());
319            if let Some(throughput) = import.throughput_mbps {
320                println!("- Import Throughput: {:.1} MB/s", throughput);
321            }
322            
323            let search_results: Vec<_> = results.iter()
324                .filter(|r| r.name.contains("Search"))
325                .collect();
326            
327            if !search_results.is_empty() {
328                let avg_qps: f64 = search_results.iter()
329                    .filter_map(|r| r.queries_per_second)
330                    .sum::<f64>() / search_results.len() as f64;
331                println!("- Average Search Rate: {:.1} queries/sec", avg_qps);
332            }
333        }
334        
335        println!("✅ Benchmark completed successfully!");
336    }
337}
338
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// End-to-end run of the whole benchmark suite against the current
    /// directory, using a throwaway collection inside a temporary directory.
    #[test]
    #[ignore] // Run with: cargo test --release bench_test -- --ignored
    fn bench_test() {
        let temp_dir = TempDir::new().unwrap();
        let collection_path = temp_dir.path().join("test-bench.sift");

        let mut benchmark = SiftDBBenchmark::new(&collection_path, ".");
        // `run_all` is the suite entry point defined on `SiftDBBenchmark`.
        let results = benchmark.run_all();
        assert!(!results.is_empty(), "benchmark suite produced no results");
    }
}
353}