siftdb_core/
bench.rs

1use std::time::{Duration, Instant, SystemTime};
2use std::path::Path;
3use std::fs;
4use crate::{SiftDB, Snapshot};
5use crate::ingest::{Ingester, IngestOptions};
6use serde::{Serialize, Deserialize};
7
/// Outcome of a single benchmark stage (initialization, import, or one search query).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResults {
    /// Human-readable stage label, e.g. "File Import" or "Search: 'fn' (Function definitions)".
    pub name: String,
    /// Elapsed time for the stage; search stages store the average time per query.
    pub duration: Duration,
    /// Files ingested for the import stage; search stages store the hit count of the
    /// last successful query instead. Zero for initialization.
    pub files_processed: u64,
    /// Source bytes counted for the import stage; zero for the other stages.
    pub bytes_processed: u64,
    /// Queries per second; populated only by search stages.
    pub queries_per_second: Option<f64>,
    /// Throughput computed as bytes / 1024^2 / seconds; populated only by the import stage.
    pub throughput_mbps: Option<f64>,
}
17
18#[derive(Serialize, Deserialize)]
19pub struct BenchmarkSuite {
20    pub version: String,
21    pub timestamp: String,
22    pub git_commit: Option<String>,
23    pub test_environment: TestEnvironment,
24    pub benchmarks: Vec<BenchmarkResults>,
25}
26
27#[derive(Serialize, Deserialize)]
28pub struct TestEnvironment {
29    pub os: String,
30    pub cpu: String,
31    pub memory: String,
32}
33
34impl BenchmarkResults {
35    pub fn print(&self) {
36        println!("=== {} ===", self.name);
37        println!("Duration: {:.2}s", self.duration.as_secs_f64());
38        if self.files_processed > 0 {
39            println!("Files processed: {}", self.files_processed);
40            println!("Files/sec: {:.1}", self.files_processed as f64 / self.duration.as_secs_f64());
41        }
42        if self.bytes_processed > 0 {
43            let mb = self.bytes_processed as f64 / (1024.0 * 1024.0);
44            println!("Data processed: {:.2} MB", mb);
45            if let Some(throughput) = self.throughput_mbps {
46                println!("Throughput: {:.2} MB/s", throughput);
47            }
48        }
49        if let Some(qps) = self.queries_per_second {
50            println!("Queries/sec: {:.1}", qps);
51        }
52        println!();
53    }
54}
55
/// Benchmark driver: initializes a collection, ingests a source tree into it,
/// and times a fixed set of search queries against the result.
pub struct SiftDBBenchmark {
    /// Location of the collection that the benchmark initializes and opens.
    collection_path: std::path::PathBuf,
    /// Root of the directory tree that is ingested into the collection.
    source_path: std::path::PathBuf,
}
60
61impl SiftDBBenchmark {
62    pub fn new<P1: AsRef<Path>, P2: AsRef<Path>>(collection_path: P1, source_path: P2) -> Self {
63        Self {
64            collection_path: collection_path.as_ref().to_path_buf(),
65            source_path: source_path.as_ref().to_path_buf(),
66        }
67    }
68
69    pub fn run_all(&mut self) -> Vec<BenchmarkResults> {
70        println!("🚀 SiftDB Performance Benchmark");
71        println!("================================");
72        println!();
73        
74        let mut results = Vec::new();
75        
76        // Initialize collection
77        results.push(self.bench_init());
78        
79        // Import benchmark
80        results.push(self.bench_import());
81        
82        // Search benchmarks
83        results.extend(self.bench_searches());
84        
85        self.print_summary(&results);
86        
87        // Save results to file
88        if let Err(e) = self.save_results(&results) {
89            eprintln!("Warning: Failed to save benchmark results: {}", e);
90        }
91        
92        results
93    }
94
95    pub fn run_all_quiet(&mut self) -> Vec<BenchmarkResults> {
96        let mut results = Vec::new();
97        
98        // Initialize collection
99        results.push(self.bench_init_quiet());
100        
101        // Import benchmark
102        results.push(self.bench_import_quiet());
103        
104        // Search benchmarks
105        results.extend(self.bench_searches_quiet());
106        
107        results
108    }
109
110    fn bench_init_quiet(&self) -> BenchmarkResults {
111        let start = Instant::now();
112        SiftDB::init(&self.collection_path).expect("Failed to initialize collection");
113        let duration = start.elapsed();
114        
115        BenchmarkResults {
116            name: "Collection Initialization".to_string(),
117            duration,
118            files_processed: 0,
119            bytes_processed: 0,
120            queries_per_second: None,
121            throughput_mbps: None,
122        }
123    }
124
125    fn bench_import_quiet(&mut self) -> BenchmarkResults {
126        let start = Instant::now();
127        let _db = SiftDB::open(&self.collection_path).expect("Failed to open collection");
128        
129        let mut options = IngestOptions::default();
130        options.include_patterns = vec!["**/*.rs".to_string(), "**/*.md".to_string(), "**/*.toml".to_string(), "**/*.json".to_string()];
131        
132        let mut ingester = Ingester::new(self.collection_path.clone(), options);
133        let (source_files, source_bytes) = self.count_source_files();
134        let stats = ingester.ingest_from_fs(&self.source_path).expect("Failed to ingest");
135        let duration = start.elapsed();
136        
137        BenchmarkResults {
138            name: "File Import".to_string(),
139            duration,
140            files_processed: stats.ingested,
141            bytes_processed: source_bytes,
142            queries_per_second: None,
143            throughput_mbps: Some(source_bytes as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()),
144        }
145    }
146
147    fn bench_searches_quiet(&self) -> Vec<BenchmarkResults> {
148        let mut results = Vec::new();
149        let db = SiftDB::open(&self.collection_path).expect("Failed to open collection");
150        let mut snapshot = db.snapshot().expect("Failed to create snapshot");
151        
152        let queries = vec![
153            ("fn", "Function definitions"),
154            ("println", "Print statements"),
155            ("use", "Import statements"),
156            ("struct", "Struct definitions"),
157            ("impl", "Implementation blocks"),
158            ("pub", "Public items"),
159            ("let", "Variable declarations"),
160            ("match", "Pattern matching"),
161            ("async", "Async code"),
162            ("Result", "Result types"),
163        ];
164
165        for (query, description) in queries {
166            results.push(self.bench_single_search_quiet(&mut snapshot, query, description));
167        }
168
169        results
170    }
171
172    fn bench_single_search_quiet(&self, snapshot: &mut crate::Snapshot, query: &str, description: &str) -> BenchmarkResults {
173        let iterations = 10;
174        let mut total_duration = Duration::new(0, 0);
175        let mut total_hits = 0;
176
177        // Warm up
178        snapshot.find(query, None, Some(1000)).ok();
179
180        // Run benchmark iterations
181        for _ in 0..iterations {
182            let start = Instant::now();
183            if let Ok(hits) = snapshot.find(query, None, Some(1000)) {
184                total_hits = hits.len();
185            }
186            total_duration += start.elapsed();
187        }
188
189        let avg_duration = total_duration / iterations as u32;
190        let qps = if avg_duration.as_secs_f64() > 0.0 {
191            1.0 / avg_duration.as_secs_f64()
192        } else {
193            f64::INFINITY
194        };
195
196        BenchmarkResults {
197            name: format!("Search: '{}' ({})", query, description),
198            duration: avg_duration,
199            files_processed: total_hits as u64,
200            bytes_processed: 0,
201            queries_per_second: Some(qps),
202            throughput_mbps: None,
203        }
204    }
205
206    fn bench_init(&self) -> BenchmarkResults {
207        let start = Instant::now();
208        
209        SiftDB::init(&self.collection_path).expect("Failed to initialize collection");
210        
211        let duration = start.elapsed();
212        
213        BenchmarkResults {
214            name: "Collection Initialization".to_string(),
215            duration,
216            files_processed: 0,
217            bytes_processed: 0,
218            queries_per_second: None,
219            throughput_mbps: None,
220        }
221    }
222
223    fn bench_import(&mut self) -> BenchmarkResults {
224        println!("📁 Starting import benchmark...");
225        
226        let start = Instant::now();
227        
228        let _db = SiftDB::open(&self.collection_path).expect("Failed to open collection");
229        
230        let mut options = IngestOptions::default();
231        options.include_patterns = vec!["**/*.rs".to_string(), "**/*.md".to_string(), "**/*.toml".to_string(), "**/*.json".to_string()];
232        
233        let mut ingester = Ingester::new(self.collection_path.clone(), options);
234        
235        // Count source files and bytes before import
236        let (source_files, source_bytes) = self.count_source_files();
237        println!("  Source files found: {}", source_files);  
238        println!("  Source data: {:.2} MB", source_bytes as f64 / (1024.0 * 1024.0));
239        
240        let stats = ingester.ingest_from_fs(&self.source_path).expect("Failed to ingest");
241        
242        let duration = start.elapsed();
243        
244        // Calculate storage efficiency
245        let storage_bytes = self.calculate_total_bytes();
246        let compression_ratio = if source_bytes > 0 {
247            storage_bytes as f64 / source_bytes as f64
248        } else {
249            1.0
250        };
251        
252        println!("  ✅ Import completed in {:.2}s", duration.as_secs_f64());
253        println!("  📊 Files ingested: {} ({} skipped, {} errors)", 
254                 stats.ingested, stats.skipped, stats.errors);
255        println!("  💾 Storage size: {:.2} MB (ratio: {:.2}x)", 
256                 storage_bytes as f64 / (1024.0 * 1024.0), compression_ratio);
257        
258        BenchmarkResults {
259            name: "File Import".to_string(),
260            duration,
261            files_processed: stats.ingested,
262            bytes_processed: source_bytes,
263            queries_per_second: None,
264            throughput_mbps: Some(source_bytes as f64 / (1024.0 * 1024.0) / duration.as_secs_f64()),
265        }
266    }
267
268    fn bench_searches(&self) -> Vec<BenchmarkResults> {
269        let mut results = Vec::new();
270        
271        let db = SiftDB::open(&self.collection_path).expect("Failed to open collection");
272        let mut snapshot = db.snapshot().expect("Failed to create snapshot");
273        
274        // Common search patterns
275        let queries = vec![
276            ("fn", "Function definitions"),
277            ("println", "Print statements"),
278            ("use", "Import statements"),
279            ("struct", "Struct definitions"),
280            ("impl", "Implementation blocks"),
281            ("pub", "Public items"),
282            ("let", "Variable declarations"),
283            ("match", "Pattern matching"),
284            ("async", "Async code"),
285            ("Result", "Result types"),
286        ];
287
288        for (query, description) in queries {
289            results.push(self.bench_single_search(&mut snapshot, query, description));
290        }
291
292        results
293    }
294
295    fn bench_single_search(&self, snapshot: &mut crate::Snapshot, query: &str, description: &str) -> BenchmarkResults {
296        let iterations = 10;
297        let mut total_duration = Duration::new(0, 0);
298        let mut total_hits = 0;
299
300        // Warm up
301        snapshot.find(query, None, Some(1000)).ok();
302
303        // Run benchmark iterations
304        for _ in 0..iterations {
305            let start = Instant::now();
306            if let Ok(hits) = snapshot.find(query, None, Some(1000)) {
307                total_hits = hits.len();
308            }
309            total_duration += start.elapsed();
310        }
311
312        let avg_duration = total_duration / iterations as u32;
313        let qps = iterations as f64 / total_duration.as_secs_f64();
314
315        BenchmarkResults {
316            name: format!("Search: '{}' ({})", query, description),
317            duration: avg_duration,
318            files_processed: total_hits as u64,
319            bytes_processed: 0,
320            queries_per_second: Some(qps),
321            throughput_mbps: None,
322        }
323    }
324
325    fn count_source_files(&self) -> (u64, u64) {
326        let mut file_count = 0;
327        let mut byte_count = 0;
328        
329        let walker = ignore::WalkBuilder::new(&self.source_path)
330            .hidden(false)
331            .git_ignore(true)
332            .build();
333            
334        for entry in walker {
335            if let Ok(entry) = entry {
336                let path = entry.path();
337                if path.is_file() {
338                    // Check if file matches our patterns
339                    let path_str = path.to_string_lossy();
340                    if path_str.ends_with(".rs") || path_str.ends_with(".md") || 
341                       path_str.ends_with(".toml") || path_str.ends_with(".json") {
342                        file_count += 1;
343                        if let Ok(metadata) = path.metadata() {
344                            byte_count += metadata.len();
345                        }
346                    }
347                }
348            }
349        }
350        
351        (file_count, byte_count)
352    }
353
354    fn calculate_total_bytes(&self) -> u64 {
355        let mut total = 0;
356        
357        if let Ok(entries) = fs::read_dir(&self.collection_path.join("store")) {
358            for entry in entries.flatten() {
359                if let Ok(metadata) = entry.metadata() {
360                    total += metadata.len();
361                }
362            }
363        }
364        
365        total
366    }
367    
368    fn save_results(&self, results: &[BenchmarkResults]) -> Result<(), Box<dyn std::error::Error>> {
369        let suite = BenchmarkSuite {
370            version: env!("CARGO_PKG_VERSION").to_string(),
371            timestamp: SystemTime::now()
372                .duration_since(SystemTime::UNIX_EPOCH)?
373                .as_secs().to_string(),
374            git_commit: self.get_git_commit(),
375            test_environment: TestEnvironment {
376                os: std::env::consts::OS.to_string(),
377                cpu: "unknown".to_string(), // Could use sysinfo crate later
378                memory: "unknown".to_string(),
379            },
380            benchmarks: results.to_vec(),
381        };
382        
383        let benchmarks_dir = self.collection_path.parent()
384            .unwrap_or(&self.collection_path)
385            .join("benchmarks/results");
386            
387        std::fs::create_dir_all(&benchmarks_dir)?;
388        
389        let filename = format!("benchmark-{}.json", suite.timestamp);
390        let filepath = benchmarks_dir.join(filename);
391        
392        let json = serde_json::to_string_pretty(&suite)?;
393        std::fs::write(filepath, json)?;
394        
395        Ok(())
396    }
397    
398    fn get_git_commit(&self) -> Option<String> {
399        std::process::Command::new("git")
400            .arg("rev-parse")
401            .arg("--short")
402            .arg("HEAD")
403            .current_dir(self.collection_path.parent().unwrap_or(&self.collection_path))
404            .output()
405            .ok()
406            .and_then(|output| {
407                if output.status.success() {
408                    String::from_utf8(output.stdout).ok()
409                        .map(|s| s.trim().to_string())
410                } else {
411                    None
412                }
413            })
414    }
415
416    fn print_summary(&self, results: &[BenchmarkResults]) {
417        println!("📊 Benchmark Summary");
418        println!("===================");
419        
420        for result in results {
421            result.print();
422        }
423
424        // Overall stats
425        let import_result = results.iter().find(|r| r.name.contains("Import"));
426        
427        if let Some(import) = import_result {
428            println!("🎯 Key Performance Metrics:");
429            println!("- Import Rate: {:.0} files/sec", import.files_processed as f64 / import.duration.as_secs_f64());
430            if let Some(throughput) = import.throughput_mbps {
431                println!("- Import Throughput: {:.1} MB/s", throughput);
432            }
433            
434            let search_results: Vec<_> = results.iter()
435                .filter(|r| r.name.contains("Search"))
436                .collect();
437            
438            if !search_results.is_empty() {
439                let avg_qps: f64 = search_results.iter()
440                    .filter_map(|r| r.queries_per_second)
441                    .sum::<f64>() / search_results.len() as f64;
442                println!("- Average Search Rate: {:.1} queries/sec", avg_qps);
443            }
444        }
445        
446        println!("✅ Benchmark completed successfully!");
447    }
448}
449
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// End-to-end smoke test: benchmarks the current directory into a
    /// temporary collection and checks that every stage reported a result.
    #[test]
    #[ignore] // Run with: cargo test --release bench_test -- --ignored
    fn bench_test() {
        let temp_dir = TempDir::new().unwrap();
        let collection_path = temp_dir.path().join("test-bench.sift");

        let mut benchmark = SiftDBBenchmark::new(&collection_path, ".");
        let results = benchmark.run_all();

        // Stages are reported in execution order: init, import, then searches.
        assert_eq!(
            results.first().map(|r| r.name.as_str()),
            Some("Collection Initialization")
        );
        assert!(results.iter().any(|r| r.name == "File Import"));
        assert!(results.iter().any(|r| r.name.starts_with("Search: ")));
    }
}