scribe_scaling/
engine.rs

//! Main scaling engine that integrates all optimization components.
//!
//! This module provides the primary interface for the scaling system,
//! coordinating streaming, caching, parallel processing, adaptive configuration,
//! and signature extraction for optimal performance at scale.
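//!
//! # Example
//!
//! A minimal usage sketch (not compiled as a doc-test); the `scribe_scaling`
//! crate path, the single-parameter `ScalingResult` alias, and the presence of
//! a Tokio runtime are assumptions, and the repository path is illustrative:
//!
//! ```ignore
//! use std::path::Path;
//!
//! use scribe_scaling::engine::{ScalingConfig, ScalingEngine};
//!
//! async fn run() -> scribe_scaling::error::ScalingResult<()> {
//!     // Build an engine from the large-repository preset, then process a path.
//!     let mut engine = ScalingEngine::new(ScalingConfig::large_repository()).await?;
//!     let result = engine.process_repository(Path::new("./my-repo")).await?;
//!     println!("processed {} files in {:?}", result.total_files, result.processing_time);
//!     Ok(())
//! }
//! ```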

use std::path::Path;
use std::time::{Duration, Instant};

use serde::{Deserialize, Serialize};
use tracing::info;

use crate::adaptive::AdaptiveConfig;
use crate::caching::CacheConfig;
use crate::error::{ScalingError, ScalingResult};
use crate::memory::MemoryConfig;
use crate::metrics::{BenchmarkResult, ScalingMetrics};
use crate::parallel::ParallelConfig;
use crate::signatures::SignatureConfig;
use crate::streaming::{FileMetadata, StreamingConfig};

/// Complete scaling configuration combining all subsystems
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScalingConfig {
    pub streaming: StreamingConfig,
    pub caching: CacheConfig,
    pub parallel: ParallelConfig,
    pub adaptive: AdaptiveConfig,
    pub signatures: SignatureConfig,
    pub memory: MemoryConfig,

    /// Token budget for intelligent selection (`None` = unlimited)
    pub token_budget: Option<usize>,

    /// Enable intelligent file selection before processing
    pub enable_intelligent_selection: bool,

    /// Selection algorithm to use when intelligent selection is enabled
    pub selection_algorithm: Option<String>,

    /// Enable context positioning optimization (HEAD/MIDDLE/TAIL)
    pub enable_context_positioning: bool,

    /// Query for context positioning (affects HEAD section)
    pub positioning_query: Option<String>,
}

impl Default for ScalingConfig {
    fn default() -> Self {
        Self {
            streaming: StreamingConfig::default(),
            caching: CacheConfig::default(),
            parallel: ParallelConfig::default(),
            adaptive: AdaptiveConfig::default(),
            signatures: SignatureConfig::default(),
            memory: MemoryConfig::default(),
            token_budget: None,                  // Unlimited by default
            enable_intelligent_selection: false, // Off by default for backward compatibility
            selection_algorithm: None,           // Will use V5Integrated when enabled
            enable_context_positioning: false,   // Off by default for backward compatibility
            positioning_query: None,             // No query by default
        }
    }
}

impl ScalingConfig {
    /// Create configuration optimized for small repositories
    pub fn small_repository() -> Self {
        Self {
            streaming: StreamingConfig {
                enable_streaming: false,
                concurrency_limit: 2,
                memory_limit: 50 * 1024 * 1024, // 50MB
                selection_heap_size: 1000,
            },
            parallel: ParallelConfig {
                max_concurrent_tasks: 2,
                async_worker_count: 1,
                cpu_worker_count: 1,
                task_timeout: Duration::from_secs(10),
                enable_work_stealing: false,
            },
            token_budget: Some(8000), // Reasonable default for small repos
            enable_intelligent_selection: true,
            selection_algorithm: Some("v2_quotas".to_string()),
            ..Default::default()
        }
    }

    /// Create configuration optimized for large repositories
    pub fn large_repository() -> Self {
        Self {
            streaming: StreamingConfig {
                enable_streaming: true,
                concurrency_limit: 8,
                memory_limit: 500 * 1024 * 1024, // 500MB
                selection_heap_size: 10000,
            },
            parallel: ParallelConfig {
                max_concurrent_tasks: 16,
                async_worker_count: 8,
                cpu_worker_count: 8,
                task_timeout: Duration::from_secs(60),
                enable_work_stealing: true,
            },
            token_budget: Some(15000), // Larger budget for complex repositories
            enable_intelligent_selection: true,
            selection_algorithm: Some("v5_integrated".to_string()),
            ..Default::default()
        }
    }

    /// Create configuration with specific token budget
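    ///
    /// A minimal sketch of the budget-to-algorithm mapping (not compiled as a
    /// doc-test; the `scribe_scaling` crate path is an assumption):
    ///
    /// ```ignore
    /// use scribe_scaling::engine::ScalingConfig;
    ///
    /// // Budgets up to 2000 tokens map to "v2_quotas", larger budgets to "v5_integrated".
    /// let small = ScalingConfig::with_token_budget(1_500);
    /// assert_eq!(small.selection_algorithm.as_deref(), Some("v2_quotas"));
    ///
    /// let large = ScalingConfig::with_token_budget(20_000);
    /// assert_eq!(large.selection_algorithm.as_deref(), Some("v5_integrated"));
    /// ```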
    pub fn with_token_budget(token_budget: usize) -> Self {
        Self {
            token_budget: Some(token_budget),
            enable_intelligent_selection: true,
            selection_algorithm: Some(
                match token_budget {
                    0..=2000 => "v2_quotas",
                    _ => "v5_integrated",
                }
                .to_string(),
            ),
            ..Self::default()
        }
    }
}

/// Repository processing results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessingResult {
    /// List of processed files with metadata
    pub files: Vec<FileMetadata>,

    /// Total number of files processed
    pub total_files: usize,

    /// Processing time
    pub processing_time: Duration,

    /// Peak memory usage in bytes
    pub memory_peak: usize,

    /// Cache hit count
    pub cache_hits: u64,

    /// Cache miss count
    pub cache_misses: u64,

    /// Additional metrics
    pub metrics: ScalingMetrics,
}

/// Main scaling engine
pub struct ScalingEngine {
    config: ScalingConfig,
    started_at: Option<Instant>,
}

impl ScalingEngine {
    /// Create a new scaling engine with the given configuration
    pub async fn new(config: ScalingConfig) -> ScalingResult<Self> {
        info!(
            "Initializing scaling engine with configuration: {:?}",
            config
        );

        Ok(Self {
            config,
            started_at: None,
        })
    }

    /// Create a scaling engine with default configuration
    pub async fn with_defaults() -> ScalingResult<Self> {
        Self::new(ScalingConfig::default()).await
    }

    /// Create a scaling engine synchronously with the given configuration
    pub fn with_config(config: ScalingConfig) -> Self {
        Self {
            config,
            started_at: None,
        }
    }

    /// Process a repository with scaling optimizations
    pub async fn process_repository(&mut self, path: &Path) -> ScalingResult<ProcessingResult> {
        let start_time = Instant::now();
        self.started_at = Some(start_time);

        info!("Processing repository: {:?}", path);

        if !path.exists() {
            return Err(ScalingError::path("Repository path does not exist", path));
        }

        if !path.is_dir() {
            return Err(ScalingError::path(
                "Repository path is not a directory",
                path,
            ));
        }

        // Check if intelligent selection is enabled
        if self.config.enable_intelligent_selection && self.config.token_budget.is_some() {
            return self.process_with_intelligent_selection(path).await;
        }

        // Use optimized streaming file discovery
        info!("Using optimized streaming file discovery for basic processing");

        // Create a streaming selector even for basic processing to avoid memory issues
        let streaming_config = crate::streaming::StreamingConfig {
            enable_streaming: true,
            concurrency_limit: self.config.parallel.max_concurrent_tasks,
            memory_limit: self.config.streaming.memory_limit,
            selection_heap_size: 50000, // Large heap for full discovery
        };

        let streaming_selector = crate::streaming::StreamingSelector::new(streaming_config);

        // For basic processing, we want all files (within reason), so use very high limits
        let target_count = 50000; // Upper bound to prevent unbounded memory growth
        let token_budget = 1_000_000; // Very high budget to include most files

        // Simple scoring that doesn't filter much
        let score_fn = |_file: &FileMetadata| -> f64 { 1.0 };
        // Rough token estimate: ~4 bytes per token
        let token_fn = |file: &FileMetadata| -> usize { (file.size / 4) as usize };

        let scored_files = streaming_selector
            .select_files_streaming(path, target_count, token_budget, score_fn, token_fn)
            .await?;

        let files: Vec<FileMetadata> = scored_files
            .into_iter()
            .map(|scored| scored.metadata)
            .collect();

        let total_size: u64 = files.iter().map(|f| f.size).sum();

        let processing_time = start_time.elapsed();

        info!(
            "Processed {} files ({} bytes) in {:?}",
            files.len(),
            total_size,
            processing_time
        );

        Ok(ProcessingResult {
            total_files: files.len(),
            processing_time,
            memory_peak: estimate_memory_usage(files.len()),
            cache_hits: 0,                    // Placeholder
            cache_misses: files.len() as u64, // All cache misses for now
            metrics: ScalingMetrics {
                files_processed: files.len() as u64,
                total_processing_time: processing_time,
                memory_peak: estimate_memory_usage(files.len()),
                cache_hits: 0,
                cache_misses: files.len() as u64,
                parallel_efficiency: 1.0, // No parallelism yet
                streaming_overhead: Duration::from_millis(0),
            },
            files,
        })
    }

    /// Run performance benchmarks
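    ///
    /// A minimal sketch of consuming the benchmark results (not compiled as a
    /// doc-test; the `scribe_scaling` crate path is an assumption):
    ///
    /// ```ignore
    /// use scribe_scaling::engine::ScalingEngine;
    ///
    /// async fn bench(path: &std::path::Path) {
    ///     let mut engine = ScalingEngine::with_defaults().await.unwrap();
    ///     // Three iterations; each result carries its duration and throughput.
    ///     for result in engine.benchmark(path, 3).await.unwrap() {
    ///         println!("{:?} at {:.1} files/s", result.duration, result.throughput);
    ///     }
    /// }
    /// ```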
    pub async fn benchmark(
        &mut self,
        path: &Path,
        iterations: usize,
    ) -> ScalingResult<Vec<BenchmarkResult>> {
        let mut results = Vec::with_capacity(iterations);

        for i in 0..iterations {
            info!("Running benchmark iteration {}/{}", i + 1, iterations);

            let start = Instant::now();
            let result = self.process_repository(path).await?;
            let duration = start.elapsed();

            let benchmark_result = BenchmarkResult::new(
                format!("iteration_{}", i + 1),
                duration,
                result.memory_peak,
                result.total_files as f64 / duration.as_secs_f64(),
                1.0, // 100% success rate
            );

            results.push(benchmark_result);
        }

        Ok(results)
    }

    /// Get current configuration
    pub fn config(&self) -> &ScalingConfig {
        &self.config
    }

    /// Process repository with intelligent selection enabled
    async fn process_with_intelligent_selection(
        &self,
        path: &Path,
    ) -> ScalingResult<ProcessingResult> {
        info!("Processing repository with intelligent selection enabled");

        // Create ScalingSelector with the configured token budget
        let token_budget = self.config.token_budget.unwrap_or(8000);
        let mut selector = crate::selector::ScalingSelector::with_token_budget(token_budget);

        // Execute intelligent selection and processing
        let selection_result = selector.select_and_process(path).await?;

        info!(
            "Intelligent selection completed: {} files selected, {:.1}% token utilization",
            selection_result.selected_files.len(),
            selection_result.token_utilization * 100.0
        );

        // Return the processing result from the selector
        Ok(selection_result.processing_result)
    }

    /// Check if engine is ready for processing
    pub fn is_ready(&self) -> bool {
        true // Always ready in this simple implementation
    }
}

/// Simple language detection based on file extension
fn detect_language(path: &Path) -> String {
    match path.extension().and_then(|s| s.to_str()) {
        Some("rs") => "Rust".to_string(),
        Some("py") => "Python".to_string(),
        Some("js") => "JavaScript".to_string(),
        Some("ts") => "TypeScript".to_string(),
        Some("go") => "Go".to_string(),
        Some("java") => "Java".to_string(),
        Some("cpp" | "cc" | "cxx") => "C++".to_string(),
        Some("c") => "C".to_string(),
        Some("h") => "Header".to_string(),
        Some("md") => "Markdown".to_string(),
        Some("json") => "JSON".to_string(),
        Some("yaml" | "yml") => "YAML".to_string(),
        Some("toml") => "TOML".to_string(),
        _ => "Unknown".to_string(),
    }
}

/// Simple file type classification
fn classify_file_type(path: &Path) -> String {
    match path.extension().and_then(|s| s.to_str()) {
        Some("rs" | "py" | "js" | "ts" | "go" | "java" | "cpp" | "cc" | "cxx" | "c") => {
            "Source".to_string()
        }
        Some("h" | "hpp" | "hxx") => "Header".to_string(),
        Some("md" | "txt" | "rst") => "Documentation".to_string(),
        Some("json" | "yaml" | "yml" | "toml" | "ini" | "cfg") => "Configuration".to_string(),
        Some("png" | "jpg" | "jpeg" | "gif" | "svg") => "Image".to_string(),
        _ => "Other".to_string(),
    }
}

/// Estimate memory usage based on file count
fn estimate_memory_usage(file_count: usize) -> usize {
    // Rough estimate: ~1KB per file metadata
    file_count * 1024
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    #[tokio::test]
    async fn test_scaling_engine_creation() {
        let engine = ScalingEngine::with_defaults().await;
        assert!(engine.is_ok());
    }

    #[tokio::test]
    async fn test_repository_processing() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test files
        fs::write(repo_path.join("main.rs"), "fn main() {}").unwrap();
        fs::write(repo_path.join("lib.rs"), "pub fn test() {}").unwrap();

        let mut engine = ScalingEngine::with_defaults().await.unwrap();
        let result = engine.process_repository(repo_path).await.unwrap();

        assert!(result.total_files >= 2);
        assert!(result.processing_time.as_nanos() > 0);
        assert!(result.memory_peak > 0);
    }

    #[tokio::test]
    async fn test_configuration_presets() {
        let small_config = ScalingConfig::small_repository();
        assert!(!small_config.streaming.enable_streaming);
        assert_eq!(small_config.parallel.max_concurrent_tasks, 2);

        let large_config = ScalingConfig::large_repository();
        assert!(large_config.streaming.enable_streaming);
        assert!(large_config.parallel.max_concurrent_tasks >= 16);
    }

    #[tokio::test]
    async fn test_error_handling() {
        let mut engine = ScalingEngine::with_defaults().await.unwrap();
        let non_existent_path = Path::new("/non/existent/path");

        let result = engine.process_repository(non_existent_path).await;
        assert!(result.is_err());
    }
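
    // Additional check for the synchronous constructor path; a minimal sketch
    // that only exercises items defined in this module.
    #[test]
    fn test_with_config_constructor() {
        let engine = ScalingEngine::with_config(ScalingConfig::with_token_budget(4_000));
        assert!(engine.is_ready());
        assert_eq!(engine.config().token_budget, Some(4_000));
    }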

    #[tokio::test]
    async fn test_benchmarking() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test file
        fs::write(repo_path.join("test.rs"), "fn test() {}").unwrap();

        let mut engine = ScalingEngine::with_defaults().await.unwrap();
        let results = engine.benchmark(repo_path, 3).await.unwrap();

        assert_eq!(results.len(), 3);
        for result in results {
            assert!(result.duration.as_nanos() > 0);
            assert!(result.throughput > 0.0);
            assert_eq!(result.success_rate, 1.0);
        }
    }

    #[test]
    fn test_language_detection() {
        assert_eq!(detect_language(Path::new("test.rs")), "Rust");
        assert_eq!(detect_language(Path::new("test.py")), "Python");
        assert_eq!(detect_language(Path::new("test.unknown")), "Unknown");
    }

    #[test]
    fn test_file_type_classification() {
        assert_eq!(classify_file_type(Path::new("main.rs")), "Source");
        assert_eq!(classify_file_type(Path::new("README.md")), "Documentation");
        assert_eq!(
            classify_file_type(Path::new("config.json")),
            "Configuration"
        );
    }
}
457}