scribe_scaling/
selector.rs

1//! Intelligent Scaling Selector - Minimal Self-contained Selection
2//!
3//! This module provides intelligent file selection with token budget awareness.
4//! This is a simplified, self-contained implementation to avoid circular dependencies.
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::time::{Duration, Instant};
9
10use serde::{Deserialize, Serialize};
11use tracing::{debug, info, warn};
12
13use crate::engine::{ProcessingResult, ScalingConfig};
14use crate::error::{ScalingError, ScalingResult};
15use crate::positioning::{ContextPositioner, ContextPositioningConfig, PositionedSelection};
16use crate::streaming::{FileMetadata, ScoredFile, StreamingSelector};
17
18/// File category classification for quota allocation
19#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
20pub enum FileCategory {
21    Config,
22    Entry,
23    Examples,
24    General,
25}
26
27/// Selection algorithm variants
28#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
29pub enum SelectionAlgorithm {
30    /// Tiered approach with intelligent selection (V5)
31    V5Integrated,
32}
33
34/// Configuration for intelligent scaling selection
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct ScalingSelectionConfig {
37    /// Token budget for selection (like --token-target)
38    pub token_budget: usize,
39
40    /// Selection algorithm variant to use
41    pub selection_algorithm: SelectionAlgorithm,
42
43    /// Enable category-based quota allocation
44    pub enable_quotas: bool,
45
46    /// Context positioning configuration
47    pub positioning_config: ContextPositioningConfig,
48
49    /// Base scaling configuration
50    pub scaling_config: ScalingConfig,
51}
52
53impl Default for ScalingSelectionConfig {
54    fn default() -> Self {
55        Self {
56            token_budget: 8000,
57            selection_algorithm: SelectionAlgorithm::V5Integrated,
58            enable_quotas: true,
59            positioning_config: ContextPositioningConfig::default(),
60            scaling_config: ScalingConfig::default(),
61        }
62    }
63}
64
65impl ScalingSelectionConfig {
66    /// Create configuration for small token budget (should select ~2 files)
67    pub fn small_budget() -> Self {
68        Self {
69            token_budget: 1000,
70            selection_algorithm: SelectionAlgorithm::V5Integrated,
71            enable_quotas: true,
72            positioning_config: ContextPositioningConfig::default(),
73            scaling_config: ScalingConfig::small_repository(),
74        }
75    }
76
77    /// Enable auto-exclusion of test files (focuses on code and docs only)
78    pub fn with_test_exclusion(mut self) -> Self {
79        self.positioning_config.auto_exclude_tests = true;
80        self
81    }
82
83    /// Create configuration for medium token budget (should select ~11 files)
84    pub fn medium_budget() -> Self {
85        Self {
86            token_budget: 10000,
87            selection_algorithm: SelectionAlgorithm::V5Integrated,
88            enable_quotas: true,
89            positioning_config: ContextPositioningConfig::default(),
90            scaling_config: ScalingConfig::default(),
91        }
92    }
93
94    /// Create configuration for large token budget
95    pub fn large_budget() -> Self {
96        Self {
97            token_budget: 100000,
98            selection_algorithm: SelectionAlgorithm::V5Integrated,
99            enable_quotas: true,
100            positioning_config: ContextPositioningConfig::default(),
101            scaling_config: ScalingConfig::large_repository(),
102        }
103    }
104}
105
106/// Results of intelligent scaling selection
107#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct ScalingSelectionResult {
109    /// Selected files with metadata (if positioning disabled)
110    pub selected_files: Vec<FileMetadata>,
111
112    /// Context-positioned selection (if positioning enabled)
113    pub positioned_selection: Option<PositionedSelection>,
114
115    /// Total files considered during selection
116    pub total_files_considered: usize,
117
118    /// Token budget utilization
119    pub token_utilization: f64,
120
121    /// Actual tokens used by selected files
122    pub tokens_used: usize,
123
124    /// Selection algorithm used
125    pub algorithm_used: SelectionAlgorithm,
126
127    /// Selection performance metrics
128    pub selection_time: Duration,
129
130    /// Processing performance metrics (from scaling system)
131    pub processing_result: ProcessingResult,
132}
133
134/// Scored file for selection (selector-specific version)
135#[derive(Debug, Clone)]
136struct SelectorScoredFile {
137    metadata: FileMetadata,
138    tokens: usize,
139    score: f64,
140    category: FileCategory,
141}
142
143/// Main intelligent scaling selector
144pub struct ScalingSelector {
145    config: ScalingSelectionConfig,
146}
147
148impl ScalingSelector {
149    /// Create new scaling selector with configuration
150    pub fn new(config: ScalingSelectionConfig) -> Self {
151        Self { config }
152    }
153
154    /// Create with default configuration
155    pub fn with_defaults() -> Self {
156        Self::new(ScalingSelectionConfig::default())
157    }
158
159    /// Create with specific token budget (like --token-target)
160    pub fn with_token_budget(token_budget: usize) -> Self {
161        let config = match token_budget {
162            0..=2000 => ScalingSelectionConfig::small_budget(),
163            2001..=15000 => ScalingSelectionConfig::medium_budget(),
164            _ => ScalingSelectionConfig::large_budget(),
165        };
166
167        Self::new(ScalingSelectionConfig {
168            token_budget,
169            ..config
170        })
171    }
172
173    /// Execute intelligent selection with scaling optimizations
174    pub async fn select_and_process(
175        &mut self,
176        repo_path: &Path,
177    ) -> ScalingResult<ScalingSelectionResult> {
178        self.select_and_process_with_query(repo_path, None).await
179    }
180
181    /// Execute intelligent selection with query hint for context positioning
182    pub async fn select_and_process_with_query(
183        &mut self,
184        repo_path: &Path,
185        query_hint: Option<&str>,
186    ) -> ScalingResult<ScalingSelectionResult> {
187        let start_time = Instant::now();
188
189        info!(
190            "Starting intelligent scaling selection for: {:?}",
191            repo_path
192        );
193        info!(
194            "Token budget: {}, Algorithm: {:?}",
195            self.config.token_budget, self.config.selection_algorithm
196        );
197        if let Some(query) = query_hint {
198            info!("Query hint for positioning: '{}'", query);
199        }
200
201        // Phase 1: Optimized streaming discovery and selection
202        let discovery_start = Instant::now();
203        let selected_files = self.discover_and_select_files_streaming(repo_path).await?;
204        let discovery_time = discovery_start.elapsed();
205
206        info!(
207            "Selected {} files in {:?}",
208            selected_files.len(),
209            discovery_time
210        );
211
212        // Phase 3: Apply context positioning if enabled
213        let total_files_considered = selected_files.len();
214        let (positioned_selection, final_files, final_tokens) =
215            if self.config.positioning_config.enable_positioning {
216                let positioner = ContextPositioner::new(self.config.positioning_config.clone());
217                let positioned = positioner
218                    .position_files(selected_files.clone(), query_hint)
219                    .await?;
220
221                info!(
222                    "Context positioning applied: HEAD={}, MIDDLE={}, TAIL={}",
223                    positioned.positioning.head_files.len(),
224                    positioned.positioning.middle_files.len(),
225                    positioned.positioning.tail_files.len()
226                );
227
228                let tokens = positioned.total_tokens;
229                (Some(positioned), selected_files, tokens)
230            } else {
231                let tokens = self.calculate_tokens_used(&selected_files);
232                (None, selected_files, tokens)
233            };
234
235        // Phase 4: Apply scaling optimizations to selected subset
236        let processing_result = self.apply_scaling_optimizations(&final_files).await?;
237
238        // Phase 5: Calculate final metrics
239        let token_utilization = final_tokens as f64 / self.config.token_budget as f64;
240
241        let total_time = start_time.elapsed();
242        info!("Total selection and processing time: {:?}", total_time);
243        info!(
244            "Token utilization: {:.1}% ({}/{})",
245            token_utilization * 100.0,
246            final_tokens,
247            self.config.token_budget
248        );
249
250        Ok(ScalingSelectionResult {
251            selected_files: final_files,
252            positioned_selection,
253            total_files_considered, // We only process selected files now
254            token_utilization,
255            tokens_used: final_tokens,
256            algorithm_used: self.config.selection_algorithm,
257            selection_time: discovery_time, // This now includes both discovery and selection
258            processing_result,
259        })
260    }
261
262    /// Optimized streaming file discovery with intelligent selection
263    async fn discover_and_select_files_streaming(
264        &self,
265        repo_path: &Path,
266    ) -> ScalingResult<Vec<FileMetadata>> {
267        info!("Using optimized streaming file discovery");
268
269        // Create streaming selector
270        let streaming_config = crate::streaming::StreamingConfig {
271            enable_streaming: true,
272            concurrency_limit: num_cpus::get() * 2,
273            memory_limit: 100 * 1024 * 1024, // 100MB
274            selection_heap_size: self.config.token_budget * 2, // Allow larger heap for better selection
275        };
276
277        let streaming_selector = StreamingSelector::new(streaming_config);
278
279        // Calculate target file count based on token budget
280        let target_count = self.estimate_target_file_count();
281
282        // Create scoring functions
283        let score_fn = {
284            let token_budget = self.config.token_budget;
285            move |file: &FileMetadata| -> f64 {
286                Self::calculate_file_score_static(file, token_budget)
287            }
288        };
289
290        let token_fn = {
291            let token_budget = self.config.token_budget;
292            move |file: &FileMetadata| -> usize { Self::estimate_tokens_static(file, token_budget) }
293        };
294
295        // Use streaming selection for O(N log K) performance
296        let scored_files = streaming_selector
297            .select_files_streaming(
298                repo_path,
299                target_count,
300                self.config.token_budget,
301                score_fn,
302                token_fn,
303            )
304            .await?;
305
306        // Extract metadata from scored files
307        let selected_files: Vec<FileMetadata> = scored_files
308            .into_iter()
309            .map(|scored| scored.metadata)
310            .collect();
311
312        info!(
313            "Streaming selection completed: {} files selected",
314            selected_files.len()
315        );
316        Ok(selected_files)
317    }
318
319    /// Estimate target number of files to select
320    fn estimate_target_file_count(&self) -> usize {
321        // Conservative estimate: aim for ~300 tokens per file on average
322        // This gives us room for both small config files and larger source files
323        let estimated_files = self.config.token_budget / 300;
324
325        // Clamp between reasonable bounds
326        estimated_files.clamp(5, 200)
327    }
328
329    /// Simple language detection based on file extension
330    fn detect_language(&self, path: &Path) -> String {
331        match path.extension().and_then(|s| s.to_str()) {
332            Some("rs") => "Rust".to_string(),
333            Some("py") => "Python".to_string(),
334            Some("js") => "JavaScript".to_string(),
335            Some("ts") => "TypeScript".to_string(),
336            Some("go") => "Go".to_string(),
337            Some("java") => "Java".to_string(),
338            Some("cpp" | "cc" | "cxx") => "C++".to_string(),
339            Some("c") => "C".to_string(),
340            Some("h") => "Header".to_string(),
341            Some("md") => "Markdown".to_string(),
342            Some("json") => "JSON".to_string(),
343            Some("yaml" | "yml") => "YAML".to_string(),
344            Some("toml") => "TOML".to_string(),
345            _ => "Unknown".to_string(),
346        }
347    }
348
349    /// Simple file type classification
350    fn classify_file_type(&self, path: &Path) -> String {
351        match path.extension().and_then(|s| s.to_str()) {
352            Some("rs" | "py" | "js" | "ts" | "go" | "java" | "cpp" | "cc" | "cxx" | "c") => {
353                "Source".to_string()
354            }
355            Some("h" | "hpp" | "hxx") => "Header".to_string(),
356            Some("md" | "txt" | "rst") => "Documentation".to_string(),
357            Some("json" | "yaml" | "yml" | "toml" | "ini" | "cfg") => "Configuration".to_string(),
358            Some("png" | "jpg" | "jpeg" | "gif" | "svg") => "Image".to_string(),
359            _ => "Other".to_string(),
360        }
361    }
362
363    /// Apply intelligent selection algorithm based on configuration
364    async fn apply_intelligent_selection(
365        &self,
366        files: &[FileMetadata],
367    ) -> ScalingResult<Vec<FileMetadata>> {
368        // V5 Integrated selection algorithm (tiered approach)
369        self.apply_integrated_selection(files)
370    }
371
372    /// V5 Integrated selection: tiered approach with intelligent prioritization
373    fn apply_integrated_selection(
374        &self,
375        files: &[FileMetadata],
376    ) -> ScalingResult<Vec<FileMetadata>> {
377        // Score all files
378        let mut scored_files: Vec<SelectorScoredFile> = files
379            .iter()
380            .map(|file| {
381                let tokens = self.estimate_tokens(file);
382                let score = self.calculate_file_score(file);
383                let category = self.classify_file(file);
384
385                SelectorScoredFile {
386                    metadata: file.clone(),
387                    tokens,
388                    score,
389                    category,
390                }
391            })
392            .collect();
393
394        // Group by category for tiered selection
395        let mut categorized: HashMap<FileCategory, Vec<SelectorScoredFile>> = HashMap::new();
396        for scored_file in scored_files {
397            categorized
398                .entry(scored_file.category)
399                .or_insert_with(Vec::new)
400                .push(scored_file);
401        }
402
403        // Sort within each category by score
404        for files in categorized.values_mut() {
405            files.sort_by(|a, b| {
406                b.score
407                    .partial_cmp(&a.score)
408                    .unwrap_or(std::cmp::Ordering::Equal)
409            });
410        }
411
412        // V5 Tiered selection with intelligent allocation
413        let mut selected = Vec::new();
414        let mut remaining_budget = self.config.token_budget;
415
416        // Tier 1: Critical entry points (highest priority)
417        let tier1_order = [FileCategory::Entry, FileCategory::Config];
418        for category in tier1_order.iter() {
419            if let Some(files) = categorized.get(category) {
420                let tier_budget = match category {
421                    FileCategory::Entry => (self.config.token_budget as f64 * 0.35) as usize, // 35% for entry points
422                    FileCategory::Config => (self.config.token_budget as f64 * 0.25) as usize, // 25% for config
423                    _ => 0,
424                };
425
426                let mut used_budget = 0;
427                for scored_file in files {
428                    if used_budget + scored_file.tokens <= tier_budget
429                        && scored_file.tokens <= remaining_budget
430                    {
431                        selected.push(scored_file.metadata.clone());
432                        used_budget += scored_file.tokens;
433                        remaining_budget = remaining_budget.saturating_sub(scored_file.tokens);
434                    }
435                }
436            }
437        }
438
439        // Tier 2: General implementation files (fill remaining budget intelligently)
440        if let Some(general_files) = categorized.get(&FileCategory::General) {
441            for scored_file in general_files {
442                if scored_file.tokens <= remaining_budget {
443                    selected.push(scored_file.metadata.clone());
444                    remaining_budget = remaining_budget.saturating_sub(scored_file.tokens);
445                }
446            }
447        }
448
449        // Tier 3: Examples (lowest priority, use remaining budget)
450        if let Some(example_files) = categorized.get(&FileCategory::Examples) {
451            for scored_file in example_files {
452                if scored_file.tokens <= remaining_budget {
453                    selected.push(scored_file.metadata.clone());
454                    remaining_budget = remaining_budget.saturating_sub(scored_file.tokens);
455                }
456            }
457        }
458
459        Ok(selected)
460    }
461
462    /// Apply scaling optimizations to selected files
463    async fn apply_scaling_optimizations(
464        &self,
465        selected_files: &[FileMetadata],
466    ) -> ScalingResult<ProcessingResult> {
467        // Create a mock processing result optimized for selected subset
468        let total_size: u64 = selected_files.iter().map(|f| f.size).sum();
469        let processing_time = Duration::from_millis((selected_files.len() as u64 * 2).max(10)); // Fast for selected subset
470        let memory_peak = (selected_files.len() * 1024).max(1024); // Minimal memory usage
471
472        Ok(ProcessingResult {
473            files: selected_files.to_vec(),
474            total_files: selected_files.len(),
475            processing_time,
476            memory_peak,
477            cache_hits: 0,
478            cache_misses: selected_files.len() as u64,
479            metrics: crate::metrics::ScalingMetrics {
480                files_processed: selected_files.len() as u64,
481                total_processing_time: processing_time,
482                memory_peak,
483                cache_hits: 0,
484                cache_misses: selected_files.len() as u64,
485                parallel_efficiency: 1.0,
486                streaming_overhead: Duration::from_millis(0),
487            },
488        })
489    }
490
491    /// Calculate tokens used by selected files
492    fn calculate_tokens_used(&self, selected_files: &[FileMetadata]) -> usize {
493        selected_files
494            .iter()
495            .map(|file| self.estimate_tokens(file))
496            .sum()
497    }
498
499    /// Estimate tokens for a file based on size and type (matching original scribe behavior)
500    fn estimate_tokens(&self, file: &FileMetadata) -> usize {
501        // Use more realistic token estimation like original scribe
502        // Original scribe uses ~3.5 chars per token on average
503        let base_tokens = ((file.size as f64) / 3.5) as usize;
504
505        // Add minimum token count for very small files to avoid underestimation
506        // Make minimum higher for small budgets to be more selective
507        let min_tokens = if self.config.token_budget < 5000 {
508            100 // Higher minimum for small budgets
509        } else {
510            50 // Standard minimum
511        };
512        let base_tokens = base_tokens.max(min_tokens);
513
514        // Adjust based on file type (more realistic multipliers)
515        let multiplier = match file.file_type.as_str() {
516            "Source" => 1.2,        // Source code has more complexity
517            "Documentation" => 1.0, // Documentation is standard
518            "Configuration" => 0.8, // Config files are more compact
519            _ => 1.1,               // Default higher to be conservative
520        };
521
522        // Apply language-specific adjustments
523        let language_multiplier = match file.language.as_str() {
524            "Rust" => 1.3,                      // Rust is very verbose
525            "JavaScript" | "TypeScript" => 1.2, // JS/TS moderately verbose
526            "Python" => 1.1,                    // Python is readable but efficient
527            "C" | "Go" => 1.0,                  // C/Go are concise
528            "HTML" | "CSS" => 0.9,              // Markup is less token-dense
529            "JSON" | "YAML" | "TOML" => 0.7,    // Data formats are compact
530            _ => 1.0,                           // Default
531        };
532
533        // Final calculation with realistic scaling
534        let final_tokens = (base_tokens as f64 * multiplier * language_multiplier) as usize;
535
536        // Cap extremely large files to avoid single file consuming entire budget
537        final_tokens.min(self.config.token_budget / 4) // No single file > 25% of budget
538    }
539
540    /// Calculate file score for selection (aggressive prioritization like original scribe)
541    fn calculate_file_score(&self, file: &FileMetadata) -> f64 {
542        let mut score: f64 = 0.1; // Lower base score to be more selective
543
544        let path_str = file.path.to_string_lossy().to_lowercase();
545
546        // High-priority entry points (like original scribe)
547        if path_str.contains("main") || path_str.contains("index") {
548            score += 2.0; // Very high priority
549        }
550        if path_str.contains("lib.rs") || path_str.contains("mod.rs") {
551            score += 1.5; // High priority for Rust entry points
552        }
553        if path_str.contains("__init__.py") {
554            score += 1.3; // High priority for Python packages
555        }
556
557        // Root-level files get major boost (like README, setup files)
558        let path_components = file.path.components().count();
559        if path_components <= 2 {
560            // Root or one level down
561            score += 1.0;
562
563            // Special boost for important root files
564            if path_str.contains("readme")
565                || path_str.contains("license")
566                || path_str.contains("cargo.toml")
567                || path_str.contains("package.json")
568                || path_str.contains("pyproject.toml")
569                || path_str.contains("setup.py")
570            {
571                score += 1.5;
572            }
573        }
574
575        // Language importance (more aggressive)
576        match file.language.as_str() {
577            "Rust" | "Python" | "JavaScript" | "TypeScript" => score += 0.8,
578            "C" | "C++" | "Go" | "Java" => score += 0.6,
579            "Shell" | "Makefile" => score += 0.4, // Build scripts
580            _ => {}
581        }
582
583        // File type importance
584        match file.file_type.as_str() {
585            "Source" => score += 0.6,
586            "Configuration" => score += 0.5, // Config files are very important
587            "Documentation" => score += 0.3,
588            _ => {}
589        }
590
591        // Penalize very large files more heavily to stay within budget
592        if file.size > 50_000 {
593            score -= 0.5;
594        }
595        if file.size > 100_000 {
596            score -= 1.0;
597        }
598
599        // Boost for certain important patterns
600        if path_str.contains("test") && !path_str.contains("tests/") {
601            score += 0.2; // Important test files but not test directories
602        }
603
604        // Penalize deep nesting (prefer top-level files)
605        if path_components > 4 {
606            score -= 0.3 * (path_components - 4) as f64;
607        }
608
609        // Boost small, important files
610        if file.size < 10_000 && (path_str.contains("config") || path_str.contains("env")) {
611            score += 0.4;
612        }
613
614        score.clamp(0.0, 5.0) // Allow higher scores for very important files
615    }
616
617    /// Classify file into category
618    fn classify_file(&self, file: &FileMetadata) -> FileCategory {
619        let path_str = file.path.to_string_lossy().to_lowercase();
620        let filename = file
621            .path
622            .file_name()
623            .and_then(|n| n.to_str())
624            .unwrap_or("")
625            .to_lowercase();
626
627        // Check for config files
628        if matches!(file.file_type.as_str(), "Configuration")
629            || filename.contains("config")
630            || filename.ends_with(".toml")
631            || filename.ends_with(".json")
632            || filename.ends_with(".yaml")
633        {
634            return FileCategory::Config;
635        }
636
637        // Check for entry points
638        if filename.contains("main")
639            || filename.contains("index")
640            || filename == "lib.rs"
641            || filename == "__init__.py"
642        {
643            return FileCategory::Entry;
644        }
645
646        // Check for examples/tests
647        if path_str.contains("example")
648            || path_str.contains("test")
649            || path_str.contains("demo")
650            || path_str.contains("sample")
651        {
652            return FileCategory::Examples;
653        }
654
655        FileCategory::General
656    }
657
658    /// Static version of file scoring for use in streaming selector
659    fn calculate_file_score_static(file: &FileMetadata, token_budget: usize) -> f64 {
660        let mut score: f64 = 0.1; // Lower base score to be more selective
661
662        let path_str = file.path.to_string_lossy().to_lowercase();
663
664        // High-priority entry points (like original scribe)
665        if path_str.contains("main") || path_str.contains("index") {
666            score += 2.0; // Very high priority
667        }
668        if path_str.contains("lib.rs") || path_str.contains("mod.rs") {
669            score += 1.5; // High priority for Rust entry points
670        }
671        if path_str.contains("__init__.py") {
672            score += 1.3; // High priority for Python packages
673        }
674
675        // Root-level files get major boost (like README, setup files)
676        let path_components = file.path.components().count();
677        if path_components <= 2 {
678            // Root or one level down
679            score += 1.0;
680
681            // Special boost for important root files
682            if path_str.contains("readme")
683                || path_str.contains("license")
684                || path_str.contains("cargo.toml")
685                || path_str.contains("package.json")
686                || path_str.contains("pyproject.toml")
687                || path_str.contains("setup.py")
688            {
689                score += 1.5;
690            }
691        }
692
693        // Language importance (more aggressive)
694        match file.language.as_str() {
695            "Rust" | "Python" | "JavaScript" | "TypeScript" => score += 0.8,
696            "C" | "C++" | "Go" | "Java" => score += 0.6,
697            "Shell" => score += 0.4, // Build scripts
698            _ => {}
699        }
700
701        // File type importance
702        match file.file_type.as_str() {
703            "Source" => score += 0.6,
704            "Configuration" => score += 0.5, // Config files are very important
705            "Documentation" => score += 0.3,
706            _ => {}
707        }
708
709        // Penalize very large files more heavily to stay within budget
710        if file.size > 50_000 {
711            score -= 0.5;
712        }
713        if file.size > 100_000 {
714            score -= 1.0;
715        }
716
717        // Boost for certain important patterns
718        if path_str.contains("test") && !path_str.contains("tests/") {
719            score += 0.2; // Important test files but not test directories
720        }
721
722        // Penalize deep nesting (prefer top-level files)
723        if path_components > 4 {
724            score -= 0.3 * (path_components - 4) as f64;
725        }
726
727        // Boost small, important files
728        if file.size < 10_000 && (path_str.contains("config") || path_str.contains("env")) {
729            score += 0.4;
730        }
731
732        score.clamp(0.0, 5.0) // Allow higher scores for very important files
733    }
734
735    /// Static version of token estimation for use in streaming selector
736    fn estimate_tokens_static(file: &FileMetadata, token_budget: usize) -> usize {
737        // Use more realistic token estimation like original scribe
738        // Original scribe uses ~3.5 chars per token on average
739        let base_tokens = ((file.size as f64) / 3.5) as usize;
740
741        // Add minimum token count for very small files to avoid underestimation
742        // Make minimum higher for small budgets to be more selective
743        let min_tokens = if token_budget < 5000 {
744            100 // Higher minimum for small budgets
745        } else {
746            50 // Standard minimum
747        };
748        let base_tokens = base_tokens.max(min_tokens);
749
750        // Adjust based on file type (more realistic multipliers)
751        let multiplier = match file.file_type.as_str() {
752            "Source" => 1.2,        // Source code has more complexity
753            "Documentation" => 1.0, // Documentation is standard
754            "Configuration" => 0.8, // Config files are more compact
755            _ => 1.1,               // Default higher to be conservative
756        };
757
758        // Apply language-specific adjustments
759        let language_multiplier = match file.language.as_str() {
760            "Rust" => 1.3,                      // Rust is very verbose
761            "JavaScript" | "TypeScript" => 1.2, // JS/TS moderately verbose
762            "Python" => 1.1,                    // Python is readable but efficient
763            "C" | "Go" => 1.0,                  // C/Go are concise
764            "HTML" | "CSS" => 0.9,              // Markup is less token-dense
765            "JSON" | "YAML" | "TOML" => 0.7,    // Data formats are compact
766            _ => 1.0,                           // Default
767        };
768
769        // Final calculation with realistic scaling
770        let final_tokens = (base_tokens as f64 * multiplier * language_multiplier) as usize;
771
772        // Cap extremely large files to avoid single file consuming entire budget
773        final_tokens.min(token_budget / 4) // No single file > 25% of budget
774    }
775}
776
777impl ScalingSelectionResult {
778    /// Get all files in optimal order (positioned if available, otherwise selected)
779    pub fn get_optimally_ordered_files(&self) -> Vec<&FileMetadata> {
780        if let Some(positioned) = &self.positioned_selection {
781            let mut files = Vec::new();
782
783            // HEAD files first (query-relevant, high centrality)
784            for file in &positioned.positioning.head_files {
785                files.push(&file.metadata);
786            }
787
788            // MIDDLE files (supporting, low centrality)
789            for file in &positioned.positioning.middle_files {
790                files.push(&file.metadata);
791            }
792
793            // TAIL files last (core functionality, high centrality)
794            for file in &positioned.positioning.tail_files {
795                files.push(&file.metadata);
796            }
797
798            files
799        } else {
800            self.selected_files.iter().collect()
801        }
802    }
803
804    /// Get positioning statistics if available
805    pub fn get_positioning_stats(&self) -> Option<(usize, usize, usize)> {
806        self.positioned_selection.as_ref().map(|p| {
807            (
808                p.positioning.head_files.len(),
809                p.positioning.middle_files.len(),
810                p.positioning.tail_files.len(),
811            )
812        })
813    }
814
815    /// Get positioning reasoning if available
816    pub fn get_positioning_reasoning(&self) -> Option<&str> {
817        self.positioned_selection
818            .as_ref()
819            .map(|p| p.positioning_reasoning.as_str())
820    }
821
822    /// Check if context positioning was applied
823    pub fn has_context_positioning(&self) -> bool {
824        self.positioned_selection.is_some()
825    }
826}
827
#[cfg(test)]
mod tests {
    //! Unit and integration tests for the scaling selector: construction,
    //! token estimation, file scoring, and context positioning.

    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// The default selector is built with an 8000-token budget.
    #[tokio::test]
    async fn test_scaling_selector_creation() {
        let selector = ScalingSelector::with_defaults();
        assert_eq!(selector.config.token_budget, 8000);
    }

    /// A 1000-token selector keeps its budget and uses the V5 algorithm.
    #[tokio::test]
    async fn test_small_budget_selection() {
        let selector = ScalingSelector::with_token_budget(1000);
        assert_eq!(selector.config.token_budget, 1000);
        assert!(matches!(
            selector.config.selection_algorithm,
            SelectionAlgorithm::V5Integrated
        ));
    }

    /// A 10000-token selector keeps its budget and uses the V5 algorithm.
    #[tokio::test]
    async fn test_medium_budget_selection() {
        let selector = ScalingSelector::with_token_budget(10000);
        assert_eq!(selector.config.token_budget, 10000);
        assert!(matches!(
            selector.config.selection_algorithm,
            SelectionAlgorithm::V5Integrated
        ));
    }

    /// End-to-end selection over a four-file repository stays within the
    /// configured token budget.
    #[tokio::test]
    async fn test_file_selection_process() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test files
        fs::create_dir_all(repo_path.join("src")).unwrap();
        fs::write(
            repo_path.join("src/main.rs"),
            "fn main() { println!(\"Hello, world!\"); }",
        )
        .unwrap();
        fs::write(
            repo_path.join("src/lib.rs"),
            "pub fn hello() -> String { \"Hello\".to_string() }",
        )
        .unwrap();
        fs::write(
            repo_path.join("Cargo.toml"),
            "[package]\nname = \"test\"\nversion = \"0.1.0\"",
        )
        .unwrap();
        fs::write(
            repo_path.join("README.md"),
            "# Test Project\n\nThis is a test project.",
        )
        .unwrap();

        let mut selector = ScalingSelector::with_token_budget(5000);
        let result = selector.select_and_process(repo_path).await.unwrap();

        // At least one file must be selected, and never more than the four
        // files that exist in the fixture repository.
        assert!(!result.selected_files.is_empty());
        assert!(result.selected_files.len() <= 4);
        assert!(result.tokens_used <= 5000); // Stay within budget
        assert!(result.token_utilization <= 1.0); // Don't exceed budget
    }

    /// A 1000-byte Rust source file estimates above 200 tokens, and a
    /// 500-byte TOML configuration file estimates fewer tokens than that.
    #[test]
    fn test_token_estimation() {
        let selector = ScalingSelector::with_defaults();

        let rust_file = FileMetadata {
            path: std::path::PathBuf::from("src/main.rs"),
            size: 1000,
            modified: std::time::SystemTime::now(),
            language: "Rust".to_string(),
            file_type: "Source".to_string(),
        };

        let tokens = selector.estimate_tokens(&rust_file);
        assert!(tokens > 200); // Should estimate reasonable number of tokens

        let config_file = FileMetadata {
            path: std::path::PathBuf::from("Cargo.toml"),
            size: 500,
            modified: std::time::SystemTime::now(),
            language: "TOML".to_string(),
            file_type: "Configuration".to_string(),
        };

        let config_tokens = selector.estimate_tokens(&config_file);
        assert!(config_tokens < tokens); // Config should estimate fewer tokens
    }

    /// `src/main.rs` scores above 0.7 and above `README.md`.
    #[test]
    fn test_file_scoring() {
        let selector = ScalingSelector::with_defaults();

        let main_file = FileMetadata {
            path: std::path::PathBuf::from("src/main.rs"),
            size: 1000,
            modified: std::time::SystemTime::now(),
            language: "Rust".to_string(),
            file_type: "Source".to_string(),
        };

        let score = selector.calculate_file_score(&main_file);
        assert!(score > 0.7); // Main files should score high

        let readme = FileMetadata {
            path: std::path::PathBuf::from("README.md"),
            size: 500,
            modified: std::time::SystemTime::now(),
            language: "Markdown".to_string(),
            file_type: "Documentation".to_string(),
        };

        let readme_score = selector.calculate_file_score(&readme);
        assert!(readme_score < score); // README should score lower than main.rs
    }

    /// With positioning enabled and a query hint, the result carries a
    /// positioned selection whose tiers account for every selected file.
    #[tokio::test]
    async fn test_context_positioning_integration() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test files
        fs::create_dir_all(repo_path.join("src")).unwrap();
        fs::write(
            repo_path.join("src/main.rs"),
            "fn main() { println!(\"Hello, world!\"); }",
        )
        .unwrap();
        fs::write(
            repo_path.join("src/lib.rs"),
            "pub fn hello() -> String { \"Hello\".to_string() }",
        )
        .unwrap();
        fs::write(repo_path.join("src/utils.rs"), "pub fn utility() {}").unwrap();
        fs::write(
            repo_path.join("Cargo.toml"),
            "[package]\nname = \"test\"\nversion = \"0.1.0\"",
        )
        .unwrap();

        // Test with positioning enabled and query hint
        let mut config = ScalingSelectionConfig::medium_budget();
        config.positioning_config.enable_positioning = true;
        let mut selector = ScalingSelector::new(config);

        let result = selector
            .select_and_process_with_query(repo_path, Some("main"))
            .await
            .unwrap();

        // Should have positioning applied
        assert!(result.has_context_positioning());

        // Should have files distributed across tiers; `assert_eq!` reports
        // both sides of the comparison when the totals disagree.
        let (head, middle, tail) = result.get_positioning_stats().unwrap();
        assert!(head > 0);
        assert_eq!(head + middle + tail, result.selected_files.len());

        // Should have positioning reasoning that mentions both outer tiers
        let reasoning = result
            .get_positioning_reasoning()
            .expect("positioning reasoning should be present when positioning is applied");
        assert!(reasoning.contains("HEAD"));
        assert!(reasoning.contains("TAIL"));

        // Test optimal ordering
        let ordered_files = result.get_optimally_ordered_files();
        assert_eq!(ordered_files.len(), result.selected_files.len());
    }

    /// With positioning disabled, no positioned selection is produced and
    /// the ordered view has the same length as the plain selection.
    #[tokio::test]
    async fn test_positioning_disabled() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test files
        fs::create_dir_all(repo_path.join("src")).unwrap();
        fs::write(repo_path.join("src/main.rs"), "fn main() {}").unwrap();

        // Test with positioning disabled
        let mut config = ScalingSelectionConfig::small_budget();
        config.positioning_config.enable_positioning = false;
        let mut selector = ScalingSelector::new(config);

        let result = selector
            .select_and_process_with_query(repo_path, Some("main"))
            .await
            .unwrap();

        // Should not have positioning applied
        assert!(!result.has_context_positioning());
        assert!(result.positioned_selection.is_none());

        // Optimal ordering should just return selected files
        let ordered_files = result.get_optimally_ordered_files();
        assert_eq!(ordered_files.len(), result.selected_files.len());
    }

    /// The default, small, and large configurations all enable positioning;
    /// the default uses 0.20 for both head and tail percentages.
    #[test]
    fn test_configuration_builder_positioning() {
        let config = ScalingSelectionConfig::default();
        assert!(config.positioning_config.enable_positioning);
        assert_eq!(config.positioning_config.head_percentage, 0.20);
        assert_eq!(config.positioning_config.tail_percentage, 0.20);

        let small_config = ScalingSelectionConfig::small_budget();
        assert!(small_config.positioning_config.enable_positioning);

        let large_config = ScalingSelectionConfig::large_budget();
        assert!(large_config.positioning_config.enable_positioning);
    }

    /// `with_test_exclusion` turns on `auto_exclude_tests` and can be
    /// chained after a budget preset without changing its token budget.
    #[test]
    fn test_with_test_exclusion_convenience_method() {
        let config = ScalingSelectionConfig::default().with_test_exclusion();

        // Verify the convenience method enabled test exclusion
        assert!(config.positioning_config.auto_exclude_tests);

        // Test that it can be chained with other configurations
        let config_chained = ScalingSelectionConfig::medium_budget().with_test_exclusion();

        assert!(config_chained.positioning_config.auto_exclude_tests);
        assert_eq!(config_chained.token_budget, 10000); // Should preserve medium budget setting
    }
}
scribe_scaling/selector.rs

scribe_scaling/
selector.rs