scribe_scaling/
selector.rs

//! Intelligent Scaling Selector - Minimal Self-contained Selection
//!
//! This module provides intelligent file selection with token budget awareness.
//! It is a simplified, self-contained implementation that avoids circular dependencies.
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::time::{Duration, Instant};
9
10use serde::{Deserialize, Serialize};
11use tracing::{debug, info, warn};
12
13use crate::engine::{ProcessingResult, ScalingConfig};
14use crate::error::{ScalingError, ScalingResult};
15use crate::positioning::{ContextPositioner, ContextPositioningConfig, PositionedSelection};
16use crate::streaming::{FileMetadata, ScoredFile, StreamingSelector};
17use scribe_core::{file, FileInfo, FileType};
18
19/// File category classification for quota allocation
20#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
21pub enum FileCategory {
22    Config,
23    Entry,
24    Examples,
25    General,
26}
27
28/// Selection algorithm variants
29#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
30pub enum SelectionAlgorithm {
31    /// Tiered approach with intelligent selection (V5)
32    V5Integrated,
33}
34
35/// Configuration for intelligent scaling selection
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct ScalingSelectionConfig {
38    /// Token budget for selection (like --token-target)
39    pub token_budget: usize,
40
41    /// Selection algorithm variant to use
42    pub selection_algorithm: SelectionAlgorithm,
43
44    /// Enable category-based quota allocation
45    pub enable_quotas: bool,
46
47    /// Context positioning configuration
48    pub positioning_config: ContextPositioningConfig,
49
50    /// Base scaling configuration
51    pub scaling_config: ScalingConfig,
52}
53
54impl Default for ScalingSelectionConfig {
55    fn default() -> Self {
56        Self {
57            token_budget: 8000,
58            selection_algorithm: SelectionAlgorithm::V5Integrated,
59            enable_quotas: true,
60            positioning_config: ContextPositioningConfig::default(),
61            scaling_config: ScalingConfig::default(),
62        }
63    }
64}
65
66impl ScalingSelectionConfig {
67    /// Create configuration for small token budget (should select ~2 files)
68    pub fn small_budget() -> Self {
69        Self {
70            token_budget: 1000,
71            selection_algorithm: SelectionAlgorithm::V5Integrated,
72            enable_quotas: true,
73            positioning_config: ContextPositioningConfig::default(),
74            scaling_config: ScalingConfig::small_repository(),
75        }
76    }
77
78    /// Enable auto-exclusion of test files (focuses on code and docs only)
79    pub fn with_test_exclusion(mut self) -> Self {
80        self.positioning_config.auto_exclude_tests = true;
81        self
82    }
83
84    /// Create configuration for medium token budget (should select ~11 files)
85    pub fn medium_budget() -> Self {
86        Self {
87            token_budget: 10000,
88            selection_algorithm: SelectionAlgorithm::V5Integrated,
89            enable_quotas: true,
90            positioning_config: ContextPositioningConfig::default(),
91            scaling_config: ScalingConfig::default(),
92        }
93    }
94
95    /// Create configuration for large token budget
96    pub fn large_budget() -> Self {
97        Self {
98            token_budget: 100000,
99            selection_algorithm: SelectionAlgorithm::V5Integrated,
100            enable_quotas: true,
101            positioning_config: ContextPositioningConfig::default(),
102            scaling_config: ScalingConfig::large_repository(),
103        }
104    }
105}
106
107/// Results of intelligent scaling selection
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct ScalingSelectionResult {
110    /// Selected files with metadata (if positioning disabled)
111    pub selected_files: Vec<FileMetadata>,
112
113    /// Context-positioned selection (if positioning enabled)
114    pub positioned_selection: Option<PositionedSelection>,
115
116    /// Total files considered during selection
117    pub total_files_considered: usize,
118
119    /// Token budget utilization
120    pub token_utilization: f64,
121
122    /// Actual tokens used by selected files
123    pub tokens_used: usize,
124
125    /// Selection algorithm used
126    pub algorithm_used: SelectionAlgorithm,
127
128    /// Selection performance metrics
129    pub selection_time: Duration,
130
131    /// Processing performance metrics (from scaling system)
132    pub processing_result: ProcessingResult,
133}
134
135/// Scored file for selection (selector-specific version)
136#[derive(Debug, Clone)]
137struct SelectorScoredFile {
138    metadata: FileMetadata,
139    tokens: usize,
140    score: f64,
141    category: FileCategory,
142}
143
144/// Main intelligent scaling selector
145pub struct ScalingSelector {
146    config: ScalingSelectionConfig,
147}
148
149impl ScalingSelector {
150    /// Create new scaling selector with configuration
151    pub fn new(config: ScalingSelectionConfig) -> Self {
152        Self { config }
153    }
154
155    /// Create with default configuration
156    pub fn with_defaults() -> Self {
157        Self::new(ScalingSelectionConfig::default())
158    }
159
160    /// Create with specific token budget (like --token-target)
161    pub fn with_token_budget(token_budget: usize) -> Self {
162        let config = match token_budget {
163            0..=2000 => ScalingSelectionConfig::small_budget(),
164            2001..=15000 => ScalingSelectionConfig::medium_budget(),
165            _ => ScalingSelectionConfig::large_budget(),
166        };
167
168        Self::new(ScalingSelectionConfig {
169            token_budget,
170            ..config
171        })
172    }
173
174    /// Execute intelligent selection with scaling optimizations
175    pub async fn select_and_process(
176        &mut self,
177        repo_path: &Path,
178    ) -> ScalingResult<ScalingSelectionResult> {
179        self.select_and_process_with_query(repo_path, None).await
180    }
181
182    /// Execute intelligent selection with query hint for context positioning
183    pub async fn select_and_process_with_query(
184        &mut self,
185        repo_path: &Path,
186        query_hint: Option<&str>,
187    ) -> ScalingResult<ScalingSelectionResult> {
188        let start_time = Instant::now();
189
190        info!(
191            "Starting intelligent scaling selection for: {:?}",
192            repo_path
193        );
194        info!(
195            "Token budget: {}, Algorithm: {:?}",
196            self.config.token_budget, self.config.selection_algorithm
197        );
198        if let Some(query) = query_hint {
199            info!("Query hint for positioning: '{}'", query);
200        }
201
202        // Phase 1: Optimized streaming discovery and selection
203        let discovery_start = Instant::now();
204        let selected_files = self.discover_and_select_files_streaming(repo_path).await?;
205        let discovery_time = discovery_start.elapsed();
206
207        info!(
208            "Selected {} files in {:?}",
209            selected_files.len(),
210            discovery_time
211        );
212
213        // Phase 3: Apply context positioning if enabled
214        let total_files_considered = selected_files.len();
215        let (positioned_selection, final_files, final_tokens) =
216            if self.config.positioning_config.enable_positioning {
217                let positioner = ContextPositioner::new(self.config.positioning_config.clone());
218                let positioned = positioner
219                    .position_files(selected_files.clone(), query_hint)
220                    .await?;
221
222                info!(
223                    "Context positioning applied: HEAD={}, MIDDLE={}, TAIL={}",
224                    positioned.positioning.head_files.len(),
225                    positioned.positioning.middle_files.len(),
226                    positioned.positioning.tail_files.len()
227                );
228
229                let tokens = positioned.total_tokens;
230                (Some(positioned), selected_files, tokens)
231            } else {
232                let tokens = self.calculate_tokens_used(&selected_files);
233                (None, selected_files, tokens)
234            };
235
236        // Phase 4: Apply scaling optimizations to selected subset
237        let processing_result = self.apply_scaling_optimizations(&final_files).await?;
238
239        // Phase 5: Calculate final metrics
240        let token_utilization = final_tokens as f64 / self.config.token_budget as f64;
241
242        let total_time = start_time.elapsed();
243        info!("Total selection and processing time: {:?}", total_time);
244        info!(
245            "Token utilization: {:.1}% ({}/{})",
246            token_utilization * 100.0,
247            final_tokens,
248            self.config.token_budget
249        );
250
251        Ok(ScalingSelectionResult {
252            selected_files: final_files,
253            positioned_selection,
254            total_files_considered, // We only process selected files now
255            token_utilization,
256            tokens_used: final_tokens,
257            algorithm_used: self.config.selection_algorithm,
258            selection_time: discovery_time, // This now includes both discovery and selection
259            processing_result,
260        })
261    }
262
263    /// Optimized streaming file discovery with intelligent selection
264    async fn discover_and_select_files_streaming(
265        &self,
266        repo_path: &Path,
267    ) -> ScalingResult<Vec<FileMetadata>> {
268        info!("Using optimized streaming file discovery");
269
270        // Create streaming selector
271        let streaming_config = crate::streaming::StreamingConfig {
272            enable_streaming: true,
273            concurrency_limit: num_cpus::get() * 2,
274            memory_limit: 100 * 1024 * 1024, // 100MB
275            selection_heap_size: self.config.token_budget * 2, // Allow larger heap for better selection
276        };
277
278        let streaming_selector = StreamingSelector::new(streaming_config);
279
280        // Calculate target file count based on token budget
281        let target_count = self.estimate_target_file_count();
282
283        // Create scoring functions
284        let score_fn = {
285            let token_budget = self.config.token_budget;
286            move |file: &FileMetadata| -> f64 {
287                Self::calculate_file_score_static(file, token_budget)
288            }
289        };
290
291        let token_fn = {
292            let token_budget = self.config.token_budget;
293            move |file: &FileMetadata| -> usize { Self::estimate_tokens_static(file, token_budget) }
294        };
295
296        // Use streaming selection for O(N log K) performance
297        let scored_files = streaming_selector
298            .select_files_streaming(
299                repo_path,
300                target_count,
301                self.config.token_budget,
302                score_fn,
303                token_fn,
304            )
305            .await?;
306
307        // Extract metadata from scored files
308        let selected_files: Vec<FileMetadata> = scored_files
309            .into_iter()
310            .map(|scored| scored.metadata)
311            .collect();
312
313        info!(
314            "Streaming selection completed: {} files selected",
315            selected_files.len()
316        );
317        Ok(selected_files)
318    }
319
320    /// Estimate target number of files to select
321    fn estimate_target_file_count(&self) -> usize {
322        // Conservative estimate: aim for ~300 tokens per file on average
323        // This gives us room for both small config files and larger source files
324        let estimated_files = self.config.token_budget / 300;
325
326        // Clamp between reasonable bounds
327        estimated_files.clamp(5, 200)
328    }
329
330    /// Simple language detection based on file extension
331    fn detect_language(&self, path: &Path) -> String {
332        let ext = path
333            .extension()
334            .and_then(|s| s.to_str())
335            .map(|s| s.to_lowercase());
336
337        if matches!(ext.as_deref(), Some("h" | "hpp" | "hxx")) {
338            return "Header".to_string();
339        }
340
341        if path
342            .file_name()
343            .and_then(|s| s.to_str())
344            .map(|s| s.eq_ignore_ascii_case("dockerfile"))
345            .unwrap_or(false)
346        {
347            return "Dockerfile".to_string();
348        }
349
350        let language = file::detect_language_from_path(path);
351        file::language_display_name(&language).to_string()
352    }
353
354    /// Simple file type classification
355    fn classify_file_type(&self, path: &Path) -> String {
356        let extension = path
357            .extension()
358            .and_then(|s| s.to_str())
359            .map(|s| s.to_lowercase())
360            .unwrap_or_default();
361
362        let language = file::detect_language_from_path(path);
363        let file_type =
364            FileInfo::classify_file_type(path.to_string_lossy().as_ref(), &language, &extension);
365
366        match file_type {
367            FileType::Test { .. } => "Test".to_string(),
368            FileType::Documentation { .. } => "Documentation".to_string(),
369            FileType::Configuration { .. } => "Configuration".to_string(),
370            FileType::Binary => "Binary".to_string(),
371            FileType::Generated => "Generated".to_string(),
372            FileType::Source { .. } => match extension.as_str() {
373                "h" | "hpp" | "hxx" => "Header".to_string(),
374                _ => "Source".to_string(),
375            },
376            FileType::Unknown => match extension.as_str() {
377                "md" | "txt" | "rst" => "Documentation".to_string(),
378                "json" | "yaml" | "yml" | "toml" | "ini" | "cfg" => "Configuration".to_string(),
379                "png" | "jpg" | "jpeg" | "gif" | "svg" => "Image".to_string(),
380                _ => "Other".to_string(),
381            },
382        }
383    }
384
385    /// Apply intelligent selection algorithm based on configuration
386    async fn apply_intelligent_selection(
387        &self,
388        files: &[FileMetadata],
389    ) -> ScalingResult<Vec<FileMetadata>> {
390        // V5 Integrated selection algorithm (tiered approach)
391        self.apply_integrated_selection(files)
392    }
393
394    /// V5 Integrated selection: tiered approach with intelligent prioritization
395    fn apply_integrated_selection(
396        &self,
397        files: &[FileMetadata],
398    ) -> ScalingResult<Vec<FileMetadata>> {
399        // Score all files
400        let mut scored_files: Vec<SelectorScoredFile> = files
401            .iter()
402            .map(|file| {
403                let tokens = self.estimate_tokens(file);
404                let score = self.calculate_file_score(file);
405                let category = self.classify_file(file);
406
407                SelectorScoredFile {
408                    metadata: file.clone(),
409                    tokens,
410                    score,
411                    category,
412                }
413            })
414            .collect();
415
416        // Group by category for tiered selection
417        let mut categorized: HashMap<FileCategory, Vec<SelectorScoredFile>> = HashMap::new();
418        for scored_file in scored_files {
419            categorized
420                .entry(scored_file.category)
421                .or_insert_with(Vec::new)
422                .push(scored_file);
423        }
424
425        // Sort within each category by score
426        for files in categorized.values_mut() {
427            files.sort_by(|a, b| {
428                b.score
429                    .partial_cmp(&a.score)
430                    .unwrap_or(std::cmp::Ordering::Equal)
431            });
432        }
433
434        // V5 Tiered selection with intelligent allocation
435        let mut selected = Vec::new();
436        let mut remaining_budget = self.config.token_budget;
437
438        // Tier 1: Critical entry points (highest priority)
439        let tier1_order = [FileCategory::Entry, FileCategory::Config];
440        for category in tier1_order.iter() {
441            if let Some(files) = categorized.get(category) {
442                let tier_budget = match category {
443                    FileCategory::Entry => (self.config.token_budget as f64 * 0.35) as usize, // 35% for entry points
444                    FileCategory::Config => (self.config.token_budget as f64 * 0.25) as usize, // 25% for config
445                    _ => 0,
446                };
447
448                let mut used_budget = 0;
449                for scored_file in files {
450                    if used_budget + scored_file.tokens <= tier_budget
451                        && scored_file.tokens <= remaining_budget
452                    {
453                        selected.push(scored_file.metadata.clone());
454                        used_budget += scored_file.tokens;
455                        remaining_budget = remaining_budget.saturating_sub(scored_file.tokens);
456                    }
457                }
458            }
459        }
460
461        // Tier 2: General implementation files (fill remaining budget intelligently)
462        if let Some(general_files) = categorized.get(&FileCategory::General) {
463            for scored_file in general_files {
464                if scored_file.tokens <= remaining_budget {
465                    selected.push(scored_file.metadata.clone());
466                    remaining_budget = remaining_budget.saturating_sub(scored_file.tokens);
467                }
468            }
469        }
470
471        // Tier 3: Examples (lowest priority, use remaining budget)
472        if let Some(example_files) = categorized.get(&FileCategory::Examples) {
473            for scored_file in example_files {
474                if scored_file.tokens <= remaining_budget {
475                    selected.push(scored_file.metadata.clone());
476                    remaining_budget = remaining_budget.saturating_sub(scored_file.tokens);
477                }
478            }
479        }
480
481        Ok(selected)
482    }
483
484    /// Apply scaling optimizations to selected files
485    async fn apply_scaling_optimizations(
486        &self,
487        selected_files: &[FileMetadata],
488    ) -> ScalingResult<ProcessingResult> {
489        // Create a mock processing result optimized for selected subset
490        let total_size: u64 = selected_files.iter().map(|f| f.size).sum();
491        let processing_time = Duration::from_millis((selected_files.len() as u64 * 2).max(10)); // Fast for selected subset
492        let memory_peak = (selected_files.len() * 1024).max(1024); // Minimal memory usage
493
494        Ok(ProcessingResult {
495            files: selected_files.to_vec(),
496            total_files: selected_files.len(),
497            processing_time,
498            memory_peak,
499            cache_hits: 0,
500            cache_misses: selected_files.len() as u64,
501            metrics: crate::metrics::ScalingMetrics {
502                files_processed: selected_files.len() as u64,
503                total_processing_time: processing_time,
504                memory_peak,
505                cache_hits: 0,
506                cache_misses: selected_files.len() as u64,
507                parallel_efficiency: 1.0,
508                streaming_overhead: Duration::from_millis(0),
509            },
510        })
511    }
512
513    /// Calculate tokens used by selected files
514    fn calculate_tokens_used(&self, selected_files: &[FileMetadata]) -> usize {
515        selected_files
516            .iter()
517            .map(|file| self.estimate_tokens(file))
518            .sum()
519    }
520
521    /// Estimate tokens for a file based on size and type (matching original scribe behavior)
522    fn estimate_tokens(&self, file: &FileMetadata) -> usize {
523        // Use more realistic token estimation like original scribe
524        // Original scribe uses ~3.5 chars per token on average
525        let base_tokens = ((file.size as f64) / 3.5) as usize;
526
527        // Add minimum token count for very small files to avoid underestimation
528        // Make minimum higher for small budgets to be more selective
529        let min_tokens = if self.config.token_budget < 5000 {
530            100 // Higher minimum for small budgets
531        } else {
532            50 // Standard minimum
533        };
534        let base_tokens = base_tokens.max(min_tokens);
535
536        // Adjust based on file type (more realistic multipliers)
537        let multiplier = match file.file_type.as_str() {
538            "Source" => 1.2,        // Source code has more complexity
539            "Documentation" => 1.0, // Documentation is standard
540            "Configuration" => 0.8, // Config files are more compact
541            _ => 1.1,               // Default higher to be conservative
542        };
543
544        // Apply language-specific adjustments
545        let language_multiplier = match file.language.as_str() {
546            "Rust" => 1.3,                      // Rust is very verbose
547            "JavaScript" | "TypeScript" => 1.2, // JS/TS moderately verbose
548            "Python" => 1.1,                    // Python is readable but efficient
549            "C" | "Go" => 1.0,                  // C/Go are concise
550            "HTML" | "CSS" => 0.9,              // Markup is less token-dense
551            "JSON" | "YAML" | "TOML" => 0.7,    // Data formats are compact
552            _ => 1.0,                           // Default
553        };
554
555        // Final calculation with realistic scaling
556        let final_tokens = (base_tokens as f64 * multiplier * language_multiplier) as usize;
557
558        // Cap extremely large files to avoid single file consuming entire budget
559        final_tokens.min(self.config.token_budget / 4) // No single file > 25% of budget
560    }
561
562    /// Calculate file score for selection (aggressive prioritization like original scribe)
563    fn calculate_file_score(&self, file: &FileMetadata) -> f64 {
564        let mut score: f64 = 0.1; // Lower base score to be more selective
565
566        let path_str = file.path.to_string_lossy().to_lowercase();
567
568        // High-priority entry points (like original scribe)
569        if path_str.contains("main") || path_str.contains("index") {
570            score += 2.0; // Very high priority
571        }
572        if path_str.contains("lib.rs") || path_str.contains("mod.rs") {
573            score += 1.5; // High priority for Rust entry points
574        }
575        if path_str.contains("__init__.py") {
576            score += 1.3; // High priority for Python packages
577        }
578
579        // Root-level files get major boost (like README, setup files)
580        let path_components = file.path.components().count();
581        if path_components <= 2 {
582            // Root or one level down
583            score += 1.0;
584
585            // Special boost for important root files
586            if path_str.contains("readme")
587                || path_str.contains("license")
588                || path_str.contains("cargo.toml")
589                || path_str.contains("package.json")
590                || path_str.contains("pyproject.toml")
591                || path_str.contains("setup.py")
592            {
593                score += 1.5;
594            }
595        }
596
597        // Language importance (more aggressive)
598        match file.language.as_str() {
599            "Rust" | "Python" | "JavaScript" | "TypeScript" => score += 0.8,
600            "C" | "C++" | "Go" | "Java" => score += 0.6,
601            "Shell" | "Makefile" => score += 0.4, // Build scripts
602            _ => {}
603        }
604
605        // File type importance
606        match file.file_type.as_str() {
607            "Source" => score += 0.6,
608            "Configuration" => score += 0.5, // Config files are very important
609            "Documentation" => score += 0.3,
610            _ => {}
611        }
612
613        // Penalize very large files more heavily to stay within budget
614        if file.size > 50_000 {
615            score -= 0.5;
616        }
617        if file.size > 100_000 {
618            score -= 1.0;
619        }
620
621        // Boost for certain important patterns
622        if path_str.contains("test") && !path_str.contains("tests/") {
623            score += 0.2; // Important test files but not test directories
624        }
625
626        // Penalize deep nesting (prefer top-level files)
627        if path_components > 4 {
628            score -= 0.3 * (path_components - 4) as f64;
629        }
630
631        // Boost small, important files
632        if file.size < 10_000 && (path_str.contains("config") || path_str.contains("env")) {
633            score += 0.4;
634        }
635
636        score.clamp(0.0, 5.0) // Allow higher scores for very important files
637    }
638
639    /// Classify file into category
640    fn classify_file(&self, file: &FileMetadata) -> FileCategory {
641        let path_str = file.path.to_string_lossy().to_lowercase();
642        let filename = file
643            .path
644            .file_name()
645            .and_then(|n| n.to_str())
646            .unwrap_or("")
647            .to_lowercase();
648
649        // Check for config files
650        if matches!(file.file_type.as_str(), "Configuration")
651            || filename.contains("config")
652            || filename.ends_with(".toml")
653            || filename.ends_with(".json")
654            || filename.ends_with(".yaml")
655        {
656            return FileCategory::Config;
657        }
658
659        // Check for entry points
660        if filename.contains("main")
661            || filename.contains("index")
662            || filename == "lib.rs"
663            || filename == "__init__.py"
664        {
665            return FileCategory::Entry;
666        }
667
668        // Check for examples/tests
669        if path_str.contains("example")
670            || path_str.contains("test")
671            || path_str.contains("demo")
672            || path_str.contains("sample")
673        {
674            return FileCategory::Examples;
675        }
676
677        FileCategory::General
678    }
679
680    /// Static version of file scoring for use in streaming selector
681    fn calculate_file_score_static(file: &FileMetadata, token_budget: usize) -> f64 {
682        let mut score: f64 = 0.1; // Lower base score to be more selective
683
684        let path_str = file.path.to_string_lossy().to_lowercase();
685
686        // High-priority entry points (like original scribe)
687        if path_str.contains("main") || path_str.contains("index") {
688            score += 2.0; // Very high priority
689        }
690        if path_str.contains("lib.rs") || path_str.contains("mod.rs") {
691            score += 1.5; // High priority for Rust entry points
692        }
693        if path_str.contains("__init__.py") {
694            score += 1.3; // High priority for Python packages
695        }
696
697        // Root-level files get major boost (like README, setup files)
698        let path_components = file.path.components().count();
699        if path_components <= 2 {
700            // Root or one level down
701            score += 1.0;
702
703            // Special boost for important root files
704            if path_str.contains("readme")
705                || path_str.contains("license")
706                || path_str.contains("cargo.toml")
707                || path_str.contains("package.json")
708                || path_str.contains("pyproject.toml")
709                || path_str.contains("setup.py")
710            {
711                score += 1.5;
712            }
713        }
714
715        // Language importance (more aggressive)
716        match file.language.as_str() {
717            "Rust" | "Python" | "JavaScript" | "TypeScript" => score += 0.8,
718            "C" | "C++" | "Go" | "Java" => score += 0.6,
719            "Shell" => score += 0.4, // Build scripts
720            _ => {}
721        }
722
723        // File type importance
724        match file.file_type.as_str() {
725            "Source" => score += 0.6,
726            "Configuration" => score += 0.5, // Config files are very important
727            "Documentation" => score += 0.3,
728            _ => {}
729        }
730
731        // Penalize very large files more heavily to stay within budget
732        if file.size > 50_000 {
733            score -= 0.5;
734        }
735        if file.size > 100_000 {
736            score -= 1.0;
737        }
738
739        // Boost for certain important patterns
740        if path_str.contains("test") && !path_str.contains("tests/") {
741            score += 0.2; // Important test files but not test directories
742        }
743
744        // Penalize deep nesting (prefer top-level files)
745        if path_components > 4 {
746            score -= 0.3 * (path_components - 4) as f64;
747        }
748
749        // Boost small, important files
750        if file.size < 10_000 && (path_str.contains("config") || path_str.contains("env")) {
751            score += 0.4;
752        }
753
754        score.clamp(0.0, 5.0) // Allow higher scores for very important files
755    }
756
757    /// Static version of token estimation for use in streaming selector
758    fn estimate_tokens_static(file: &FileMetadata, token_budget: usize) -> usize {
759        // Use more realistic token estimation like original scribe
760        // Original scribe uses ~3.5 chars per token on average
761        let base_tokens = ((file.size as f64) / 3.5) as usize;
762
763        // Add minimum token count for very small files to avoid underestimation
764        // Make minimum higher for small budgets to be more selective
765        let min_tokens = if token_budget < 5000 {
766            100 // Higher minimum for small budgets
767        } else {
768            50 // Standard minimum
769        };
770        let base_tokens = base_tokens.max(min_tokens);
771
772        // Adjust based on file type (more realistic multipliers)
773        let multiplier = match file.file_type.as_str() {
774            "Source" => 1.2,        // Source code has more complexity
775            "Documentation" => 1.0, // Documentation is standard
776            "Configuration" => 0.8, // Config files are more compact
777            _ => 1.1,               // Default higher to be conservative
778        };
779
780        // Apply language-specific adjustments
781        let language_multiplier = match file.language.as_str() {
782            "Rust" => 1.3,                      // Rust is very verbose
783            "JavaScript" | "TypeScript" => 1.2, // JS/TS moderately verbose
784            "Python" => 1.1,                    // Python is readable but efficient
785            "C" | "Go" => 1.0,                  // C/Go are concise
786            "HTML" | "CSS" => 0.9,              // Markup is less token-dense
787            "JSON" | "YAML" | "TOML" => 0.7,    // Data formats are compact
788            _ => 1.0,                           // Default
789        };
790
791        // Final calculation with realistic scaling
792        let final_tokens = (base_tokens as f64 * multiplier * language_multiplier) as usize;
793
794        // Cap extremely large files to avoid single file consuming entire budget
795        final_tokens.min(token_budget / 4) // No single file > 25% of budget
796    }
797}
798
799impl ScalingSelectionResult {
800    /// Get all files in optimal order (positioned if available, otherwise selected)
801    pub fn get_optimally_ordered_files(&self) -> Vec<&FileMetadata> {
802        if let Some(positioned) = &self.positioned_selection {
803            let mut files = Vec::new();
804
805            // HEAD files first (query-relevant, high centrality)
806            for file in &positioned.positioning.head_files {
807                files.push(&file.metadata);
808            }
809
810            // MIDDLE files (supporting, low centrality)
811            for file in &positioned.positioning.middle_files {
812                files.push(&file.metadata);
813            }
814
815            // TAIL files last (core functionality, high centrality)
816            for file in &positioned.positioning.tail_files {
817                files.push(&file.metadata);
818            }
819
820            files
821        } else {
822            self.selected_files.iter().collect()
823        }
824    }
825
826    /// Get positioning statistics if available
827    pub fn get_positioning_stats(&self) -> Option<(usize, usize, usize)> {
828        self.positioned_selection.as_ref().map(|p| {
829            (
830                p.positioning.head_files.len(),
831                p.positioning.middle_files.len(),
832                p.positioning.tail_files.len(),
833            )
834        })
835    }
836
837    /// Get positioning reasoning if available
838    pub fn get_positioning_reasoning(&self) -> Option<&str> {
839        self.positioned_selection
840            .as_ref()
841            .map(|p| p.positioning_reasoning.as_str())
842    }
843
844    /// Check if context positioning was applied
845    pub fn has_context_positioning(&self) -> bool {
846        self.positioned_selection.is_some()
847    }
848}
849
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes `contents` to `relative` beneath `root`, creating missing parent
    /// directories first. Panics on any I/O failure so a broken fixture fails the test
    /// immediately.
    fn write_fixture(root: &std::path::Path, relative: &str, contents: &str) {
        let target = root.join(relative);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).expect("failed to create fixture directory");
        }
        fs::write(&target, contents).expect("failed to write fixture file");
    }

    /// The default selector carries the default 8000-token budget.
    // Purely synchronous: no async runtime is needed for construction checks.
    #[test]
    fn test_scaling_selector_creation() {
        let selector = ScalingSelector::with_defaults();
        assert_eq!(selector.config.token_budget, 8000);
    }

    /// A 1000-token selector keeps its budget and uses the V5 integrated algorithm.
    #[test]
    fn test_small_budget_selection() {
        let selector = ScalingSelector::with_token_budget(1000);
        assert_eq!(selector.config.token_budget, 1000);
        assert_eq!(
            selector.config.selection_algorithm,
            SelectionAlgorithm::V5Integrated
        );
    }

    /// A 10000-token selector keeps its budget and uses the V5 integrated algorithm.
    #[test]
    fn test_medium_budget_selection() {
        let selector = ScalingSelector::with_token_budget(10000);
        assert_eq!(selector.config.token_budget, 10000);
        assert_eq!(
            selector.config.selection_algorithm,
            SelectionAlgorithm::V5Integrated
        );
    }

    /// End-to-end selection over a four-file repository stays within the token budget.
    #[tokio::test]
    async fn test_file_selection_process() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test files
        write_fixture(
            repo_path,
            "src/main.rs",
            "fn main() { println!(\"Hello, world!\"); }",
        );
        write_fixture(
            repo_path,
            "src/lib.rs",
            "pub fn hello() -> String { \"Hello\".to_string() }",
        );
        write_fixture(
            repo_path,
            "Cargo.toml",
            "[package]\nname = \"test\"\nversion = \"0.1.0\"",
        );
        write_fixture(
            repo_path,
            "README.md",
            "# Test Project\n\nThis is a test project.",
        );

        let mut selector = ScalingSelector::with_token_budget(5000);
        let result = selector.select_and_process(repo_path).await.unwrap();

        // At least one file must be selected, and never more than the four that exist.
        assert!(!result.selected_files.is_empty());
        assert!(result.selected_files.len() <= 4);
        assert!(result.tokens_used <= 5000); // Stay within budget
        assert!(result.token_utilization <= 1.0); // Don't exceed budget
    }

    /// A 1000-byte Rust source file is estimated above 200 tokens, and a 500-byte TOML
    /// configuration file is estimated lower than that.
    #[test]
    fn test_token_estimation() {
        let selector = ScalingSelector::with_defaults();

        let rust_file = FileMetadata {
            path: std::path::PathBuf::from("src/main.rs"),
            size: 1000,
            modified: std::time::SystemTime::now(),
            language: "Rust".to_string(),
            file_type: "Source".to_string(),
        };

        let tokens = selector.estimate_tokens(&rust_file);
        assert!(tokens > 200); // Should estimate reasonable number of tokens

        let config_file = FileMetadata {
            path: std::path::PathBuf::from("Cargo.toml"),
            size: 500,
            modified: std::time::SystemTime::now(),
            language: "TOML".to_string(),
            file_type: "Configuration".to_string(),
        };

        let config_tokens = selector.estimate_tokens(&config_file);
        assert!(config_tokens < tokens); // Config should estimate fewer tokens
    }

    /// `src/main.rs` scores above 0.7 and above a README.
    #[test]
    fn test_file_scoring() {
        let selector = ScalingSelector::with_defaults();

        let main_file = FileMetadata {
            path: std::path::PathBuf::from("src/main.rs"),
            size: 1000,
            modified: std::time::SystemTime::now(),
            language: "Rust".to_string(),
            file_type: "Source".to_string(),
        };

        let score = selector.calculate_file_score(&main_file);
        assert!(score > 0.7); // Main files should score high

        let readme = FileMetadata {
            path: std::path::PathBuf::from("README.md"),
            size: 500,
            modified: std::time::SystemTime::now(),
            language: "Markdown".to_string(),
            file_type: "Documentation".to_string(),
        };

        let readme_score = selector.calculate_file_score(&readme);
        assert!(readme_score < score); // README should score lower than main.rs
    }

    /// With positioning enabled and a query hint, the result exposes tier statistics,
    /// reasoning text, and an ordering that covers every selected file.
    #[tokio::test]
    async fn test_context_positioning_integration() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test files
        write_fixture(
            repo_path,
            "src/main.rs",
            "fn main() { println!(\"Hello, world!\"); }",
        );
        write_fixture(
            repo_path,
            "src/lib.rs",
            "pub fn hello() -> String { \"Hello\".to_string() }",
        );
        write_fixture(repo_path, "src/utils.rs", "pub fn utility() {}");
        write_fixture(
            repo_path,
            "Cargo.toml",
            "[package]\nname = \"test\"\nversion = \"0.1.0\"",
        );

        // Test with positioning enabled and query hint
        let mut config = ScalingSelectionConfig::medium_budget();
        config.positioning_config.enable_positioning = true;
        let mut selector = ScalingSelector::new(config);

        let result = selector
            .select_and_process_with_query(repo_path, Some("main"))
            .await
            .unwrap();

        // Should have positioning applied
        assert!(result.has_context_positioning());

        // Every selected file lands in exactly one tier, and HEAD is never empty.
        let (head, middle, tail) = result.get_positioning_stats().unwrap();
        assert!(head > 0);
        assert_eq!(head + middle + tail, result.selected_files.len());

        // Should have positioning reasoning that mentions both outer tiers
        let reasoning = result
            .get_positioning_reasoning()
            .expect("positioning reasoning should be present when positioning is applied");
        assert!(reasoning.contains("HEAD"));
        assert!(reasoning.contains("TAIL"));

        // Test optimal ordering
        let ordered_files = result.get_optimally_ordered_files();
        assert_eq!(ordered_files.len(), result.selected_files.len());
    }

    /// With positioning disabled, no positioned selection is produced and the ordering
    /// falls back to the plain selection.
    #[tokio::test]
    async fn test_positioning_disabled() {
        let temp_dir = TempDir::new().unwrap();
        let repo_path = temp_dir.path();

        // Create test files
        write_fixture(repo_path, "src/main.rs", "fn main() {}");

        // Test with positioning disabled
        let mut config = ScalingSelectionConfig::small_budget();
        config.positioning_config.enable_positioning = false;
        let mut selector = ScalingSelector::new(config);

        let result = selector
            .select_and_process_with_query(repo_path, Some("main"))
            .await
            .unwrap();

        // Should not have positioning applied
        assert!(!result.has_context_positioning());
        assert!(result.positioned_selection.is_none());

        // Optimal ordering should just return selected files
        let ordered_files = result.get_optimally_ordered_files();
        assert_eq!(ordered_files.len(), result.selected_files.len());
    }

    /// The default, small-budget and large-budget presets all enable positioning; the
    /// default uses 20% head and 20% tail.
    #[test]
    fn test_configuration_builder_positioning() {
        let config = ScalingSelectionConfig::default();
        assert!(config.positioning_config.enable_positioning);
        assert_eq!(config.positioning_config.head_percentage, 0.20);
        assert_eq!(config.positioning_config.tail_percentage, 0.20);

        let small_config = ScalingSelectionConfig::small_budget();
        assert!(small_config.positioning_config.enable_positioning);

        let large_config = ScalingSelectionConfig::large_budget();
        assert!(large_config.positioning_config.enable_positioning);
    }

    /// `with_test_exclusion` turns on automatic test exclusion and leaves the budget of
    /// the preset it is chained onto untouched.
    #[test]
    fn test_with_test_exclusion_convenience_method() {
        let config = ScalingSelectionConfig::default().with_test_exclusion();

        // Verify the convenience method enabled test exclusion
        assert!(config.positioning_config.auto_exclude_tests);

        // Test that it can be chained with other configurations
        let config_chained = ScalingSelectionConfig::medium_budget().with_test_exclusion();

        assert!(config_chained.positioning_config.auto_exclude_tests);
        assert_eq!(config_chained.token_budget, 10000); // Should preserve medium budget setting
    }
}