scribe_selection/
quota.rs

1use std::collections::HashMap;
2use serde::{Deserialize, Serialize};
3
4use scribe_core::{ScribeError, Result as ScribeResult};
5use scribe_analysis::heuristics::ScanResult;
6
7/// Simple ScanResult implementation for quota system
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct QuotaScanResult {
10    pub path: String,
11    pub relative_path: String,
12    pub depth: usize,
13    pub content: String,
14    pub is_entrypoint: bool,
15    pub priority_boost: f64,
16    pub churn_score: f64,
17    pub centrality_in: f64,
18    pub imports: Option<Vec<String>>,
19    pub is_docs: bool,
20    pub is_readme: bool,
21    pub is_test: bool,
22    pub has_examples: bool,
23}
24
25impl ScanResult for QuotaScanResult {
26    fn path(&self) -> &str {
27        &self.path
28    }
29    
30    fn relative_path(&self) -> &str {
31        &self.relative_path
32    }
33    
34    fn depth(&self) -> usize {
35        self.depth
36    }
37    
38    fn is_docs(&self) -> bool {
39        self.is_docs
40    }
41    
42    fn is_readme(&self) -> bool {
43        self.is_readme
44    }
45    
46    fn is_test(&self) -> bool {
47        self.is_test
48    }
49    
50    fn is_entrypoint(&self) -> bool {
51        self.is_entrypoint
52    }
53    
54    fn has_examples(&self) -> bool {
55        self.has_examples
56    }
57    
58    fn priority_boost(&self) -> f64 {
59        self.priority_boost
60    }
61    
62    fn churn_score(&self) -> f64 {
63        self.churn_score
64    }
65    
66    fn centrality_in(&self) -> f64 {
67        self.centrality_in
68    }
69    
70    fn imports(&self) -> Option<&[String]> {
71        self.imports.as_deref()
72    }
73    
74    fn doc_analysis(&self) -> Option<&scribe_analysis::heuristics::DocumentAnalysis> {
75        None // Simplified for now
76    }
77}
78
79/// File category classification for quota allocation
80#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
81pub enum FileCategory {
82    Config,
83    Entry, 
84    Examples,
85    General,
86}
87
88impl FileCategory {
89    pub fn as_str(&self) -> &'static str {
90        match self {
91            FileCategory::Config => "config",
92            FileCategory::Entry => "entry",
93            FileCategory::Examples => "examples", 
94            FileCategory::General => "general",
95        }
96    }
97}
98
99/// Budget quota configuration for a file category
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct CategoryQuota {
102    pub category: FileCategory,
103    pub min_budget_pct: f64,     // Minimum budget percentage reserved
104    pub max_budget_pct: f64,     // Maximum budget percentage allowed  
105    pub recall_target: f64,      // Recall target (0.0-1.0, 0 means no target)
106    pub priority_multiplier: f64, // Priority boost for this category
107}
108
109impl CategoryQuota {
110    pub fn new(
111        category: FileCategory,
112        min_budget_pct: f64,
113        max_budget_pct: f64,
114        recall_target: f64,
115        priority_multiplier: f64,
116    ) -> Self {
117        Self {
118            category,
119            min_budget_pct,
120            max_budget_pct,
121            recall_target,
122            priority_multiplier,
123        }
124    }
125}
126
127/// Actual budget allocation result for a category
128#[derive(Debug, Clone, Serialize, Deserialize)]
129pub struct QuotaAllocation {
130    pub category: FileCategory,
131    pub allocated_budget: usize,
132    pub used_budget: usize,
133    pub file_count: usize,
134    pub recall_achieved: f64,
135    pub density_score: f64,
136}
137
138/// Detects file categories for quota allocation
139#[derive(Debug, Clone)]
140pub struct CategoryDetector {
141    config_patterns: Vec<&'static str>,
142    entry_patterns: Vec<&'static str>,
143    examples_patterns: Vec<&'static str>,
144}
145
146impl Default for CategoryDetector {
147    fn default() -> Self {
148        Self::new()
149    }
150}
151
152impl CategoryDetector {
153    pub fn new() -> Self {
154        Self {
155            // Config file patterns
156            config_patterns: vec![
157                // Configuration files
158                ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf",
159                // Build and dependency files  
160                "package.json", "requirements.txt", "pyproject.toml", "cargo.toml",
161                "setup.py", "setup.cfg", "makefile", "dockerfile", "docker-compose.yml",
162                // CI/CD configuration
163                ".github", ".gitlab-ci.yml", ".travis.yml", ".circleci",
164                // IDE and tool configuration
165                ".vscode", ".idea", ".editorconfig", "tsconfig.json", "tslint.json",
166                "eslint.json", ".eslintrc", ".prettierrc", "jest.config.js"
167            ],
168            
169            // Entry point patterns
170            entry_patterns: vec![
171                "main.py", "__main__.py", "app.py", "server.py", "index.py",
172                "main.js", "index.js", "app.js", "server.js", "index.ts", "main.ts",
173                "main.go", "main.rs", "lib.rs", "mod.rs"
174            ],
175            
176            // Example/demo patterns
177            examples_patterns: vec![
178                "example", "examples", "demo", "demos", "sample", "samples",
179                "tutorial", "tutorials", "test", "tests", "spec", "specs",
180                "benchmark", "benchmarks"
181            ],
182        }
183    }
184    
185    /// Detect the category of a file based on its scan result
186    pub fn detect_category(&self, scan_result: &QuotaScanResult) -> FileCategory {
187        let path = scan_result.path.to_lowercase();
188        let filename = scan_result.path
189            .split('/')
190            .last()
191            .unwrap_or("")
192            .to_lowercase();
193        
194        // Check for config files
195        if self.is_config_file(&path, &filename) {
196            return FileCategory::Config;
197        }
198        
199        // Check for entry points
200        if self.is_entry_file(&path, &filename, scan_result) {
201            return FileCategory::Entry;
202        }
203        
204        // Check for examples
205        if self.is_examples_file(&path, &filename) {
206            return FileCategory::Examples;
207        }
208        
209        FileCategory::General
210    }
211    
212    fn is_config_file(&self, path: &str, filename: &str) -> bool {
213        // Check file extensions and names
214        for pattern in &self.config_patterns {
215            if filename.contains(pattern) || path.contains(pattern) {
216                return true;
217            }
218        }
219        false
220    }
221    
222    fn is_entry_file(&self, path: &str, filename: &str, scan_result: &QuotaScanResult) -> bool {
223        // Check explicit entry point markers
224        if scan_result.is_entrypoint {
225            return true;
226        }
227        
228        // Check filename patterns
229        for pattern in &self.entry_patterns {
230            if filename == *pattern {
231                return true;
232            }
233        }
234        
235        // Check for main function or entry point indicators
236        // This would require content analysis - for now just use filename patterns
237        false
238    }
239    
240    fn is_examples_file(&self, path: &str, filename: &str) -> bool {
241        // Check if file is examples/demos/tests
242        for pattern in &self.examples_patterns {
243            if path.contains(pattern) || filename.contains(pattern) {
244                return true;
245            }
246        }
247        false
248    }
249}
250
251/// Manages budget quotas and density-greedy selection
252#[derive(Debug, Clone)]
253pub struct QuotaManager {
254    pub total_budget: usize,
255    pub detector: CategoryDetector,
256    pub category_quotas: HashMap<FileCategory, CategoryQuota>,
257}
258
259impl QuotaManager {
260    pub fn new(total_budget: usize) -> Self {
261        let mut category_quotas = HashMap::new();
262        
263        // Default quota configuration (research-optimized)
264        category_quotas.insert(
265            FileCategory::Config,
266            CategoryQuota::new(
267                FileCategory::Config,
268                15.0,  // Reserve at least 15% for config
269                30.0,  // Cap at 30% to avoid over-allocation
270                0.95,  // 95% recall target for config files
271                2.0,   // High priority for config files
272            )
273        );
274        
275        category_quotas.insert(
276            FileCategory::Entry,
277            CategoryQuota::new(
278                FileCategory::Entry,
279                2.0,   // Minimum for entry points
280                7.0,   // Max 7% for entry points
281                0.90,  // High recall for entry points
282                1.8,   // High priority
283            )
284        );
285        
286        category_quotas.insert(
287            FileCategory::Examples,
288            CategoryQuota::new(
289                FileCategory::Examples,
290                1.0,   // Small allocation for examples
291                3.0,   // Max 3% for examples
292                0.0,   // No recall target for examples
293                0.5,   // Lower priority
294            )
295        );
296        
297        category_quotas.insert(
298            FileCategory::General,
299            CategoryQuota::new(
300                FileCategory::General,
301                60.0,  // Most budget goes to general files
302                82.0,  // Leave room for other categories
303                0.0,   // No specific recall target
304                1.0,   // Standard priority
305            )
306        );
307        
308        Self {
309            total_budget,
310            detector: CategoryDetector::new(),
311            category_quotas,
312        }
313    }
314    
315    /// Classify files into categories
316    pub fn classify_files(&self, scan_results: &[QuotaScanResult]) -> HashMap<FileCategory, Vec<QuotaScanResult>> {
317        let mut categorized = HashMap::new();
318        
319        for result in scan_results {
320            let category = self.detector.detect_category(result);
321            categorized.entry(category)
322                .or_insert_with(Vec::new)
323                .push(result.clone());
324        }
325        
326        categorized
327    }
328    
329    /// Calculate density score (importance per token)
330    /// Density = importance_score / token_cost * priority_multiplier
331    pub fn calculate_density_score(&self, scan_result: &QuotaScanResult, heuristic_score: f64) -> f64 {
332        // Estimate token cost - simple heuristic for now
333        let estimated_tokens = self.estimate_tokens(scan_result);
334        
335        // Avoid division by zero
336        let estimated_tokens = if estimated_tokens == 0 { 1 } else { estimated_tokens };
337        
338        let mut density = heuristic_score / estimated_tokens as f64;
339        
340        // Apply category priority multiplier
341        let category = self.detector.detect_category(scan_result);
342        if let Some(quota) = self.category_quotas.get(&category) {
343            density *= quota.priority_multiplier;
344        }
345        
346        density
347    }
348    
349    /// Simple token estimation based on file size
350    fn estimate_tokens(&self, scan_result: &QuotaScanResult) -> usize {
351        // Rough approximation: 1 token per 3-4 characters for code
352        // More sophisticated estimation would use actual tokenizer
353        (scan_result.content.len() / 3).max(1)
354    }
355    
356    /// Apply density-greedy selection algorithm with quotas
357    pub fn select_files_density_greedy(
358        &self,
359        categorized_files: &HashMap<FileCategory, Vec<QuotaScanResult>>,
360        heuristic_scores: &HashMap<String, f64>,
361        adaptation_factor: f64,
362    ) -> ScribeResult<(Vec<QuotaScanResult>, HashMap<FileCategory, QuotaAllocation>)> {
363        let mut selected_files = Vec::new();
364        let mut allocations = HashMap::new();
365        
366        // Adapt total budget under pressure
367        let effective_budget = if adaptation_factor > 0.4 {
368            // Reduce effective budget to force faster selection
369            (self.total_budget as f64 * (1.0 - adaptation_factor * 0.3)) as usize
370        } else {
371            self.total_budget
372        };
373        
374        let mut remaining_budget = effective_budget;
375        
376        // Phase 1: Allocate minimum budgets
377        let mut min_allocations = HashMap::new();
378        for (category, quota) in &self.category_quotas {
379            if !categorized_files.contains_key(category) {
380                continue;
381            }
382            
383            let min_budget = (effective_budget as f64 * quota.min_budget_pct / 100.0) as usize;
384            min_allocations.insert(*category, min_budget);
385            remaining_budget = remaining_budget.saturating_sub(min_budget);
386        }
387        
388        // Phase 2: Distribute remaining budget based on demand and priority
389        let additional_allocations = self.distribute_remaining_budget(
390            categorized_files, 
391            heuristic_scores, 
392            remaining_budget
393        )?;
394        
395        // Phase 3: Select files within each category using density-greedy
396        for (category, files) in categorized_files {
397            if !self.category_quotas.contains_key(category) {
398                continue;
399            }
400            
401            let quota = &self.category_quotas[category];
402            let allocated_budget = min_allocations.get(category).unwrap_or(&0) 
403                + additional_allocations.get(category).unwrap_or(&0);
404            
405            // Select files for this category
406            let (selected, allocation) = self.select_category_files(
407                *category,
408                files,
409                allocated_budget,
410                quota,
411                heuristic_scores,
412            )?;
413            
414            selected_files.extend(selected);
415            allocations.insert(*category, allocation);
416        }
417        
418        Ok((selected_files, allocations))
419    }
420    
421    /// Distribute remaining budget based on category demands and priorities
422    fn distribute_remaining_budget(
423        &self,
424        categorized_files: &HashMap<FileCategory, Vec<QuotaScanResult>>,
425        heuristic_scores: &HashMap<String, f64>,
426        remaining_budget: usize,
427    ) -> ScribeResult<HashMap<FileCategory, usize>> {
428        let mut additional_allocations = HashMap::new();
429        
430        // Calculate demand scores for each category
431        let mut category_demands = HashMap::new();
432        for (category, files) in categorized_files {
433            if !self.category_quotas.contains_key(category) {
434                continue;
435            }
436            
437            let quota = &self.category_quotas[category];
438            
439            // Calculate total value density for this category
440            let mut total_density = 0.0;
441            for file_result in files {
442                let heuristic_score = heuristic_scores.get(&file_result.path).unwrap_or(&0.0);
443                let density = self.calculate_density_score(file_result, *heuristic_score);
444                total_density += density;
445            }
446            
447            // Weight by priority multiplier and file count
448            let demand_score = total_density * quota.priority_multiplier * (files.len() as f64 + 1.0).ln();
449            category_demands.insert(*category, demand_score);
450        }
451        
452        // Distribute remaining budget proportionally to demand
453        let total_demand: f64 = category_demands.values().sum();
454        if total_demand > 0.0 {
455            for (category, demand) in &category_demands {
456                let proportion = demand / total_demand;
457                let additional_budget = (remaining_budget as f64 * proportion) as usize;
458                
459                // Respect maximum budget constraints
460                let quota = &self.category_quotas[category];
461                let max_budget = (self.total_budget as f64 * quota.max_budget_pct / 100.0) as usize;
462                let min_budget = (self.total_budget as f64 * quota.min_budget_pct / 100.0) as usize;
463                
464                // Don't exceed maximum allocation
465                let current_allocation = min_budget + additional_budget;
466                let final_additional = if current_allocation > max_budget {
467                    max_budget.saturating_sub(min_budget)
468                } else {
469                    additional_budget
470                };
471                
472                additional_allocations.insert(*category, final_additional);
473            }
474        }
475        
476        Ok(additional_allocations)
477    }
478    
479    /// Select files within a category using density-greedy algorithm
480    fn select_category_files(
481        &self,
482        category: FileCategory,
483        files: &[QuotaScanResult],
484        allocated_budget: usize,
485        quota: &CategoryQuota,
486        heuristic_scores: &HashMap<String, f64>,
487    ) -> ScribeResult<(Vec<QuotaScanResult>, QuotaAllocation)> {
488        // Calculate density scores for all files in category
489        let mut file_densities = Vec::new();
490        for file_result in files {
491            let heuristic_score = heuristic_scores.get(&file_result.path).unwrap_or(&0.0);
492            let density = self.calculate_density_score(file_result, *heuristic_score);
493            let estimated_tokens = self.estimate_tokens(file_result);
494            file_densities.push((file_result.clone(), density, *heuristic_score, estimated_tokens));
495        }
496        
497        // Sort by density (descending)
498        file_densities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
499        
500        // Greedy selection within budget
501        let mut selected = Vec::new();
502        let mut used_budget = 0;
503        let mut total_importance = 0.0;
504        
505        for (file_result, density, importance, tokens) in &file_densities {
506            if used_budget + tokens <= allocated_budget {
507                selected.push(file_result.clone());
508                used_budget += tokens;
509                total_importance += importance;
510            } else if quota.recall_target > 0.0 {
511                // For categories with recall targets, try to fit more critical files
512                // even if it means going slightly over budget
513                let importance_threshold = self.calculate_importance_threshold(
514                    &file_densities.iter().map(|(_, _, imp, _)| *imp).collect::<Vec<_>>(),
515                    quota.recall_target
516                )?;
517                if *importance >= importance_threshold && used_budget + tokens <= (allocated_budget as f64 * 1.05) as usize {
518                    selected.push(file_result.clone());
519                    used_budget += tokens;
520                    total_importance += importance;
521                }
522            }
523        }
524        
525        // Calculate achieved recall
526        let achieved_recall = if quota.recall_target > 0.0 && !files.is_empty() {
527            // Recall = selected high-importance files / total high-importance files
528            let importance_scores: Vec<f64> = files.iter()
529                .map(|f| heuristic_scores.get(&f.path).unwrap_or(&0.0))
530                .cloned()
531                .collect();
532            let importance_threshold = self.calculate_importance_threshold(&importance_scores, quota.recall_target)?;
533            
534            let high_importance_files: Vec<_> = files.iter()
535                .filter(|f| heuristic_scores.get(&f.path).unwrap_or(&0.0) >= &importance_threshold)
536                .collect();
537                
538            let selected_high_importance: Vec<_> = selected.iter()
539                .filter(|f| heuristic_scores.get(&f.path).unwrap_or(&0.0) >= &importance_threshold)
540                .collect();
541                
542            selected_high_importance.len() as f64 / high_importance_files.len().max(1) as f64
543        } else {
544            selected.len() as f64 / files.len().max(1) as f64  // Selection ratio
545        };
546        
547        // Calculate density score for selected set
548        let density_score = if used_budget > 0 {
549            total_importance / used_budget as f64
550        } else {
551            0.0
552        };
553        
554        let allocation = QuotaAllocation {
555            category,
556            allocated_budget,
557            used_budget,
558            file_count: selected.len(),
559            recall_achieved: achieved_recall,
560            density_score,
561        };
562        
563        Ok((selected, allocation))
564    }
565    
566    /// Calculate importance threshold for achieving target recall
567    fn calculate_importance_threshold(&self, importance_scores: &[f64], recall_target: f64) -> ScribeResult<f64> {
568        if importance_scores.is_empty() {
569            return Ok(0.0);
570        }
571        
572        // Sort scores in descending order
573        let mut sorted_scores = importance_scores.to_vec();
574        sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
575        
576        // Find threshold that captures top recall_target fraction
577        let target_count = (sorted_scores.len() as f64 * recall_target) as usize;
578        let target_count = target_count.max(1).min(sorted_scores.len());
579        
580        let threshold_index = target_count - 1;
581        Ok(sorted_scores[threshold_index])
582    }
583    
584    /// Main entry point for quotas-based selection
585    pub fn apply_quotas_selection(
586        &self,
587        scan_results: &[QuotaScanResult],
588        heuristic_scores: &HashMap<String, f64>,
589    ) -> ScribeResult<(Vec<QuotaScanResult>, HashMap<FileCategory, QuotaAllocation>)> {
590        // Apply quotas-based selection
591        let categorized_files = self.classify_files(scan_results);
592        self.select_files_density_greedy(&categorized_files, heuristic_scores, 0.0)
593    }
594}
595
596/// Create a QuotaManager instance
597pub fn create_quota_manager(total_budget: usize) -> QuotaManager {
598    QuotaManager::new(total_budget)
599}
scribe_selection/quota.rs

scribe_selection/
quota.rs