1use rayon::prelude::*;
2use regex::RegexSet;
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6use scribe_analysis::heuristics::ScanResult;
7use scribe_core::{Result as ScribeResult, ScribeError};
8
/// Per-file scan record consumed by the quota system.
///
/// The quota logic in this file reads `path`, `content` and `is_entrypoint`
/// directly; the remaining fields are exposed through the `ScanResult` trait
/// implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuotaScanResult {
    /// File path; matched against the category patterns and used as the
    /// lookup key into the heuristic score map.
    pub path: String,
    /// Returned by `ScanResult::relative_path` (presumably relative to the
    /// repository root; TODO confirm against the scanner).
    pub relative_path: String,
    /// Returned by `ScanResult::depth`; the tests in this file use 0 for a
    /// root-level file and 1 for a file one directory down.
    pub depth: usize,
    /// File contents; the byte length drives the token estimate.
    pub content: String,
    /// When true the file is classified as `FileCategory::Entry` unless a
    /// config pattern matches first.
    pub is_entrypoint: bool,
    /// Returned by `ScanResult::priority_boost`; not read by the quota logic.
    pub priority_boost: f64,
    /// Returned by `ScanResult::churn_score`; not read by the quota logic.
    pub churn_score: f64,
    /// Returned by `ScanResult::centrality_in`; not read by the quota logic.
    pub centrality_in: f64,
    /// Optional import list, exposed as a slice by `ScanResult::imports`.
    pub imports: Option<Vec<String>>,
    /// Returned by `ScanResult::is_docs`.
    pub is_docs: bool,
    /// Returned by `ScanResult::is_readme`.
    pub is_readme: bool,
    /// Returned by `ScanResult::is_test`.
    pub is_test: bool,
    /// Returned by `ScanResult::has_examples`.
    pub has_examples: bool,
}
26
27impl ScanResult for QuotaScanResult {
28 fn path(&self) -> &str {
29 &self.path
30 }
31
32 fn relative_path(&self) -> &str {
33 &self.relative_path
34 }
35
36 fn depth(&self) -> usize {
37 self.depth
38 }
39
40 fn is_docs(&self) -> bool {
41 self.is_docs
42 }
43
44 fn is_readme(&self) -> bool {
45 self.is_readme
46 }
47
48 fn is_test(&self) -> bool {
49 self.is_test
50 }
51
52 fn is_entrypoint(&self) -> bool {
53 self.is_entrypoint
54 }
55
56 fn has_examples(&self) -> bool {
57 self.has_examples
58 }
59
60 fn priority_boost(&self) -> f64 {
61 self.priority_boost
62 }
63
64 fn churn_score(&self) -> f64 {
65 self.churn_score
66 }
67
68 fn centrality_in(&self) -> f64 {
69 self.centrality_in
70 }
71
72 fn imports(&self) -> Option<&[String]> {
73 self.imports.as_deref()
74 }
75
76 fn doc_analysis(&self) -> Option<&scribe_analysis::heuristics::DocumentAnalysis> {
77 None }
79}
80
/// Budget category assigned to a file by `CategoryDetector::detect_category`.
///
/// Detection checks the categories in the order Config, Entry, Examples and
/// falls back to General.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileCategory {
    /// The path matched one of the config patterns.
    Config,
    /// The file is flagged as an entrypoint or its name matched an entry pattern.
    Entry,
    /// The path matched one of the examples patterns (which also cover
    /// test, spec and benchmark names).
    Examples,
    /// No pattern matched.
    General,
}
89
90impl FileCategory {
91 pub fn as_str(&self) -> &'static str {
92 match self {
93 FileCategory::Config => "config",
94 FileCategory::Entry => "entry",
95 FileCategory::Examples => "examples",
96 FileCategory::General => "general",
97 }
98 }
99}
100
/// Budget limits and weighting for one `FileCategory`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoryQuota {
    /// Category these limits apply to.
    pub category: FileCategory,
    /// Minimum share of the budget reserved for the category, expressed as a
    /// percentage (the allocation code divides by 100).
    pub min_budget_pct: f64,
    /// Upper bound on the category's share of the total budget, expressed as
    /// a percentage.
    pub max_budget_pct: f64,
    /// Fraction of the category's files (ranked by heuristic score) used to
    /// derive the importance threshold; a value of 0.0 or less disables the
    /// recall-driven selection path.
    pub recall_target: f64,
    /// Factor applied to a file's density score and to the category's demand
    /// when the remaining budget is distributed.
    pub priority_multiplier: f64,
}
110
111impl CategoryQuota {
112 pub fn new(
113 category: FileCategory,
114 min_budget_pct: f64,
115 max_budget_pct: f64,
116 recall_target: f64,
117 priority_multiplier: f64,
118 ) -> Self {
119 Self {
120 category,
121 min_budget_pct,
122 max_budget_pct,
123 recall_target,
124 priority_multiplier,
125 }
126 }
127}
128
/// Outcome of the selection pass for a single category.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuotaAllocation {
    /// Category this allocation describes.
    pub category: FileCategory,
    /// Budget granted to the category (minimum reservation plus its share of
    /// the remaining budget).
    pub allocated_budget: usize,
    /// Sum of the estimated tokens of the selected files.
    pub used_budget: usize,
    /// Number of files selected.
    pub file_count: usize,
    /// With a positive recall target: selected high-importance files divided
    /// by all high-importance files. Otherwise: selected files divided by all
    /// files in the category.
    pub recall_achieved: f64,
    /// Total heuristic score of the selected files divided by `used_budget`
    /// (0.0 when nothing was selected).
    pub density_score: f64,
}
139
/// Classifies files into a `FileCategory` using three pre-compiled regex sets.
#[derive(Debug)]
pub struct CategoryDetector {
    /// Patterns for configuration files and tooling directories; checked first.
    config_regex_set: RegexSet,
    /// Patterns for entry-point file names; checked second.
    entry_regex_set: RegexSet,
    /// Patterns for example, demo, test, spec and benchmark paths; checked last.
    examples_regex_set: RegexSet,
}
147
148impl Default for CategoryDetector {
149 fn default() -> Self {
150 Self::new().expect("Failed to create CategoryDetector")
151 }
152}
153
154impl CategoryDetector {
155 pub fn new() -> Result<Self, regex::Error> {
156 let config_patterns = vec![
158 r"\.json$",
160 r"\.yaml$",
161 r"\.yml$",
162 r"\.toml$",
163 r"\.ini$",
164 r"\.cfg$",
165 r"\.conf$",
166 r"package\.json$",
168 r"requirements\.txt$",
169 r"pyproject\.toml$",
170 r"cargo\.toml$",
171 r"setup\.py$",
172 r"setup\.cfg$",
173 r"makefile$",
174 r"dockerfile$",
175 r"docker-compose\.yml$",
176 r"\.github",
178 r"\.gitlab-ci\.yml$",
179 r"\.travis\.yml$",
180 r"\.circleci",
181 r"\.vscode",
183 r"\.idea",
184 r"\.editorconfig$",
185 r"tsconfig\.json$",
186 r"tslint\.json$",
187 r"eslint\.json$",
188 r"\.eslintrc",
189 r"\.prettierrc",
190 r"jest\.config\.js$",
191 ];
192
193 let entry_patterns = vec![
195 r"main\.py$",
196 r"__main__\.py$",
197 r"app\.py$",
198 r"server\.py$",
199 r"index\.py$",
200 r"main\.js$",
201 r"index\.js$",
202 r"app\.js$",
203 r"server\.js$",
204 r"index\.ts$",
205 r"main\.ts$",
206 r"main\.go$",
207 r"main\.rs$",
208 r"lib\.rs$",
209 r"mod\.rs$",
210 ];
211
212 let examples_patterns = vec![
214 r"example",
215 r"examples",
216 r"demo",
217 r"demos",
218 r"sample",
219 r"samples",
220 r"tutorial",
221 r"tutorials",
222 r"test",
223 r"tests",
224 r"spec",
225 r"specs",
226 r"benchmark",
227 r"benchmarks",
228 ];
229
230 Ok(Self {
231 config_regex_set: RegexSet::new(&config_patterns)?,
232 entry_regex_set: RegexSet::new(&entry_patterns)?,
233 examples_regex_set: RegexSet::new(&examples_patterns)?,
234 })
235 }
236
237 pub fn detect_category(&self, scan_result: &QuotaScanResult) -> FileCategory {
239 let path = scan_result.path.to_lowercase();
240 let filename = scan_result
241 .path
242 .split('/')
243 .last()
244 .unwrap_or("")
245 .to_lowercase();
246
247 if self.config_regex_set.is_match(&path) || self.config_regex_set.is_match(&filename) {
249 return FileCategory::Config;
250 }
251
252 if scan_result.is_entrypoint || self.entry_regex_set.is_match(&filename) {
254 return FileCategory::Entry;
255 }
256
257 if self.examples_regex_set.is_match(&path) || self.examples_regex_set.is_match(&filename) {
259 return FileCategory::Examples;
260 }
261
262 FileCategory::General
263 }
264}
265
/// Distributes a token budget across file categories and selects files
/// within each category's share.
#[derive(Debug)]
pub struct QuotaManager {
    /// Overall budget, in estimated tokens.
    pub total_budget: usize,
    /// Detector used to assign each file to a category.
    pub detector: CategoryDetector,
    /// Quota settings keyed by category.
    pub category_quotas: HashMap<FileCategory, CategoryQuota>,
}
273
274impl QuotaManager {
275 pub fn new(total_budget: usize) -> ScribeResult<Self> {
276 let mut category_quotas = HashMap::new();
277
278 category_quotas.insert(
280 FileCategory::Config,
281 CategoryQuota::new(
282 FileCategory::Config,
283 15.0, 30.0, 0.95, 2.0, ),
288 );
289
290 category_quotas.insert(
291 FileCategory::Entry,
292 CategoryQuota::new(
293 FileCategory::Entry,
294 2.0, 7.0, 0.90, 1.8, ),
299 );
300
301 category_quotas.insert(
302 FileCategory::Examples,
303 CategoryQuota::new(
304 FileCategory::Examples,
305 1.0, 3.0, 0.0, 0.5, ),
310 );
311
312 category_quotas.insert(
313 FileCategory::General,
314 CategoryQuota::new(
315 FileCategory::General,
316 60.0, 82.0, 0.0, 1.0, ),
321 );
322
323 Ok(Self {
324 total_budget,
325 detector: CategoryDetector::new().map_err(|e| {
326 ScribeError::parse(format!("Failed to create category detector: {}", e))
327 })?,
328 category_quotas,
329 })
330 }
331
332 pub fn classify_files<'a>(
334 &self,
335 scan_results: &'a [QuotaScanResult],
336 ) -> HashMap<FileCategory, Vec<&'a QuotaScanResult>> {
337 let mut categorized = HashMap::new();
338
339 for result in scan_results {
340 let category = self.detector.detect_category(result);
341 categorized
342 .entry(category)
343 .or_insert_with(Vec::new)
344 .push(result);
345 }
346
347 categorized
348 }
349
350 pub fn calculate_density_score(
353 &self,
354 scan_result: &QuotaScanResult,
355 heuristic_score: f64,
356 ) -> f64 {
357 let estimated_tokens = self.estimate_tokens(scan_result);
359
360 let estimated_tokens = if estimated_tokens == 0 {
362 1
363 } else {
364 estimated_tokens
365 };
366
367 let mut density = heuristic_score / estimated_tokens as f64;
368
369 let category = self.detector.detect_category(scan_result);
371 if let Some(quota) = self.category_quotas.get(&category) {
372 density *= quota.priority_multiplier;
373 }
374
375 density
376 }
377
378 fn estimate_tokens(&self, scan_result: &QuotaScanResult) -> usize {
380 (scan_result.content.len() / 3).max(1)
383 }
384
385 pub fn select_files_density_greedy(
387 &self,
388 categorized_files: &HashMap<FileCategory, Vec<&QuotaScanResult>>,
389 heuristic_scores: &HashMap<String, f64>,
390 adaptation_factor: f64,
391 ) -> ScribeResult<(Vec<QuotaScanResult>, HashMap<FileCategory, QuotaAllocation>)> {
392 let mut selected_files = Vec::new();
393 let mut allocations = HashMap::new();
394
395 let effective_budget = if adaptation_factor > 0.4 {
397 (self.total_budget as f64 * (1.0 - adaptation_factor * 0.3)) as usize
399 } else {
400 self.total_budget
401 };
402
403 let mut remaining_budget = effective_budget;
404
405 let mut min_allocations = HashMap::new();
407 for (category, quota) in &self.category_quotas {
408 if !categorized_files.contains_key(category) {
409 continue;
410 }
411
412 let min_budget = (effective_budget as f64 * quota.min_budget_pct / 100.0) as usize;
413 min_allocations.insert(*category, min_budget);
414 remaining_budget = remaining_budget.saturating_sub(min_budget);
415 }
416
417 let additional_allocations = self.distribute_remaining_budget(
419 categorized_files,
420 heuristic_scores,
421 remaining_budget,
422 )?;
423
424 for (category, files) in categorized_files {
426 if !self.category_quotas.contains_key(category) {
427 continue;
428 }
429
430 let quota = &self.category_quotas[category];
431 let allocated_budget = min_allocations.get(category).unwrap_or(&0)
432 + additional_allocations.get(category).unwrap_or(&0);
433
434 let (selected, allocation) = self.select_category_files(
436 *category,
437 files,
438 allocated_budget,
439 quota,
440 heuristic_scores,
441 )?;
442
443 selected_files.extend(selected);
444 allocations.insert(*category, allocation);
445 }
446
447 Ok((selected_files, allocations))
448 }
449
450 fn distribute_remaining_budget(
452 &self,
453 categorized_files: &HashMap<FileCategory, Vec<&QuotaScanResult>>,
454 heuristic_scores: &HashMap<String, f64>,
455 remaining_budget: usize,
456 ) -> ScribeResult<HashMap<FileCategory, usize>> {
457 let mut additional_allocations = HashMap::new();
458
459 let mut category_demands = HashMap::new();
461 for (category, files) in categorized_files {
462 if !self.category_quotas.contains_key(category) {
463 continue;
464 }
465
466 let quota = &self.category_quotas[category];
467
468 let mut total_density = 0.0;
470 for file_result in files {
471 let heuristic_score = heuristic_scores.get(&file_result.path).unwrap_or(&0.0);
472 let density = self.calculate_density_score(file_result, *heuristic_score);
473 total_density += density;
474 }
475
476 let demand_score =
478 total_density * quota.priority_multiplier * (files.len() as f64 + 1.0).ln();
479 category_demands.insert(*category, demand_score);
480 }
481
482 let total_demand: f64 = category_demands.values().sum();
484 if total_demand > 0.0 {
485 for (category, demand) in &category_demands {
486 let proportion = demand / total_demand;
487 let additional_budget = (remaining_budget as f64 * proportion) as usize;
488
489 let quota = &self.category_quotas[category];
491 let max_budget = (self.total_budget as f64 * quota.max_budget_pct / 100.0) as usize;
492 let min_budget = (self.total_budget as f64 * quota.min_budget_pct / 100.0) as usize;
493
494 let current_allocation = min_budget + additional_budget;
496 let final_additional = if current_allocation > max_budget {
497 max_budget.saturating_sub(min_budget)
498 } else {
499 additional_budget
500 };
501
502 additional_allocations.insert(*category, final_additional);
503 }
504 }
505
506 Ok(additional_allocations)
507 }
508
509 fn select_category_files(
511 &self,
512 category: FileCategory,
513 files: &[&QuotaScanResult],
514 allocated_budget: usize,
515 quota: &CategoryQuota,
516 heuristic_scores: &HashMap<String, f64>,
517 ) -> ScribeResult<(Vec<QuotaScanResult>, QuotaAllocation)> {
518 let mut file_densities: Vec<_> = files
520 .par_iter()
521 .map(|file_result| {
522 let heuristic_score = heuristic_scores.get(&file_result.path).unwrap_or(&0.0);
523 let density = self.calculate_density_score(file_result, *heuristic_score);
524 let estimated_tokens = self.estimate_tokens(file_result);
525 (*file_result, density, *heuristic_score, estimated_tokens)
526 })
527 .collect();
528
529 file_densities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
531
532 let mut selected = Vec::new();
534 let mut used_budget = 0;
535 let mut total_importance = 0.0;
536
537 for (file_result, density, importance, tokens) in &file_densities {
538 if used_budget + tokens <= allocated_budget {
539 selected.push((*file_result).clone());
540 used_budget += tokens;
541 total_importance += importance;
542 } else if quota.recall_target > 0.0 {
543 let importance_threshold = self.calculate_importance_threshold(
546 &file_densities
547 .iter()
548 .map(|(_, _, imp, _)| *imp)
549 .collect::<Vec<_>>(),
550 quota.recall_target,
551 )?;
552 if *importance >= importance_threshold
553 && used_budget + tokens <= (allocated_budget as f64 * 1.05) as usize
554 {
555 selected.push((*file_result).clone());
556 used_budget += tokens;
557 total_importance += importance;
558 }
559 }
560 }
561
562 let achieved_recall = if quota.recall_target > 0.0 && !files.is_empty() {
564 let importance_scores: Vec<f64> = files
566 .iter()
567 .map(|f| heuristic_scores.get(&f.path).unwrap_or(&0.0))
568 .cloned()
569 .collect();
570 let importance_threshold =
571 self.calculate_importance_threshold(&importance_scores, quota.recall_target)?;
572
573 let high_importance_files: Vec<_> = files
574 .iter()
575 .filter(|f| heuristic_scores.get(&f.path).unwrap_or(&0.0) >= &importance_threshold)
576 .collect();
577
578 let selected_high_importance: Vec<_> = selected
579 .iter()
580 .filter(|f| heuristic_scores.get(&f.path).unwrap_or(&0.0) >= &importance_threshold)
581 .collect();
582
583 selected_high_importance.len() as f64 / high_importance_files.len().max(1) as f64
584 } else {
585 selected.len() as f64 / files.len().max(1) as f64 };
587
588 let density_score = if used_budget > 0 {
590 total_importance / used_budget as f64
591 } else {
592 0.0
593 };
594
595 let allocation = QuotaAllocation {
596 category,
597 allocated_budget,
598 used_budget,
599 file_count: selected.len(),
600 recall_achieved: achieved_recall,
601 density_score,
602 };
603
604 Ok((selected, allocation))
605 }
606
607 fn calculate_importance_threshold(
609 &self,
610 importance_scores: &[f64],
611 recall_target: f64,
612 ) -> ScribeResult<f64> {
613 if importance_scores.is_empty() {
614 return Ok(0.0);
615 }
616
617 let mut sorted_scores = importance_scores.to_vec();
619 sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
620
621 let target_count = (sorted_scores.len() as f64 * recall_target) as usize;
623 let target_count = target_count.max(1).min(sorted_scores.len());
624
625 let threshold_index = target_count - 1;
626 Ok(sorted_scores[threshold_index])
627 }
628
629 pub fn apply_quotas_selection(
631 &self,
632 scan_results: &[QuotaScanResult],
633 heuristic_scores: &HashMap<String, f64>,
634 ) -> ScribeResult<(Vec<QuotaScanResult>, HashMap<FileCategory, QuotaAllocation>)> {
635 let categorized_files = self.classify_files(scan_results);
637 self.select_files_density_greedy(&categorized_files, heuristic_scores, 0.0)
638 }
639}
640
641pub fn create_quota_manager(total_budget: usize) -> ScribeResult<QuotaManager> {
643 QuotaManager::new(total_budget)
644}
645
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a scan record for `path` with the given depth and content;
    /// every flag is false and every score is 0.0.
    fn scan(path: &str, depth: usize, content: &str) -> QuotaScanResult {
        QuotaScanResult {
            path: path.to_string(),
            relative_path: path.to_string(),
            depth,
            content: content.to_string(),
            is_entrypoint: false,
            priority_boost: 0.0,
            churn_score: 0.0,
            centrality_in: 0.0,
            imports: None,
            is_docs: false,
            is_readme: false,
            is_test: false,
            has_examples: false,
        }
    }

    /// One representative path per category, plus `lib.rs` as a second
    /// entry-point name.
    #[test]
    fn test_category_detection_with_regex_set() {
        let detector = CategoryDetector::new().expect("Failed to create CategoryDetector");

        let cases = [
            ("package.json", 0, "{}", FileCategory::Config),
            ("src/main.rs", 1, "fn main() {}", FileCategory::Entry),
            ("examples/demo.rs", 1, "// demo", FileCategory::Examples),
            ("src/lib.rs", 1, "pub mod utils;", FileCategory::Entry),
            ("src/utils.rs", 1, "pub fn helper() {}", FileCategory::General),
        ];

        for &(path, depth, content, expected) in cases.iter() {
            let record = scan(path, depth, content);
            assert_eq!(detector.detect_category(&record), expected);
        }
    }

    /// The manager keeps the requested budget and registers all four quotas.
    #[test]
    fn test_quota_manager_creation() {
        let manager = QuotaManager::new(1000).expect("Failed to create QuotaManager");
        assert_eq!(manager.total_budget, 1000);
        assert_eq!(manager.category_quotas.len(), 4);
    }

    /// Exercises a locally built entry-pattern set against bare file names
    /// and against a name extracted from a path.
    #[test]
    fn test_regex_patterns_directly() {
        use regex::RegexSet;

        let entry_patterns = vec![
            r"main\.py$",
            r"__main__\.py$",
            r"app\.py$",
            r"server\.py$",
            r"index\.py$",
            r"main\.js$",
            r"index\.js$",
            r"app\.js$",
            r"server\.js$",
            r"index\.ts$",
            r"main\.ts$",
            r"main\.go$",
            r"main\.rs$",
            r"lib\.rs$",
            r"mod\.rs$",
        ];

        let regex_set = RegexSet::new(&entry_patterns).unwrap();

        assert!(
            regex_set.is_match("lib.rs"),
            "lib.rs should match entry patterns"
        );
        assert!(
            regex_set.is_match("main.rs"),
            "main.rs should match entry patterns"
        );

        // Same extraction steps as the detector: last `/` segment, lowercased.
        let path = "src/lib.rs";
        let last_segment = path.split('/').last().unwrap_or("");
        let filename = last_segment.to_lowercase();
        assert_eq!(filename, "lib.rs");
        assert!(
            regex_set.is_match(&filename),
            "Extracted filename should match"
        );
    }
}