use rayon::prelude::*;
use regex::RegexSet;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use scribe_analysis::heuristics::ScanResult;
use scribe_core::{Result as ScribeResult, ScribeError};
/// Per-file scan record consumed by the quota selection system.
///
/// Implements the shared [`ScanResult`] trait so quota logic can plug into
/// the generic heuristics machinery from `scribe_analysis`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuotaScanResult {
    /// Full path of the file; used as the key into heuristic-score maps.
    pub path: String,
    /// Path relative to the scan root.
    pub relative_path: String,
    /// Directory nesting depth of the file.
    pub depth: usize,
    /// Raw file contents; only its byte length is used here (token estimation).
    pub content: String,
    /// Set by upstream analysis when the file is an entrypoint; forces the
    /// `Entry` category in `CategoryDetector::detect_category`.
    pub is_entrypoint: bool,
    // The scores below are produced by earlier heuristic passes and are only
    // exposed through the ScanResult accessors in this module.
    pub priority_boost: f64,
    pub churn_score: f64,
    pub centrality_in: f64,
    /// Imported paths, when import analysis ran; `None` otherwise.
    pub imports: Option<Vec<String>>,
    pub is_docs: bool,
    pub is_readme: bool,
    pub is_test: bool,
    pub has_examples: bool,
}
/// Straight field-accessor implementation of the shared [`ScanResult`]
/// trait; no value is computed or transformed here.
impl ScanResult for QuotaScanResult {
    fn path(&self) -> &str {
        &self.path
    }
    fn relative_path(&self) -> &str {
        &self.relative_path
    }
    fn depth(&self) -> usize {
        self.depth
    }
    fn is_docs(&self) -> bool {
        self.is_docs
    }
    fn is_readme(&self) -> bool {
        self.is_readme
    }
    fn is_test(&self) -> bool {
        self.is_test
    }
    fn is_entrypoint(&self) -> bool {
        self.is_entrypoint
    }
    fn has_examples(&self) -> bool {
        self.has_examples
    }
    fn priority_boost(&self) -> f64 {
        self.priority_boost
    }
    fn churn_score(&self) -> f64 {
        self.churn_score
    }
    fn centrality_in(&self) -> f64 {
        self.centrality_in
    }
    fn imports(&self) -> Option<&[String]> {
        self.imports.as_deref()
    }
    // Quota scan results never carry a document analysis, so this is
    // unconditionally None.
    fn doc_analysis(&self) -> Option<&scribe_analysis::heuristics::DocumentAnalysis> {
        None
    }
}
/// Coarse file classification used to partition the token budget.
/// Detection precedence (see `CategoryDetector::detect_category`):
/// Config, then Entry, then Examples; everything else is General.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileCategory {
    Config,
    Entry,
    Examples,
    General,
}
impl FileCategory {
pub fn as_str(&self) -> &'static str {
match self {
FileCategory::Config => "config",
FileCategory::Entry => "entry",
FileCategory::Examples => "examples",
FileCategory::General => "general",
}
}
}
/// Budget policy for a single [`FileCategory`].
///
/// Formatting fix: the four f64 fields were crammed onto one line,
/// inconsistent with the one-field-per-line style used everywhere else
/// in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoryQuota {
    /// Category this quota applies to.
    pub category: FileCategory,
    /// Minimum share of the budget, in percent (values are divided by 100).
    pub min_budget_pct: f64,
    /// Maximum share of the budget, in percent.
    pub max_budget_pct: f64,
    /// Target fraction of high-importance files to retain; 0.0 disables
    /// the recall-override logic in selection.
    pub recall_target: f64,
    /// Multiplier applied to density scores for files in this category.
    pub priority_multiplier: f64,
}
impl CategoryQuota {
    /// Builds a quota from its raw parameters. Budget percentages are
    /// expressed on a 0-100 scale (callers divide by 100 when applying them).
    pub fn new(
        category: FileCategory,
        min_budget_pct: f64,
        max_budget_pct: f64,
        recall_target: f64,
        priority_multiplier: f64,
    ) -> Self {
        Self {
            category,
            min_budget_pct,
            max_budget_pct,
            recall_target,
            priority_multiplier,
        }
    }
}
/// Per-category report produced by the selection pass.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuotaAllocation {
    /// Category this allocation describes.
    pub category: FileCategory,
    /// Token budget granted to the category (minimum share + demand share).
    pub allocated_budget: usize,
    /// Estimated tokens actually consumed by the selected files.
    pub used_budget: usize,
    /// Number of files selected for this category.
    pub file_count: usize,
    /// Fraction of high-importance files that made the selection
    /// (or selected/total when no recall target is set).
    pub recall_achieved: f64,
    /// Total importance of selected files divided by used budget.
    pub density_score: f64,
}
/// Path-based classifier backed by three pre-compiled [`RegexSet`]s,
/// one per non-General category.
#[derive(Debug)]
pub struct CategoryDetector {
    // Config-file patterns (extensions, build/CI/tool files).
    config_regex_set: RegexSet,
    // Entrypoint filename patterns (main.*, index.*, lib.rs, ...).
    entry_regex_set: RegexSet,
    // Example/test/benchmark substring patterns.
    examples_regex_set: RegexSet,
}
impl Default for CategoryDetector {
    /// Delegates to [`CategoryDetector::new`].
    ///
    /// # Panics
    /// Panics if any of the built-in regex patterns fails to compile —
    /// a programming error, since the patterns are static.
    fn default() -> Self {
        Self::new().expect("Failed to create CategoryDetector")
    }
}
impl CategoryDetector {
    /// Compiles the three category pattern sets.
    ///
    /// # Errors
    /// Returns `regex::Error` if any pattern fails to compile.
    pub fn new() -> Result<Self, regex::Error> {
        // Config files: data/markup extensions plus well-known build,
        // packaging, CI and editor-tooling files.
        const CONFIG_PATTERNS: &[&str] = &[
            r"\.json$",
            r"\.yaml$",
            r"\.yml$",
            r"\.toml$",
            r"\.ini$",
            r"\.cfg$",
            r"\.conf$",
            r"package\.json$",
            r"requirements\.txt$",
            r"pyproject\.toml$",
            r"cargo\.toml$",
            r"setup\.py$",
            r"setup\.cfg$",
            r"makefile$",
            r"dockerfile$",
            r"docker-compose\.yml$",
            r"\.github",
            r"\.gitlab-ci\.yml$",
            r"\.travis\.yml$",
            r"\.circleci",
            r"\.vscode",
            r"\.idea",
            r"\.editorconfig$",
            r"tsconfig\.json$",
            r"tslint\.json$",
            r"eslint\.json$",
            r"\.eslintrc",
            r"\.prettierrc",
            r"jest\.config\.js$",
        ];
        // Conventional entrypoint filenames across Python/JS/TS/Go/Rust.
        const ENTRY_PATTERNS: &[&str] = &[
            r"main\.py$",
            r"__main__\.py$",
            r"app\.py$",
            r"server\.py$",
            r"index\.py$",
            r"main\.js$",
            r"index\.js$",
            r"app\.js$",
            r"server\.js$",
            r"index\.ts$",
            r"main\.ts$",
            r"main\.go$",
            r"main\.rs$",
            r"lib\.rs$",
            r"mod\.rs$",
        ];
        // NOTE: these are unanchored, so they match anywhere in the path
        // (e.g. "contest.rs" contains "test"). Loose by design — confirm
        // before tightening with word boundaries.
        const EXAMPLES_PATTERNS: &[&str] = &[
            r"example",
            r"examples",
            r"demo",
            r"demos",
            r"sample",
            r"samples",
            r"tutorial",
            r"tutorials",
            r"test",
            r"tests",
            r"spec",
            r"specs",
            r"benchmark",
            r"benchmarks",
        ];
        Ok(Self {
            config_regex_set: RegexSet::new(CONFIG_PATTERNS)?,
            entry_regex_set: RegexSet::new(ENTRY_PATTERNS)?,
            examples_regex_set: RegexSet::new(EXAMPLES_PATTERNS)?,
        })
    }
    /// Classifies one scan result.
    ///
    /// Matching is case-insensitive (the path is lowercased first) and the
    /// precedence is Config > Entry > Examples > General. An upstream
    /// `is_entrypoint` flag forces Entry regardless of the filename.
    pub fn detect_category(&self, scan_result: &QuotaScanResult) -> FileCategory {
        let path = scan_result.path.to_lowercase();
        // Final '/'-separated segment; the whole path when no separator exists.
        let filename = path.rsplit('/').next().unwrap_or("");
        if self.config_regex_set.is_match(&path) || self.config_regex_set.is_match(filename) {
            FileCategory::Config
        } else if scan_result.is_entrypoint || self.entry_regex_set.is_match(filename) {
            FileCategory::Entry
        } else if self.examples_regex_set.is_match(&path)
            || self.examples_regex_set.is_match(filename)
        {
            FileCategory::Examples
        } else {
            FileCategory::General
        }
    }
}
/// Orchestrates category-aware, density-greedy file selection under a
/// global token budget.
#[derive(Debug)]
pub struct QuotaManager {
    /// Total token budget available for selection.
    pub total_budget: usize,
    /// Classifier used to assign each file a [`FileCategory`].
    pub detector: CategoryDetector,
    /// Budget policy per category; populated with defaults in `new`.
    pub category_quotas: HashMap<FileCategory, CategoryQuota>,
}
impl QuotaManager {
    /// Builds a manager with the default policy:
    /// Config 15-30% (recall 0.95, x2.0), Entry 2-7% (recall 0.90, x1.8),
    /// Examples 1-3% (x0.5), General 60-82% (x1.0).
    ///
    /// # Errors
    /// Returns a parse error when the category detector's regex sets fail
    /// to compile.
    pub fn new(total_budget: usize) -> ScribeResult<Self> {
        let mut category_quotas = HashMap::new();
        // Arguments: (category, min %, max %, recall target, multiplier).
        category_quotas.insert(
            FileCategory::Config,
            CategoryQuota::new(FileCategory::Config, 15.0, 30.0, 0.95, 2.0),
        );
        category_quotas.insert(
            FileCategory::Entry,
            CategoryQuota::new(FileCategory::Entry, 2.0, 7.0, 0.90, 1.8),
        );
        category_quotas.insert(
            FileCategory::Examples,
            CategoryQuota::new(FileCategory::Examples, 1.0, 3.0, 0.0, 0.5),
        );
        category_quotas.insert(
            FileCategory::General,
            CategoryQuota::new(FileCategory::General, 60.0, 82.0, 0.0, 1.0),
        );
        Ok(Self {
            total_budget,
            detector: CategoryDetector::new().map_err(|e| {
                ScribeError::parse(format!("Failed to create category detector: {}", e))
            })?,
            category_quotas,
        })
    }
    /// Groups scan results by detected [`FileCategory`].
    pub fn classify_files<'a>(
        &self,
        scan_results: &'a [QuotaScanResult],
    ) -> HashMap<FileCategory, Vec<&'a QuotaScanResult>> {
        let mut categorized: HashMap<FileCategory, Vec<&'a QuotaScanResult>> = HashMap::new();
        for result in scan_results {
            let category = self.detector.detect_category(result);
            categorized.entry(category).or_default().push(result);
        }
        categorized
    }
    /// Importance per estimated token, scaled by the file category's
    /// priority multiplier. Higher means more value per token of budget.
    pub fn calculate_density_score(
        &self,
        scan_result: &QuotaScanResult,
        heuristic_score: f64,
    ) -> f64 {
        // `estimate_tokens` clamps to >= 1, so the former divide-by-zero
        // guard here was dead code and has been removed.
        let estimated_tokens = self.estimate_tokens(scan_result);
        let mut density = heuristic_score / estimated_tokens as f64;
        let category = self.detector.detect_category(scan_result);
        if let Some(quota) = self.category_quotas.get(&category) {
            density *= quota.priority_multiplier;
        }
        density
    }
    /// Rough token estimate: ~3 bytes per token, minimum 1.
    fn estimate_tokens(&self, scan_result: &QuotaScanResult) -> usize {
        (scan_result.content.len() / 3).max(1)
    }
    /// Selects files greedily by density within per-category budgets.
    ///
    /// When `adaptation_factor` exceeds 0.4 the effective budget shrinks to
    /// `total * (1 - factor * 0.3)`. Each category that has files is first
    /// guaranteed its minimum share of the effective budget; the remainder
    /// is split by demand via [`Self::distribute_remaining_budget`].
    ///
    /// Returns the selected files plus a per-category allocation report.
    pub fn select_files_density_greedy(
        &self,
        categorized_files: &HashMap<FileCategory, Vec<&QuotaScanResult>>,
        heuristic_scores: &HashMap<String, f64>,
        adaptation_factor: f64,
    ) -> ScribeResult<(Vec<QuotaScanResult>, HashMap<FileCategory, QuotaAllocation>)> {
        let mut selected_files = Vec::new();
        let mut allocations = HashMap::new();
        let effective_budget = if adaptation_factor > 0.4 {
            (self.total_budget as f64 * (1.0 - adaptation_factor * 0.3)) as usize
        } else {
            self.total_budget
        };
        // Reserve each present category's minimum share up front.
        let mut remaining_budget = effective_budget;
        let mut min_allocations = HashMap::new();
        for (category, quota) in &self.category_quotas {
            if !categorized_files.contains_key(category) {
                continue;
            }
            let min_budget = (effective_budget as f64 * quota.min_budget_pct / 100.0) as usize;
            min_allocations.insert(*category, min_budget);
            remaining_budget = remaining_budget.saturating_sub(min_budget);
        }
        let additional_allocations = self.distribute_remaining_budget(
            categorized_files,
            heuristic_scores,
            remaining_budget,
        )?;
        for (category, files) in categorized_files {
            // Categories without a configured quota are skipped entirely.
            if !self.category_quotas.contains_key(category) {
                continue;
            }
            let quota = &self.category_quotas[category];
            let allocated_budget = min_allocations.get(category).unwrap_or(&0)
                + additional_allocations.get(category).unwrap_or(&0);
            let (selected, allocation) = self.select_category_files(
                *category,
                files,
                allocated_budget,
                quota,
                heuristic_scores,
            )?;
            selected_files.extend(selected);
            allocations.insert(*category, allocation);
        }
        Ok((selected_files, allocations))
    }
    /// Splits `remaining_budget` across categories proportionally to a
    /// demand score (total density x priority multiplier x ln(file count + 1)),
    /// clamped so min + extra never exceeds the category's max share.
    ///
    /// NOTE(review): the min/max caps here are computed from
    /// `self.total_budget`, while the caller reserves minimums from the
    /// (possibly smaller) effective budget — confirm this asymmetry is
    /// intentional when `adaptation_factor > 0.4`.
    fn distribute_remaining_budget(
        &self,
        categorized_files: &HashMap<FileCategory, Vec<&QuotaScanResult>>,
        heuristic_scores: &HashMap<String, f64>,
        remaining_budget: usize,
    ) -> ScribeResult<HashMap<FileCategory, usize>> {
        let mut additional_allocations = HashMap::new();
        let mut category_demands = HashMap::new();
        for (category, files) in categorized_files {
            if !self.category_quotas.contains_key(category) {
                continue;
            }
            let quota = &self.category_quotas[category];
            let total_density: f64 = files
                .iter()
                .map(|file_result| {
                    let heuristic_score = heuristic_scores
                        .get(&file_result.path)
                        .copied()
                        .unwrap_or(0.0);
                    self.calculate_density_score(file_result, heuristic_score)
                })
                .sum();
            let demand_score =
                total_density * quota.priority_multiplier * (files.len() as f64 + 1.0).ln();
            category_demands.insert(*category, demand_score);
        }
        let total_demand: f64 = category_demands.values().sum();
        if total_demand > 0.0 {
            for (category, demand) in &category_demands {
                let proportion = demand / total_demand;
                let additional_budget = (remaining_budget as f64 * proportion) as usize;
                let quota = &self.category_quotas[category];
                let max_budget = (self.total_budget as f64 * quota.max_budget_pct / 100.0) as usize;
                let min_budget = (self.total_budget as f64 * quota.min_budget_pct / 100.0) as usize;
                // Clamp so (min + extra) stays within the category's max share.
                let final_additional = if min_budget + additional_budget > max_budget {
                    max_budget.saturating_sub(min_budget)
                } else {
                    additional_budget
                };
                additional_allocations.insert(*category, final_additional);
            }
        }
        Ok(additional_allocations)
    }
    /// Greedy selection within one category: files are taken in descending
    /// density order while they fit `allocated_budget`; when a recall target
    /// is set, high-importance files may additionally overshoot the budget
    /// by up to 5%.
    fn select_category_files(
        &self,
        category: FileCategory,
        files: &[&QuotaScanResult],
        allocated_budget: usize,
        quota: &CategoryQuota,
        heuristic_scores: &HashMap<String, f64>,
    ) -> ScribeResult<(Vec<QuotaScanResult>, QuotaAllocation)> {
        // (file, density, importance, estimated tokens), scored in parallel.
        let mut file_densities: Vec<_> = files
            .par_iter()
            .map(|file_result| {
                let heuristic_score = heuristic_scores
                    .get(&file_result.path)
                    .copied()
                    .unwrap_or(0.0);
                let density = self.calculate_density_score(file_result, heuristic_score);
                let estimated_tokens = self.estimate_tokens(file_result);
                (*file_result, density, heuristic_score, estimated_tokens)
            })
            .collect();
        file_densities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
        // Hoisted out of the loop: the threshold depends only on the full
        // importance distribution, not on loop state. The previous version
        // recomputed (and re-sorted) it for every over-budget file, which
        // was accidentally O(n^2 log n).
        let importance_threshold = if quota.recall_target > 0.0 {
            let importances: Vec<f64> =
                file_densities.iter().map(|&(_, _, imp, _)| imp).collect();
            self.calculate_importance_threshold(&importances, quota.recall_target)?
        } else {
            0.0
        };
        let mut selected = Vec::new();
        let mut used_budget = 0;
        let mut total_importance = 0.0;
        for (file_result, _density, importance, tokens) in &file_densities {
            if used_budget + tokens <= allocated_budget {
                selected.push((*file_result).clone());
                used_budget += tokens;
                total_importance += importance;
            } else if quota.recall_target > 0.0
                && *importance >= importance_threshold
                && used_budget + tokens <= (allocated_budget as f64 * 1.05) as usize
            {
                // Recall override: keep high-importance files even when the
                // budget is exhausted, up to a 5% overshoot.
                selected.push((*file_result).clone());
                used_budget += tokens;
                total_importance += importance;
            }
        }
        // Achieved recall: fraction of high-importance files selected when a
        // target is set; plain selected/total otherwise. The threshold is the
        // same value computed above (identical score multiset).
        let achieved_recall = if quota.recall_target > 0.0 && !files.is_empty() {
            let high_importance_total = files
                .iter()
                .filter(|f| {
                    heuristic_scores.get(&f.path).copied().unwrap_or(0.0) >= importance_threshold
                })
                .count();
            let high_importance_selected = selected
                .iter()
                .filter(|f| {
                    heuristic_scores.get(&f.path).copied().unwrap_or(0.0) >= importance_threshold
                })
                .count();
            high_importance_selected as f64 / high_importance_total.max(1) as f64
        } else {
            selected.len() as f64 / files.len().max(1) as f64
        };
        let density_score = if used_budget > 0 {
            total_importance / used_budget as f64
        } else {
            0.0
        };
        let allocation = QuotaAllocation {
            category,
            allocated_budget,
            used_budget,
            file_count: selected.len(),
            recall_achieved: achieved_recall,
            density_score,
        };
        Ok((selected, allocation))
    }
    /// Importance value at the recall cutoff: with scores sorted descending,
    /// the score at index `len * recall_target` (clamped to [1, len]) - 1.
    /// Files scoring at or above this value count as "high importance".
    /// Returns 0.0 for an empty slice.
    fn calculate_importance_threshold(
        &self,
        importance_scores: &[f64],
        recall_target: f64,
    ) -> ScribeResult<f64> {
        if importance_scores.is_empty() {
            return Ok(0.0);
        }
        let mut sorted_scores = importance_scores.to_vec();
        // Descending order; incomparable values (NaN) are treated as equal.
        sorted_scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
        let target_count =
            ((sorted_scores.len() as f64 * recall_target) as usize).clamp(1, sorted_scores.len());
        Ok(sorted_scores[target_count - 1])
    }
    /// Convenience wrapper: classify `scan_results` and run density-greedy
    /// selection with no budget adaptation (`adaptation_factor = 0.0`).
    pub fn apply_quotas_selection(
        &self,
        scan_results: &[QuotaScanResult],
        heuristic_scores: &HashMap<String, f64>,
    ) -> ScribeResult<(Vec<QuotaScanResult>, HashMap<FileCategory, QuotaAllocation>)> {
        let categorized_files = self.classify_files(scan_results);
        self.select_files_density_greedy(&categorized_files, heuristic_scores, 0.0)
    }
}
/// Free-function convenience wrapper around [`QuotaManager::new`].
pub fn create_quota_manager(total_budget: usize) -> ScribeResult<QuotaManager> {
    QuotaManager::new(total_budget)
}
#[cfg(test)]
mod tests {
    use super::*;
    /// Builds a minimal `QuotaScanResult` for `path`: all flags false, all
    /// scores zero. Replaces five near-identical 15-line struct literals.
    fn scan(path: &str, depth: usize, content: &str) -> QuotaScanResult {
        QuotaScanResult {
            path: path.to_string(),
            relative_path: path.to_string(),
            depth,
            content: content.to_string(),
            is_entrypoint: false,
            priority_boost: 0.0,
            churn_score: 0.0,
            centrality_in: 0.0,
            imports: None,
            is_docs: false,
            is_readme: false,
            is_test: false,
            has_examples: false,
        }
    }
    #[test]
    fn test_category_detection_with_regex_set() {
        let detector = CategoryDetector::new().expect("Failed to create CategoryDetector");
        assert_eq!(
            detector.detect_category(&scan("package.json", 0, "{}")),
            FileCategory::Config
        );
        assert_eq!(
            detector.detect_category(&scan("src/main.rs", 1, "fn main() {}")),
            FileCategory::Entry
        );
        assert_eq!(
            detector.detect_category(&scan("examples/demo.rs", 1, "// demo")),
            FileCategory::Examples
        );
        assert_eq!(
            detector.detect_category(&scan("src/lib.rs", 1, "pub mod utils;")),
            FileCategory::Entry
        );
        assert_eq!(
            detector.detect_category(&scan("src/utils.rs", 1, "pub fn helper() {}")),
            FileCategory::General
        );
    }
    #[test]
    fn test_quota_manager_creation() {
        let manager = QuotaManager::new(1000).expect("Failed to create QuotaManager");
        assert_eq!(manager.total_budget, 1000);
        assert_eq!(manager.category_quotas.len(), 4);
    }
    #[test]
    fn test_regex_patterns_directly() {
        use regex::RegexSet;
        let entry_patterns = vec![
            r"main\.py$",
            r"__main__\.py$",
            r"app\.py$",
            r"server\.py$",
            r"index\.py$",
            r"main\.js$",
            r"index\.js$",
            r"app\.js$",
            r"server\.js$",
            r"index\.ts$",
            r"main\.ts$",
            r"main\.go$",
            r"main\.rs$",
            r"lib\.rs$",
            r"mod\.rs$",
        ];
        let regex_set = RegexSet::new(&entry_patterns).unwrap();
        assert!(
            regex_set.is_match("lib.rs"),
            "lib.rs should match entry patterns"
        );
        assert!(
            regex_set.is_match("main.rs"),
            "main.rs should match entry patterns"
        );
        let path = "src/lib.rs";
        let filename = path.split('/').last().unwrap_or("").to_lowercase();
        assert_eq!(filename, "lib.rs");
        assert!(
            regex_set.is_match(&filename),
            "Extracted filename should match"
        );
    }
}