1use crate::demotion::{DemotionEngine, FidelityMode};
6use scribe_analysis::heuristics::ScanResult;
7use scribe_core::{
8 tokenization::{TokenBudget, TokenCounter},
9 Config, FileInfo, FileType, Result, ScribeError,
10};
11use scribe_graph::CentralityCalculator;
12use std::collections::HashSet;
13use std::path::Path;
14
15pub async fn apply_token_budget_selection(
27 files: Vec<FileInfo>,
28 token_budget: usize,
29 config: &Config,
30) -> Result<Vec<FileInfo>> {
31 if std::env::var("SCRIBE_DEBUG").is_ok() {
32 eprintln!(
33 "🎯 Intelligent token budget selection: {} tokens across {} files",
34 token_budget,
35 files.len()
36 );
37 }
38
39 let counter = TokenCounter::global();
40 let mut selected_files = Vec::new();
41
42 let (mandatory_files, source_files, doc_files, other_files) = categorize_files(files.clone());
44
45 let all_files = files;
47
48 if std::env::var("SCRIBE_DEBUG").is_ok() {
49 eprintln!(
50 "📊 File categories: {} mandatory, {} source, {} docs, {} other",
51 mandatory_files.len(),
52 source_files.len(),
53 doc_files.len(),
54 other_files.len()
55 );
56 }
57
58 let mut budget_tracker = TokenBudget::new(token_budget);
59
60 if std::env::var("SCRIBE_DEBUG").is_ok() {
62 eprintln!("📌 Tier 1: Processing mandatory files");
63 }
64 for file in mandatory_files {
65 if budget_tracker.available() < 1 {
66 if std::env::var("SCRIBE_DEBUG").is_ok() {
67 eprintln!("🛑 Budget exhausted, stopping mandatory file selection");
68 }
69 break;
70 }
71 if let Some(selected_file) =
72 try_include_file_with_budget(file, &counter, &mut budget_tracker).await?
73 {
74 selected_files.push(selected_file);
75 }
76 }
77
78 if !source_files.is_empty() && budget_tracker.available() > 0 {
80 if std::env::var("SCRIBE_DEBUG").is_ok() {
81 eprintln!("🧠 Tier 2: Processing source files with centrality analysis");
82 }
83
84 let calculator = CentralityCalculator::new()?;
86 let mock_scan_results: Vec<_> = source_files
87 .iter()
88 .map(MockScanResult::from_file_info)
89 .collect();
90 let centrality_results = calculator.calculate_centrality(&mock_scan_results)?;
91
92 let mut source_with_centrality: Vec<_> = source_files
93 .into_iter()
94 .map(|mut file| {
95 let centrality_score = centrality_results
96 .pagerank_scores
97 .get(&file.relative_path)
98 .copied()
99 .unwrap_or(0.0);
100 file.centrality_score = Some(centrality_score);
101 (file, centrality_score)
102 })
103 .collect();
104
105 source_with_centrality
107 .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
108
109 if std::env::var("SCRIBE_DEBUG").is_ok() && !source_with_centrality.is_empty() {
110 eprintln!("🔍 Top 10 source files by centrality:");
111 for (i, (file, score)) in source_with_centrality.iter().enumerate().take(10) {
112 eprintln!(" {}. {} (score: {:.6})", i + 1, file.relative_path, score);
113 }
114 }
115
116 for (file, centrality_score) in source_with_centrality {
117 if budget_tracker.available() < 1 {
118 if std::env::var("SCRIBE_DEBUG").is_ok() {
119 eprintln!("🛑 Budget exhausted, stopping source selection");
120 }
121 break;
122 }
123
124 if let Some(selected_file) = try_include_file_with_budget_and_demotion(
125 file,
126 &counter,
127 &mut budget_tracker,
128 centrality_score,
129 )
130 .await?
131 {
132 if std::env::var("SCRIBE_DEBUG").is_ok() {
133 eprintln!(
134 "✅ Selected {} (centrality: {:.4})",
135 selected_file.relative_path, centrality_score
136 );
137 }
138 selected_files.push(selected_file);
139 }
140 }
141 }
142
143 if !doc_files.is_empty() && budget_tracker.available() > 0 {
145 if std::env::var("SCRIBE_DEBUG").is_ok() {
146 eprintln!("📚 Tier 3: Processing documentation files");
147 }
148
149 let mut critical_docs = Vec::new();
151 let mut other_docs = Vec::new();
152
153 for file in doc_files {
154 let path_lower = file.relative_path.to_lowercase();
155 if path_lower.contains("architecture")
156 || path_lower.contains("design")
157 || path_lower.contains("api")
158 || path_lower.contains("spec")
159 || path_lower.ends_with("changelog.md")
160 || path_lower.ends_with("contributing.md")
161 {
162 critical_docs.push(file);
163 } else {
164 other_docs.push(file);
165 }
166 }
167
168 for file in critical_docs.into_iter().chain(other_docs.into_iter()) {
170 if budget_tracker.available() < 1 {
171 if std::env::var("SCRIBE_DEBUG").is_ok() {
172 eprintln!("🛑 Budget exhausted, stopping documentation selection");
173 }
174 break;
175 }
176
177 if let Some(selected_file) =
178 try_include_file_with_budget(file, &counter, &mut budget_tracker).await?
179 {
180 selected_files.push(selected_file);
181 }
182 }
183 }
184
185 if !other_files.is_empty() && budget_tracker.available() > 0 {
187 if std::env::var("SCRIBE_DEBUG").is_ok() {
188 eprintln!("📄 Tier 4: Processing other files");
189 }
190
191 for file in other_files {
192 if budget_tracker.available() < 1 {
193 if std::env::var("SCRIBE_DEBUG").is_ok() {
194 eprintln!("🛑 Budget exhausted, stopping other file selection");
195 }
196 break;
197 }
198
199 if let Some(selected_file) =
200 try_include_file_with_budget(file, &counter, &mut budget_tracker).await?
201 {
202 selected_files.push(selected_file);
203 }
204 }
205 }
206
207 if budget_tracker.available() > 1 {
209 if std::env::var("SCRIBE_DEBUG").is_ok() {
210 eprintln!(
211 "🔧 Final optimization pass: {} tokens remaining, searching for small files",
212 budget_tracker.available()
213 );
214 }
215
216 let included_paths: HashSet<String> = selected_files
217 .iter()
218 .map(|f| f.relative_path.clone())
219 .collect();
220
221 for file in &all_files {
223 if budget_tracker.available() < 1 {
224 break;
225 }
226
227 if included_paths.contains(&file.relative_path) || !file.decision.should_include() {
228 continue;
229 }
230
231 if file.size <= (budget_tracker.available() * 4) as u64 {
233 if let Some(selected_file) =
234 try_include_file_with_budget(file.clone(), &counter, &mut budget_tracker)
235 .await?
236 {
237 if std::env::var("SCRIBE_DEBUG").is_ok() {
238 eprintln!(
239 "🎯 Final pass: included {} ({} tokens)",
240 selected_file.relative_path,
241 selected_file.token_estimate.unwrap_or(0)
242 );
243 }
244 selected_files.push(selected_file);
245 }
246 }
247 }
248 }
249
250 let tokens_used = token_budget - budget_tracker.available();
251 let utilization = (tokens_used as f64 / token_budget as f64) * 100.0;
252
253 if std::env::var("SCRIBE_DEBUG").is_ok() {
254 eprintln!(
255 "✅ Selected {} files ({} tokens / {} budget, {:.1}% utilized)",
256 selected_files.len(),
257 tokens_used,
258 token_budget,
259 utilization
260 );
261
262 if utilization < 90.0 {
263 eprintln!(
264 "⚠️ Budget utilization below 90% - {} tokens unused",
265 budget_tracker.available()
266 );
267 }
268 }
269
270 Ok(selected_files)
271}
272
273fn categorize_files(
274 files: Vec<FileInfo>,
275) -> (Vec<FileInfo>, Vec<FileInfo>, Vec<FileInfo>, Vec<FileInfo>) {
276 let mut mandatory = Vec::new();
277 let mut source = Vec::new();
278 let mut docs = Vec::new();
279 let mut other = Vec::new();
280
281 for file in files {
282 if !file.decision.should_include() {
283 continue;
284 }
285
286 if is_mandatory_file(&file) {
287 mandatory.push(file);
288 } else if matches!(file.file_type, FileType::Source { .. }) {
289 source.push(file);
290 } else if matches!(file.file_type, FileType::Documentation { .. }) {
291 docs.push(file);
292 } else {
293 other.push(file);
294 }
295 }
296
297 (mandatory, source, docs, other)
298}
299
300fn is_mandatory_file(file: &FileInfo) -> bool {
301 let path = file.relative_path.to_lowercase();
302
303 if path.contains("node_modules/")
305 || path.contains("target/")
306 || path.contains("vendor/")
307 || path.contains(".git/")
308 || path.contains("__pycache__/")
309 || path.contains("build/")
310 || path.contains("dist/")
311 || path.contains(".cache/")
312 {
313 return false;
314 }
315
316 if path.contains("readme") {
318 let depth = path.matches('/').count();
319 return depth <= 1;
320 }
321
322 if !path.contains('/')
324 && matches!(
325 path.as_str(),
326 "package.json"
327 | "cargo.toml"
328 | "pyproject.toml"
329 | "requirements.txt"
330 | "go.mod"
331 | "pom.xml"
332 | "build.gradle"
333 | "composer.json"
334 | "tsconfig.json"
335 | ".gitignore"
336 | "dockerfile"
337 | "docker-compose.yml"
338 )
339 {
340 return true;
341 }
342
343 if (path.starts_with("src/") || path.starts_with("lib/") || !path.contains('/'))
345 && (path.contains("main") || path.contains("index"))
346 {
347 return true;
348 }
349
350 false
351}
352
353async fn try_include_file_with_budget(
354 mut file: FileInfo,
355 counter: &TokenCounter,
356 budget_tracker: &mut TokenBudget,
357) -> Result<Option<FileInfo>> {
358 match load_file_content_safe(&file.path) {
359 Ok(content) => match counter.estimate_file_tokens(&content, &file.path) {
360 Ok(token_count) => {
361 if budget_tracker.can_allocate(token_count) {
362 budget_tracker.allocate(token_count);
363 file.content = Some(content);
364 file.token_estimate = Some(token_count);
365 file.char_count = Some(file.content.as_ref().unwrap().chars().count());
366 file.line_count = Some(file.content.as_ref().unwrap().lines().count());
367 Ok(Some(file))
368 } else {
369 if std::env::var("SCRIBE_DEBUG").is_ok() {
370 eprintln!(
371 "⚠️ Skipping {} ({} tokens) - would exceed budget",
372 file.relative_path, token_count
373 );
374 }
375 Ok(None)
376 }
377 }
378 Err(e) => {
379 if std::env::var("SCRIBE_DEBUG").is_ok() {
380 eprintln!(
381 "⚠️ Failed to estimate tokens for {}: {}",
382 file.relative_path, e
383 );
384 }
385 Ok(None)
386 }
387 },
388 Err(e) => {
389 if std::env::var("SCRIBE_DEBUG").is_ok() {
390 eprintln!("⚠️ Failed to read {}: {}", file.relative_path, e);
391 }
392 Ok(None)
393 }
394 }
395}
396
397async fn try_include_file_with_budget_and_demotion(
398 mut file: FileInfo,
399 counter: &TokenCounter,
400 budget_tracker: &mut TokenBudget,
401 centrality_score: f64,
402) -> Result<Option<FileInfo>> {
403 match load_file_content_safe(&file.path) {
404 Ok(content) => match counter.estimate_file_tokens(&content, &file.path) {
405 Ok(full_tokens) => {
406 if budget_tracker.can_allocate(full_tokens) {
408 budget_tracker.allocate(full_tokens);
409 file.content = Some(content);
410 file.token_estimate = Some(full_tokens);
411 file.char_count = Some(file.content.as_ref().unwrap().chars().count());
412 file.line_count = Some(file.content.as_ref().unwrap().lines().count());
413 return Ok(Some(file));
414 }
415
416 if matches!(file.file_type, FileType::Source { .. }) {
418 if std::env::var("SCRIBE_DEBUG").is_ok() {
419 eprintln!(
420 "🔧 Trying demotion for {} ({} tokens → chunks/signatures)",
421 file.relative_path, full_tokens
422 );
423 }
424
425 if let Ok(mut demotion_engine) = DemotionEngine::new() {
426 if let Ok(chunk_result) = demotion_engine.demote_content(
427 &content,
428 &file.relative_path,
429 FidelityMode::Chunk,
430 Some(budget_tracker.available()),
431 ) {
432 if budget_tracker.can_allocate(chunk_result.demoted_tokens) {
433 budget_tracker.allocate(chunk_result.demoted_tokens);
434 file.content = Some(chunk_result.content);
435 file.token_estimate = Some(chunk_result.demoted_tokens);
436 file.char_count =
437 Some(file.content.as_ref().unwrap().chars().count());
438 file.line_count =
439 Some(file.content.as_ref().unwrap().lines().count());
440 if std::env::var("SCRIBE_DEBUG").is_ok() {
441 eprintln!(
442 "✅ Demoted {} to chunks ({} → {} tokens, {:.1}% compression, centrality: {:.4})",
443 file.relative_path,
444 full_tokens,
445 chunk_result.demoted_tokens,
446 chunk_result.compression_ratio * 100.0,
447 centrality_score
448 );
449 }
450 return Ok(Some(file));
451 }
452 }
453
454 if let Ok(sig_result) = demotion_engine.demote_content(
455 &content,
456 &file.relative_path,
457 FidelityMode::Signature,
458 None,
459 ) {
460 if budget_tracker.can_allocate(sig_result.demoted_tokens) {
461 budget_tracker.allocate(sig_result.demoted_tokens);
462 file.content = Some(sig_result.content);
463 file.token_estimate = Some(sig_result.demoted_tokens);
464 file.char_count =
465 Some(file.content.as_ref().unwrap().chars().count());
466 file.line_count =
467 Some(file.content.as_ref().unwrap().lines().count());
468 if std::env::var("SCRIBE_DEBUG").is_ok() {
469 eprintln!(
470 "✅ Demoted {} to signatures ({} → {} tokens, {:.1}% compression, centrality: {:.4})",
471 file.relative_path,
472 full_tokens,
473 sig_result.demoted_tokens,
474 sig_result.compression_ratio * 100.0,
475 centrality_score
476 );
477 }
478 return Ok(Some(file));
479 }
480 }
481 }
482 }
483
484 if std::env::var("SCRIBE_DEBUG").is_ok() {
485 eprintln!(
486 "⚠️ Skipping {} ({} tokens) - no demotion method fits budget",
487 file.relative_path, full_tokens
488 );
489 }
490 Ok(None)
491 }
492 Err(e) => {
493 if std::env::var("SCRIBE_DEBUG").is_ok() {
494 eprintln!(
495 "⚠️ Failed to estimate tokens for {}: {}",
496 file.relative_path, e
497 );
498 }
499 Ok(None)
500 }
501 },
502 Err(e) => {
503 if std::env::var("SCRIBE_DEBUG").is_ok() {
504 eprintln!("⚠️ Failed to read {}: {}", file.relative_path, e);
505 }
506 Ok(None)
507 }
508 }
509}
510
/// Minimal stand-in for a scan result, carrying only what can be copied from
/// a `FileInfo`.
struct MockScanResult {
    /// Path relative to the scanned root; `/`-separated components are
    /// counted to derive the depth.
    relative_path: String,
    /// File path rendered as a string.
    path: String,
    /// Previously computed centrality score, if any.
    centrality_score: Option<f64>,
}
516
517impl MockScanResult {
518 fn from_file_info(file: &FileInfo) -> Self {
519 Self {
520 path: file.path.to_string_lossy().to_string(),
521 relative_path: file.relative_path.clone(),
522 centrality_score: file.centrality_score,
523 }
524 }
525}
526
527impl ScanResult for MockScanResult {
528 fn path(&self) -> &str {
529 &self.path
530 }
531
532 fn relative_path(&self) -> &str {
533 &self.relative_path
534 }
535
536 fn depth(&self) -> usize {
537 self.relative_path.matches('/').count()
538 }
539
540 fn is_docs(&self) -> bool {
541 false
542 }
543
544 fn is_readme(&self) -> bool {
545 self.relative_path.to_lowercase().contains("readme")
546 }
547
548 fn is_entrypoint(&self) -> bool {
549 self.relative_path.contains("main") || self.relative_path.contains("index")
550 }
551
552 fn has_examples(&self) -> bool {
553 self.relative_path.contains("example")
554 }
555
556 fn is_test(&self) -> bool {
557 self.relative_path.contains("test")
558 }
559
560 fn priority_boost(&self) -> f64 {
561 0.0
562 }
563
564 fn churn_score(&self) -> f64 {
565 0.0
566 }
567
568 fn centrality_in(&self) -> f64 {
569 self.centrality_score.unwrap_or(0.0)
570 }
571
572 fn imports(&self) -> Option<&[String]> {
573 None
574 }
575
576 fn doc_analysis(&self) -> Option<&scribe_analysis::heuristics::DocumentAnalysis> {
577 None
578 }
579}
580
581fn load_file_content_safe(path: &Path) -> Result<String> {
582 std::fs::read_to_string(path)
583 .map_err(|e| ScribeError::io(format!("Failed to read file {}: {}", path.display(), e), e))
584}