scribe_selection/
token_budget.rs

1//! Token budget selection logic previously implemented in the analyzer crate.
2//! This module provides a shared implementation that can be reused by both the
3//! library pipeline and external consumers without duplicating complex logic.
4
5use crate::demotion::{DemotionEngine, FidelityMode};
6use scribe_analysis::heuristics::ScanResult;
7use scribe_core::{
8    tokenization::{TokenBudget, TokenCounter},
9    Config, FileInfo, FileType, Result, ScribeError,
10};
11use scribe_graph::CentralityCalculator;
12use std::collections::HashSet;
13use std::path::Path;
14
15/// Apply the library's tiered token budget selection to a set of files.
16///
17/// The selector prioritizes files in multiple tiers:
18/// 1. Mandatory project metadata (README, config files, entrypoints)
19/// 2. Source files ordered by graph centrality with demotion fallback
20/// 3. Documentation with preference for design/architecture material
21/// 4. Any remaining files while budget remains
22///
23/// The function loads file content and token estimates for the selected files
24/// and will attempt demotion (chunk/signature extraction) when a source file
25/// would otherwise exceed the available budget.
26pub async fn apply_token_budget_selection(
27    files: Vec<FileInfo>,
28    token_budget: usize,
29    config: &Config,
30) -> Result<Vec<FileInfo>> {
31    if std::env::var("SCRIBE_DEBUG").is_ok() {
32        eprintln!(
33            "🎯 Intelligent token budget selection: {} tokens across {} files",
34            token_budget,
35            files.len()
36        );
37    }
38
39    let counter = TokenCounter::global();
40    let mut selected_files = Vec::new();
41
42    // Split files into categories for prioritized selection
43    let (mandatory_files, source_files, doc_files, other_files) = categorize_files(files.clone());
44
45    // Keep a reference to all files for final optimization pass
46    let all_files = files;
47
48    if std::env::var("SCRIBE_DEBUG").is_ok() {
49        eprintln!(
50            "📊 File categories: {} mandatory, {} source, {} docs, {} other",
51            mandatory_files.len(),
52            source_files.len(),
53            doc_files.len(),
54            other_files.len()
55        );
56    }
57
58    let mut budget_tracker = TokenBudget::new(token_budget);
59
60    // Tier 1: Mandatory files (README, project config, main/index files)
61    if std::env::var("SCRIBE_DEBUG").is_ok() {
62        eprintln!("📌 Tier 1: Processing mandatory files");
63    }
64    for file in mandatory_files {
65        if budget_tracker.available() < 1 {
66            if std::env::var("SCRIBE_DEBUG").is_ok() {
67                eprintln!("🛑 Budget exhausted, stopping mandatory file selection");
68            }
69            break;
70        }
71        if let Some(selected_file) =
72            try_include_file_with_budget(file, &counter, &mut budget_tracker).await?
73        {
74            selected_files.push(selected_file);
75        }
76    }
77
78    // Tier 2: Source files (prioritized by centrality)
79    if !source_files.is_empty() && budget_tracker.available() > 0 {
80        if std::env::var("SCRIBE_DEBUG").is_ok() {
81            eprintln!("🧠 Tier 2: Processing source files with centrality analysis");
82        }
83
84        // Calculate centrality scores for source files
85        let calculator = CentralityCalculator::new()?;
86        let mock_scan_results: Vec<_> = source_files
87            .iter()
88            .map(MockScanResult::from_file_info)
89            .collect();
90        let centrality_results = calculator.calculate_centrality(&mock_scan_results)?;
91
92        let mut source_with_centrality: Vec<_> = source_files
93            .into_iter()
94            .map(|mut file| {
95                let centrality_score = centrality_results
96                    .pagerank_scores
97                    .get(&file.relative_path)
98                    .copied()
99                    .unwrap_or(0.0);
100                file.centrality_score = Some(centrality_score);
101                (file, centrality_score)
102            })
103            .collect();
104
105        // Sort by centrality score (highest first)
106        source_with_centrality
107            .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
108
109        if std::env::var("SCRIBE_DEBUG").is_ok() && !source_with_centrality.is_empty() {
110            eprintln!("🔍 Top 10 source files by centrality:");
111            for (i, (file, score)) in source_with_centrality.iter().enumerate().take(10) {
112                eprintln!("  {}. {} (score: {:.6})", i + 1, file.relative_path, score);
113            }
114        }
115
116        for (file, centrality_score) in source_with_centrality {
117            if budget_tracker.available() < 1 {
118                if std::env::var("SCRIBE_DEBUG").is_ok() {
119                    eprintln!("🛑 Budget exhausted, stopping source selection");
120                }
121                break;
122            }
123
124            if let Some(selected_file) = try_include_file_with_budget_and_demotion(
125                file,
126                &counter,
127                &mut budget_tracker,
128                centrality_score,
129            )
130            .await?
131            {
132                if std::env::var("SCRIBE_DEBUG").is_ok() {
133                    eprintln!(
134                        "✅ Selected {} (centrality: {:.4})",
135                        selected_file.relative_path, centrality_score
136                    );
137                }
138                selected_files.push(selected_file);
139            }
140        }
141    }
142
143    // Tier 3: Documentation files
144    if !doc_files.is_empty() && budget_tracker.available() > 0 {
145        if std::env::var("SCRIBE_DEBUG").is_ok() {
146            eprintln!("📚 Tier 3: Processing documentation files");
147        }
148
149        // Sort docs by importance - prioritize architecture/design docs
150        let mut critical_docs = Vec::new();
151        let mut other_docs = Vec::new();
152
153        for file in doc_files {
154            let path_lower = file.relative_path.to_lowercase();
155            if path_lower.contains("architecture")
156                || path_lower.contains("design")
157                || path_lower.contains("api")
158                || path_lower.contains("spec")
159                || path_lower.ends_with("changelog.md")
160                || path_lower.ends_with("contributing.md")
161            {
162                critical_docs.push(file);
163            } else {
164                other_docs.push(file);
165            }
166        }
167
168        // Process critical docs first, then others
169        for file in critical_docs.into_iter().chain(other_docs.into_iter()) {
170            if budget_tracker.available() < 1 {
171                if std::env::var("SCRIBE_DEBUG").is_ok() {
172                    eprintln!("🛑 Budget exhausted, stopping documentation selection");
173                }
174                break;
175            }
176
177            if let Some(selected_file) =
178                try_include_file_with_budget(file, &counter, &mut budget_tracker).await?
179            {
180                selected_files.push(selected_file);
181            }
182        }
183    }
184
185    // Tier 4: Other files (if budget remains)
186    if !other_files.is_empty() && budget_tracker.available() > 0 {
187        if std::env::var("SCRIBE_DEBUG").is_ok() {
188            eprintln!("📄 Tier 4: Processing other files");
189        }
190
191        for file in other_files {
192            if budget_tracker.available() < 1 {
193                if std::env::var("SCRIBE_DEBUG").is_ok() {
194                    eprintln!("🛑 Budget exhausted, stopping other file selection");
195                }
196                break;
197            }
198
199            if let Some(selected_file) =
200                try_include_file_with_budget(file, &counter, &mut budget_tracker).await?
201            {
202                selected_files.push(selected_file);
203            }
204        }
205    }
206
207    // Final optimization pass: try to fill remaining budget with smaller files
208    if budget_tracker.available() > 1 {
209        if std::env::var("SCRIBE_DEBUG").is_ok() {
210            eprintln!(
211                "🔧 Final optimization pass: {} tokens remaining, searching for small files",
212                budget_tracker.available()
213            );
214        }
215
216        let included_paths: HashSet<String> = selected_files
217            .iter()
218            .map(|f| f.relative_path.clone())
219            .collect();
220
221        // Try to find any remaining files that could fit
222        for file in &all_files {
223            if budget_tracker.available() < 1 {
224                break;
225            }
226
227            if included_paths.contains(&file.relative_path) || !file.decision.should_include() {
228                continue;
229            }
230
231            // Quick estimate - try small files that might fit
232            if file.size <= (budget_tracker.available() * 4) as u64 {
233                if let Some(selected_file) =
234                    try_include_file_with_budget(file.clone(), &counter, &mut budget_tracker)
235                        .await?
236                {
237                    if std::env::var("SCRIBE_DEBUG").is_ok() {
238                        eprintln!(
239                            "🎯 Final pass: included {} ({} tokens)",
240                            selected_file.relative_path,
241                            selected_file.token_estimate.unwrap_or(0)
242                        );
243                    }
244                    selected_files.push(selected_file);
245                }
246            }
247        }
248    }
249
250    let tokens_used = token_budget - budget_tracker.available();
251    let utilization = (tokens_used as f64 / token_budget as f64) * 100.0;
252
253    if std::env::var("SCRIBE_DEBUG").is_ok() {
254        eprintln!(
255            "✅ Selected {} files ({} tokens / {} budget, {:.1}% utilized)",
256            selected_files.len(),
257            tokens_used,
258            token_budget,
259            utilization
260        );
261
262        if utilization < 90.0 {
263            eprintln!(
264                "⚠️  Budget utilization below 90% - {} tokens unused",
265                budget_tracker.available()
266            );
267        }
268    }
269
270    Ok(selected_files)
271}
272
273fn categorize_files(
274    files: Vec<FileInfo>,
275) -> (Vec<FileInfo>, Vec<FileInfo>, Vec<FileInfo>, Vec<FileInfo>) {
276    let mut mandatory = Vec::new();
277    let mut source = Vec::new();
278    let mut docs = Vec::new();
279    let mut other = Vec::new();
280
281    for file in files {
282        if !file.decision.should_include() {
283            continue;
284        }
285
286        if is_mandatory_file(&file) {
287            mandatory.push(file);
288        } else if matches!(file.file_type, FileType::Source { .. }) {
289            source.push(file);
290        } else if matches!(file.file_type, FileType::Documentation { .. }) {
291            docs.push(file);
292        } else {
293            other.push(file);
294        }
295    }
296
297    (mandatory, source, docs, other)
298}
299
300fn is_mandatory_file(file: &FileInfo) -> bool {
301    let path = file.relative_path.to_lowercase();
302
303    // Skip files in dependency/build directories
304    if path.contains("node_modules/")
305        || path.contains("target/")
306        || path.contains("vendor/")
307        || path.contains(".git/")
308        || path.contains("__pycache__/")
309        || path.contains("build/")
310        || path.contains("dist/")
311        || path.contains(".cache/")
312    {
313        return false;
314    }
315
316    // README files (only in project root and first-level directories)
317    if path.contains("readme") {
318        let depth = path.matches('/').count();
319        return depth <= 1;
320    }
321
322    // Project configuration files (only at root level)
323    if !path.contains('/')
324        && matches!(
325            path.as_str(),
326            "package.json"
327                | "cargo.toml"
328                | "pyproject.toml"
329                | "requirements.txt"
330                | "go.mod"
331                | "pom.xml"
332                | "build.gradle"
333                | "composer.json"
334                | "tsconfig.json"
335                | ".gitignore"
336                | "dockerfile"
337                | "docker-compose.yml"
338        )
339    {
340        return true;
341    }
342
343    // Main/index files in root or src
344    if (path.starts_with("src/") || path.starts_with("lib/") || !path.contains('/'))
345        && (path.contains("main") || path.contains("index"))
346    {
347        return true;
348    }
349
350    false
351}
352
353async fn try_include_file_with_budget(
354    mut file: FileInfo,
355    counter: &TokenCounter,
356    budget_tracker: &mut TokenBudget,
357) -> Result<Option<FileInfo>> {
358    match load_file_content_safe(&file.path) {
359        Ok(content) => match counter.estimate_file_tokens(&content, &file.path) {
360            Ok(token_count) => {
361                if budget_tracker.can_allocate(token_count) {
362                    budget_tracker.allocate(token_count);
363                    file.content = Some(content);
364                    file.token_estimate = Some(token_count);
365                    file.char_count = Some(file.content.as_ref().unwrap().chars().count());
366                    file.line_count = Some(file.content.as_ref().unwrap().lines().count());
367                    Ok(Some(file))
368                } else {
369                    if std::env::var("SCRIBE_DEBUG").is_ok() {
370                        eprintln!(
371                            "⚠️  Skipping {} ({} tokens) - would exceed budget",
372                            file.relative_path, token_count
373                        );
374                    }
375                    Ok(None)
376                }
377            }
378            Err(e) => {
379                if std::env::var("SCRIBE_DEBUG").is_ok() {
380                    eprintln!(
381                        "⚠️  Failed to estimate tokens for {}: {}",
382                        file.relative_path, e
383                    );
384                }
385                Ok(None)
386            }
387        },
388        Err(e) => {
389            if std::env::var("SCRIBE_DEBUG").is_ok() {
390                eprintln!("⚠️  Failed to read {}: {}", file.relative_path, e);
391            }
392            Ok(None)
393        }
394    }
395}
396
397async fn try_include_file_with_budget_and_demotion(
398    mut file: FileInfo,
399    counter: &TokenCounter,
400    budget_tracker: &mut TokenBudget,
401    centrality_score: f64,
402) -> Result<Option<FileInfo>> {
403    match load_file_content_safe(&file.path) {
404        Ok(content) => match counter.estimate_file_tokens(&content, &file.path) {
405            Ok(full_tokens) => {
406                // Try full content first
407                if budget_tracker.can_allocate(full_tokens) {
408                    budget_tracker.allocate(full_tokens);
409                    file.content = Some(content);
410                    file.token_estimate = Some(full_tokens);
411                    file.char_count = Some(file.content.as_ref().unwrap().chars().count());
412                    file.line_count = Some(file.content.as_ref().unwrap().lines().count());
413                    return Ok(Some(file));
414                }
415
416                // Full content doesn't fit - try demotion for source files
417                if matches!(file.file_type, FileType::Source { .. }) {
418                    if std::env::var("SCRIBE_DEBUG").is_ok() {
419                        eprintln!(
420                            "🔧 Trying demotion for {} ({} tokens → chunks/signatures)",
421                            file.relative_path, full_tokens
422                        );
423                    }
424
425                    if let Ok(mut demotion_engine) = DemotionEngine::new() {
426                        if let Ok(chunk_result) = demotion_engine.demote_content(
427                            &content,
428                            &file.relative_path,
429                            FidelityMode::Chunk,
430                            Some(budget_tracker.available()),
431                        ) {
432                            if budget_tracker.can_allocate(chunk_result.demoted_tokens) {
433                                budget_tracker.allocate(chunk_result.demoted_tokens);
434                                file.content = Some(chunk_result.content);
435                                file.token_estimate = Some(chunk_result.demoted_tokens);
436                                file.char_count =
437                                    Some(file.content.as_ref().unwrap().chars().count());
438                                file.line_count =
439                                    Some(file.content.as_ref().unwrap().lines().count());
440                                if std::env::var("SCRIBE_DEBUG").is_ok() {
441                                    eprintln!(
442                                        "✅ Demoted {} to chunks ({} → {} tokens, {:.1}% compression, centrality: {:.4})",
443                                        file.relative_path,
444                                        full_tokens,
445                                        chunk_result.demoted_tokens,
446                                        chunk_result.compression_ratio * 100.0,
447                                        centrality_score
448                                    );
449                                }
450                                return Ok(Some(file));
451                            }
452                        }
453
454                        if let Ok(sig_result) = demotion_engine.demote_content(
455                            &content,
456                            &file.relative_path,
457                            FidelityMode::Signature,
458                            None,
459                        ) {
460                            if budget_tracker.can_allocate(sig_result.demoted_tokens) {
461                                budget_tracker.allocate(sig_result.demoted_tokens);
462                                file.content = Some(sig_result.content);
463                                file.token_estimate = Some(sig_result.demoted_tokens);
464                                file.char_count =
465                                    Some(file.content.as_ref().unwrap().chars().count());
466                                file.line_count =
467                                    Some(file.content.as_ref().unwrap().lines().count());
468                                if std::env::var("SCRIBE_DEBUG").is_ok() {
469                                    eprintln!(
470                                        "✅ Demoted {} to signatures ({} → {} tokens, {:.1}% compression, centrality: {:.4})",
471                                        file.relative_path,
472                                        full_tokens,
473                                        sig_result.demoted_tokens,
474                                        sig_result.compression_ratio * 100.0,
475                                        centrality_score
476                                    );
477                                }
478                                return Ok(Some(file));
479                            }
480                        }
481                    }
482                }
483
484                if std::env::var("SCRIBE_DEBUG").is_ok() {
485                    eprintln!(
486                        "⚠️  Skipping {} ({} tokens) - no demotion method fits budget",
487                        file.relative_path, full_tokens
488                    );
489                }
490                Ok(None)
491            }
492            Err(e) => {
493                if std::env::var("SCRIBE_DEBUG").is_ok() {
494                    eprintln!(
495                        "⚠️  Failed to estimate tokens for {}: {}",
496                        file.relative_path, e
497                    );
498                }
499                Ok(None)
500            }
501        },
502        Err(e) => {
503            if std::env::var("SCRIBE_DEBUG").is_ok() {
504                eprintln!("⚠️  Failed to read {}: {}", file.relative_path, e);
505            }
506            Ok(None)
507        }
508    }
509}
510
511struct MockScanResult {
512    path: String,
513    relative_path: String,
514    centrality_score: Option<f64>,
515}
516
517impl MockScanResult {
518    fn from_file_info(file: &FileInfo) -> Self {
519        Self {
520            path: file.path.to_string_lossy().to_string(),
521            relative_path: file.relative_path.clone(),
522            centrality_score: file.centrality_score,
523        }
524    }
525}
526
527impl ScanResult for MockScanResult {
528    fn path(&self) -> &str {
529        &self.path
530    }
531
532    fn relative_path(&self) -> &str {
533        &self.relative_path
534    }
535
536    fn depth(&self) -> usize {
537        self.relative_path.matches('/').count()
538    }
539
540    fn is_docs(&self) -> bool {
541        false
542    }
543
544    fn is_readme(&self) -> bool {
545        self.relative_path.to_lowercase().contains("readme")
546    }
547
548    fn is_entrypoint(&self) -> bool {
549        self.relative_path.contains("main") || self.relative_path.contains("index")
550    }
551
552    fn has_examples(&self) -> bool {
553        self.relative_path.contains("example")
554    }
555
556    fn is_test(&self) -> bool {
557        self.relative_path.contains("test")
558    }
559
560    fn priority_boost(&self) -> f64 {
561        0.0
562    }
563
564    fn churn_score(&self) -> f64 {
565        0.0
566    }
567
568    fn centrality_in(&self) -> f64 {
569        self.centrality_score.unwrap_or(0.0)
570    }
571
572    fn imports(&self) -> Option<&[String]> {
573        None
574    }
575
576    fn doc_analysis(&self) -> Option<&scribe_analysis::heuristics::DocumentAnalysis> {
577        None
578    }
579}
580
581fn load_file_content_safe(path: &Path) -> Result<String> {
582    std::fs::read_to_string(path)
583        .map_err(|e| ScribeError::io(format!("Failed to read file {}: {}", path.display(), e), e))
584}