use std::cmp::Ordering;
use std::collections::HashSet;
use std::fs;
use std::path::Path;
use std::time::Instant;

use globset::{Glob, GlobSet, GlobSetBuilder};

use crate::report::SelectionMetrics;
use crate::{
    analyze_repository, apply_token_budget_selection, report::ReportFile, Config,
    RepositoryAnalysis,
};
use scribe_core::tokenization::{utils as token_utils, TokenCounter};
use scribe_core::{FileInfo, Result};

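/// Options that control file selection: the token budget, whether to bypass
/// budget-based selection, an optional algorithm label for reporting, and
/// whether to prepend a directory map to the output.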
#[derive(Debug, Clone)]
pub struct SelectionOptions {
    /// Maximum number of tokens to spend on selected content (0 means unlimited).
    pub token_target: usize,
    /// Skip token-budget selection and include every eligible file.
    pub force_traditional: bool,
    /// Optional label reported as the selection algorithm in the metrics.
    pub algorithm_name: Option<String>,
    /// Whether to prepend a generated directory map to the selection.
    pub include_directory_map: bool,
}

impl Default for SelectionOptions {
    fn default() -> Self {
        Self {
            token_target: 128_000,
            force_traditional: false,
            algorithm_name: None,
            include_directory_map: true,
        }
    }
}

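/// The result of running file selection against a repository analysis.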
#[derive(Debug, Clone)]
pub struct SelectionOutcome {
    /// Files chosen for the report, including the optional directory map.
    pub selected_files: Vec<ReportFile>,
    /// `FileInfo` records for the files that passed filtering and budget selection.
    pub selected_file_infos: Vec<FileInfo>,
    /// Summary metrics describing the selection run.
    pub metrics: SelectionMetrics,
    /// Number of files that passed include/exclude filtering.
    pub eligible_file_count: usize,
    /// True when the token budget was not enforced.
    pub unlimited_budget: bool,
}

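/// A repository analysis paired with the file selection derived from it.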
#[derive(Debug, Clone)]
pub struct AnalysisOutcome {
    pub analysis: RepositoryAnalysis,
    pub selection: SelectionOutcome,
}

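/// Analyzes the repository at `repo_path`, then runs file selection on the result.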
pub async fn analyze_and_select<P: AsRef<Path>>(
    repo_path: P,
    config: &Config,
    options: &SelectionOptions,
) -> Result<AnalysisOutcome> {
    let repo_path = repo_path.as_ref();
    let analysis = analyze_repository(repo_path, config).await?;
    let selection = select_from_analysis(repo_path, config, &analysis, options).await?;

    Ok(AnalysisOutcome {
        analysis,
        selection,
    })
}

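/// Selects report files from an existing `RepositoryAnalysis`, honoring the
/// include patterns from `config` and the token budget from `options`.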
pub async fn select_from_analysis(
    repo_path: &Path,
    config: &Config,
    analysis: &RepositoryAnalysis,
    options: &SelectionOptions,
) -> Result<SelectionOutcome> {
    let selection_start = Instant::now();
    let token_counter = TokenCounter::global();

    let total_files_discovered = analysis.files.len();
    let include_filter = build_include_filter(&config.filtering.include_patterns);

    let filtered_infos: Vec<FileInfo> = analysis
        .files
        .iter()
        .filter(|info| info.decision.should_include())
        .filter(|info| match &include_filter {
            Some(filter) => filter.is_match(info.relative_path.as_str()),
            None => true,
        })
        .cloned()
        .collect();

    let unlimited_budget = options.force_traditional || options.token_target == 0;

    let mut selected_infos = if unlimited_budget {
        filtered_infos.clone()
    } else {
        apply_token_budget_selection(filtered_infos.clone(), options.token_target, config).await?
    };

    selected_infos.sort_by(|a, b| {
        let a_key = a.path.to_string_lossy();
        let b_key = b.path.to_string_lossy();
        let a_score = analysis
            .final_scores
            .get(&a_key.to_string())
            .copied()
            .unwrap_or(0.0);
        let b_score = analysis
            .final_scores
            .get(&b_key.to_string())
            .copied()
            .unwrap_or(0.0);

        b_score
            .partial_cmp(&a_score)
            .unwrap_or(Ordering::Equal)
            .then_with(|| a.relative_path.cmp(&b.relative_path))
    });

    let mut selected_file_infos = selected_infos.clone();

    let mut selected_files = Vec::new();
    let mut budget_consumed = 0usize;

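    // Optionally prepend a generated directory map, charging its token cost
    // against the budget before any file content is added.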
    if options.include_directory_map {
        if let Some(directory_map) = build_directory_map_for_analysis(repo_path, &analysis.files) {
            let map_tokens = directory_map.estimated_tokens;

            if !unlimited_budget {
                budget_consumed = budget_consumed.saturating_add(map_tokens);

                if map_tokens > options.token_target && std::env::var("SCRIBE_DEBUG").is_ok() {
                    eprintln!(
                        "Directory map ({} tokens) exceeds the token budget {}; proceeding regardless",
                        map_tokens, options.token_target
                    );
                }
            }

            selected_files.push(directory_map);
        }
    }

    for info in selected_infos {
        let mut content = info.content.clone();
        if content.is_none() && !info.is_binary {
            if let Ok(read) = fs::read_to_string(&info.path) {
                content = Some(read);
            }
        }

        let text = content.unwrap_or_else(|| String::from("<binary or unavailable content>"));
        let estimated_tokens = info.token_estimate.unwrap_or_else(|| {
            token_counter
                .estimate_file_tokens(&text, &info.path)
                .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&text))
                .max(1)
        });

        if !unlimited_budget {
            if budget_consumed.saturating_add(estimated_tokens) > options.token_target {
                continue;
            }
            budget_consumed = budget_consumed.saturating_add(estimated_tokens);
        }

        let path_key = info.path.to_string_lossy().to_string();
        let importance_score = analysis.final_scores.get(&path_key).copied().unwrap_or(0.0);

        let display_path = info
            .path
            .strip_prefix(repo_path)
            .unwrap_or(&info.path)
            .to_string_lossy()
            .to_string();

        selected_files.push(ReportFile {
            path: info.path.clone(),
            relative_path: display_path,
            content: text,
            size: info.size,
            estimated_tokens,
            importance_score,
            centrality_score: info.centrality_score.unwrap_or(0.0),
            query_relevance_score: 0.0,
            entry_point_proximity: 0.0,
            content_quality_score: 0.0,
            repository_role_score: 0.0,
            recency_score: 0.0,
            modified: info.modified,
        });
    }

    if selected_files.is_empty() {
        if let Some(first) = filtered_infos.first().or_else(|| analysis.files.first()) {
            let fallback_content = fs::read_to_string(&first.path).unwrap_or_default();
            let estimated_tokens = token_counter
                .estimate_file_tokens(&fallback_content, &first.path)
                .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&fallback_content))
                .max(1);

            let fallback_display = first
                .path
                .strip_prefix(repo_path)
                .unwrap_or(&first.path)
                .to_string_lossy()
                .to_string();

            selected_files.push(ReportFile {
                path: first.path.clone(),
                relative_path: fallback_display.clone(),
                content: fallback_content,
                size: first.size,
                estimated_tokens,
                importance_score: analysis
                    .final_scores
                    .get(&first.path.to_string_lossy().to_string())
                    .copied()
                    .unwrap_or(0.0),
                centrality_score: first.centrality_score.unwrap_or(0.0),
                query_relevance_score: 0.0,
                entry_point_proximity: 0.0,
                content_quality_score: 0.0,
                repository_role_score: 0.0,
                recency_score: 0.0,
                modified: first.modified,
            });
            selected_file_infos.push(first.clone());
        }
    }

    let total_tokens_estimated: usize = selected_files.iter().map(|f| f.estimated_tokens).sum();
    let selection_time_ms = selection_start.elapsed().as_millis();

    let coverage_score = if total_files_discovered > 0 {
        selected_files.len() as f64 / total_files_discovered as f64
    } else {
        1.0
    };

    let relevance_score = if selected_files.is_empty() {
        0.0
    } else {
        selected_files
            .iter()
            .map(|f| f.importance_score)
            .sum::<f64>()
            / selected_files.len() as f64
    };

    let algorithm_label = match (&options.algorithm_name, unlimited_budget) {
        (Some(name), true) => format!("{} (unlimited)", name),
        (Some(name), false) => name.clone(),
        (None, true) => "Tiered (unlimited budget)".to_string(),
        (None, false) => "Tiered (token-budget)".to_string(),
    };

    let metrics = SelectionMetrics {
        total_files_discovered,
        files_selected: selected_files.len(),
        total_tokens_estimated,
        selection_time_ms,
        algorithm_used: algorithm_label,
        coverage_score,
        relevance_score,
    };

    Ok(SelectionOutcome {
        selected_files,
        selected_file_infos,
        metrics,
        eligible_file_count: filtered_infos.len(),
        unlimited_budget,
    })
}

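/// Builds a `GlobSet` from the configured include patterns, or returns `None`
/// when no patterns are configured.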
fn build_include_filter(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }

    let mut builder = GlobSetBuilder::new();
    for pattern in patterns {
        // Patterns that fail to parse are skipped rather than aborting selection.
        if let Ok(glob) = Glob::new(pattern) {
            builder.add(glob);
        }
    }

    builder.build().ok()
}

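/// Renders a directory map for the analyzed files as a synthetic `ReportFile`
/// named `DIRECTORY_MAP.txt`, or returns `None` when there is nothing to list.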
fn build_directory_map_for_analysis(repo_path: &Path, files: &[FileInfo]) -> Option<ReportFile> {
    let inventory = gather_inventory_entries(repo_path, files);
    if inventory.is_empty() {
        return None;
    }

    let directory_map = build_directory_map(&inventory)?;
    if directory_map.is_empty() {
        return None;
    }

    let estimated_tokens = TokenCounter::global()
        .count_tokens(&directory_map)
        .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&directory_map));
    let tokens = estimated_tokens.max(1);
    let size = directory_map.len() as u64;

    Some(ReportFile {
        path: repo_path.join("DIRECTORY_MAP.txt"),
        relative_path: "DIRECTORY_MAP.txt".to_string(),
        content: directory_map,
        size,
        estimated_tokens: tokens,
        importance_score: 1.0,
        centrality_score: 0.0,
        query_relevance_score: 0.0,
        entry_point_proximity: 0.0,
        content_quality_score: 0.0,
        repository_role_score: 0.0,
        recency_score: 0.0,
        modified: None,
    })
}

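/// Collects inventory entries for the repository root and every ancestor
/// directory of the analyzed files. Directory paths come from each file's
/// `relative_path`, so the repository root parameter is not consulted.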
fn gather_inventory_entries(_repo_path: &Path, files: &[FileInfo]) -> Vec<InventoryEntry> {
    if files.is_empty() {
        return Vec::new();
    }

    let mut entries = Vec::with_capacity(files.len() + 16);
    // Root entry; rendered as "." by `build_directory_map`.
    entries.push(InventoryEntry {
        path: String::new(),
    });

    let mut directories: HashSet<String> = HashSet::new();

    for file in files {
        let mut ancestor = Path::new(&file.relative_path).parent();
        while let Some(parent) = ancestor {
            let parent_str = parent.to_string_lossy().to_string();
            if parent_str.is_empty() {
                break;
            }
            directories.insert(parent_str);
            ancestor = parent.parent();
        }
    }

    for dir in directories {
        if dir.is_empty() {
            continue;
        }

        entries.push(InventoryEntry { path: dir });
    }

    entries
}

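/// A single path listed in the directory map; an empty path denotes the repository root.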
#[derive(Debug, Clone)]
struct InventoryEntry {
    path: String,
}

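/// Formats the inventory entries as a sorted, plain-text directory map.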
fn build_directory_map(entries: &[InventoryEntry]) -> Option<String> {
    if entries.is_empty() {
        return None;
    }

    let mut sorted = entries.to_vec();
    sorted.sort_by(|a, b| a.path.cmp(&b.path));

    let mut lines = Vec::with_capacity(sorted.len() + 4);
    lines.push("Repository Directory Map".to_string());
    lines.push("========================".to_string());
    lines.push("Directory".to_string());
    lines.push("---------".to_string());

    for entry in sorted {
        let display_path = if entry.path.is_empty() {
            "."
        } else {
            entry.path.as_str()
        };
        lines.push(display_path.to_string());
    }

    lines.push(String::new());
    Some(lines.join("\n"))
}