//! scribe/pipeline.rs — repository analysis and file-selection pipeline.

1use std::cmp::Ordering;
2use std::collections::HashSet;
3use std::fs;
4use std::path::{Path, PathBuf};
5use std::time::{Instant, SystemTime};
6
7use globset::{Glob, GlobSet, GlobSetBuilder};
8
9use crate::report::SelectionMetrics;
10use crate::{
11    analyze_repository, apply_token_budget_selection, format_timestamp, report::ReportFile, Config,
12    RepositoryAnalysis,
13};
14use scribe_core::tokenization::{utils as token_utils, TokenCounter};
15use scribe_core::{FileInfo, Result};
16
/// Options that steer how file selection behaves when producing analysis
/// reports. Captures the CLI's behaviour while staying general enough for
/// other front-ends (e.g. the web service) to reuse.
#[derive(Debug, Clone)]
pub struct SelectionOptions {
    /// Token budget to stay within; `0` disables the budget entirely.
    pub token_target: usize,
    /// When set, token-budget pruning is skipped and everything is returned.
    pub force_traditional: bool,
    /// Human-friendly label for the active algorithm, surfaced in metrics.
    pub algorithm_name: Option<String>,
    /// Whether the directory inventory map is injected into the final bundle.
    pub include_directory_map: bool,
}

impl Default for SelectionOptions {
    /// Defaults: 128k-token budget, budget pruning enabled, no explicit
    /// algorithm label, and the directory map included.
    fn default() -> Self {
        SelectionOptions {
            token_target: 128_000,
            include_directory_map: true,
            force_traditional: false,
            algorithm_name: None,
        }
    }
}
42
/// Result of running the selection step against a repository analysis.
#[derive(Debug, Clone)]
pub struct SelectionOutcome {
    /// Files that were selected for inclusion in the final bundle.
    pub selected_files: Vec<ReportFile>,
    /// The underlying `FileInfo` records corresponding to the selected files.
    /// NOTE(review): in `select_from_analysis` this is snapshotted before the
    /// per-file budget loop, so it can be a superset of `selected_files`
    /// (files skipped for exceeding the budget remain here) — confirm callers
    /// expect that.
    pub selected_file_infos: Vec<FileInfo>,
    /// Summary statistics describing the selection.
    pub metrics: SelectionMetrics,
    /// Number of files that were eligible after filtering and ignore handling.
    pub eligible_file_count: usize,
    /// Indicates whether a token budget was applied (`true` means no budget).
    pub unlimited_budget: bool,
}
57
/// Combined result containing the raw repository analysis and the derived
/// selection outcome.
#[derive(Debug, Clone)]
pub struct AnalysisOutcome {
    /// The full repository analysis produced by `analyze_repository`.
    pub analysis: RepositoryAnalysis,
    /// The selection derived from `analysis` via `select_from_analysis`.
    pub selection: SelectionOutcome,
}
65
66/// Run a full repository analysis followed by intelligent selection using the
67/// provided configuration.
68pub async fn analyze_and_select<P: AsRef<Path>>(
69    repo_path: P,
70    config: &Config,
71    options: &SelectionOptions,
72) -> Result<AnalysisOutcome> {
73    let repo_path = repo_path.as_ref();
74    let analysis = analyze_repository(repo_path, config).await?;
75    let selection = select_from_analysis(repo_path, config, &analysis, options).await?;
76
77    Ok(AnalysisOutcome {
78        analysis,
79        selection,
80    })
81}
82
/// Derive a selection outcome from an existing repository analysis.
///
/// Pipeline stages, in order:
/// 1. filter analyzed files by inclusion decision and the optional
///    include-pattern glob set,
/// 2. optionally prune the survivors to the token budget,
/// 3. order by descending importance score (ties broken by relative path),
/// 4. materialise `ReportFile`s, charging each against the remaining budget,
/// 5. fall back to a single file when nothing survived, and
/// 6. compute summary metrics.
pub async fn select_from_analysis(
    repo_path: &Path,
    config: &Config,
    analysis: &RepositoryAnalysis,
    options: &SelectionOptions,
) -> Result<SelectionOutcome> {
    let selection_start = Instant::now();
    let token_counter = TokenCounter::global();

    let total_files_discovered = analysis.files.len();
    // `None` means "no include patterns configured" and admits every file.
    let include_filter = build_include_filter(&config.filtering.include_patterns);

    // Stage 1: keep only files the analysis marked for inclusion that also
    // match the include filter (when one exists).
    let filtered_infos: Vec<FileInfo> = analysis
        .files
        .iter()
        .filter(|info| info.decision.should_include())
        .filter(|info| match &include_filter {
            Some(filter) => filter.is_match(info.relative_path.as_str()),
            None => true,
        })
        .cloned()
        .collect();

    // A target of 0 or an explicit "traditional" request disables budgeting.
    let unlimited_budget = options.force_traditional || options.token_target == 0;

    // Stage 2: token-budget pre-selection (skipped entirely when unlimited).
    let mut selected_infos = if unlimited_budget {
        filtered_infos.clone()
    } else {
        apply_token_budget_selection(filtered_infos.clone(), options.token_target, config).await?
    };

    // Stage 3: highest importance first. Missing scores default to 0.0;
    // `partial_cmp` falls back to `Equal` for NaN, and the relative-path tie
    // breaker keeps the ordering deterministic.
    selected_infos.sort_by(|a, b| {
        let a_key = a.path.to_string_lossy();
        let b_key = b.path.to_string_lossy();
        let a_score = analysis
            .final_scores
            .get(&a_key.to_string())
            .copied()
            .unwrap_or(0.0);
        let b_score = analysis
            .final_scores
            .get(&b_key.to_string())
            .copied()
            .unwrap_or(0.0);

        b_score
            .partial_cmp(&a_score)
            .unwrap_or(Ordering::Equal)
            .then_with(|| a.relative_path.cmp(&b.relative_path))
    });

    // NOTE(review): this snapshot is taken BEFORE the budget loop below, so it
    // may contain files the loop later skips for exceeding the budget —
    // confirm this superset semantics is intentional.
    let mut selected_file_infos = selected_infos.clone();

    let mut selected_files = Vec::new();
    let mut budget_consumed = 0usize;

    // Always attempt to include the directory map first so subsequent selection respects
    // the remaining budget. This keeps the structural overview available in every bundle.
    if options.include_directory_map {
        if let Some(directory_map) = build_directory_map_for_analysis(repo_path, &analysis.files) {
            let map_tokens = directory_map.estimated_tokens;

            if !unlimited_budget {
                budget_consumed = budget_consumed.saturating_add(map_tokens);

                // The map is included even when it alone overflows the budget;
                // the overflow is only reported under SCRIBE_DEBUG.
                if map_tokens > options.token_target && std::env::var("SCRIBE_DEBUG").is_ok() {
                    eprintln!(
                        "Directory map ({} tokens) exceeds the token budget {}; proceeding regardless",
                        map_tokens, options.token_target
                    );
                }
            }

            selected_files.push(directory_map);
        }
    }

    // Stage 4: materialise each candidate, charging it against the budget.
    for info in selected_infos {
        // Lazily load content for files the analysis did not cache; binary or
        // unreadable files fall through to a placeholder string below.
        let mut content = info.content.clone();
        if content.is_none() && !info.is_binary {
            if let Ok(read) = fs::read_to_string(&info.path) {
                content = Some(read);
            }
        }

        let text = content.unwrap_or_else(|| String::from("<binary or unavailable content>"));
        // Prefer the analysis-time token estimate; otherwise tokenize now,
        // falling back to the legacy heuristic, never reporting < 1 token.
        let estimated_tokens = info.token_estimate.unwrap_or_else(|| {
            token_counter
                .estimate_file_tokens(&text, &info.path)
                .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&text))
                .max(1)
        });

        // Over-budget files are skipped (not a hard stop) so smaller files
        // ranked later can still fit into the remaining budget.
        if !unlimited_budget {
            if budget_consumed.saturating_add(estimated_tokens) > options.token_target {
                continue;
            }
            budget_consumed = budget_consumed.saturating_add(estimated_tokens);
        }

        let path_key = info.path.to_string_lossy().to_string();
        let importance_score = analysis.final_scores.get(&path_key).copied().unwrap_or(0.0);

        // Display paths are relative to the repository root when possible.
        let display_path = info
            .path
            .strip_prefix(repo_path)
            .unwrap_or(&info.path)
            .to_string_lossy()
            .to_string();

        selected_files.push(ReportFile {
            path: info.path.clone(),
            relative_path: display_path,
            content: text,
            size: info.size,
            estimated_tokens,
            importance_score,
            centrality_score: info.centrality_score.unwrap_or(0.0),
            query_relevance_score: 0.0,
            entry_point_proximity: 0.0,
            content_quality_score: 0.0,
            repository_role_score: 0.0,
            recency_score: 0.0,
            modified: info.modified,
        });
    }

    // Stage 5: never return an empty bundle — fall back to the first filtered
    // (or, failing that, first analyzed) file, ignoring the budget.
    if selected_files.is_empty() {
        if let Some(first) = filtered_infos.first().or_else(|| analysis.files.first()) {
            let fallback_content = fs::read_to_string(&first.path).unwrap_or_default();
            let estimated_tokens = token_counter
                .estimate_file_tokens(&fallback_content, &first.path)
                .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&fallback_content))
                .max(1);

            let fallback_display = first
                .path
                .strip_prefix(repo_path)
                .unwrap_or(&first.path)
                .to_string_lossy()
                .to_string();

            selected_files.push(ReportFile {
                path: first.path.clone(),
                relative_path: fallback_display.clone(),
                content: fallback_content,
                size: first.size,
                estimated_tokens,
                importance_score: analysis
                    .final_scores
                    .get(&first.path.to_string_lossy().to_string())
                    .copied()
                    .unwrap_or(0.0),
                centrality_score: first.centrality_score.unwrap_or(0.0),
                query_relevance_score: 0.0,
                entry_point_proximity: 0.0,
                content_quality_score: 0.0,
                repository_role_score: 0.0,
                recency_score: 0.0,
                modified: first.modified,
            });
            selected_file_infos.push(first.clone());
        }
    }

    // Stage 6: summary metrics.
    let total_tokens_estimated: usize = selected_files.iter().map(|f| f.estimated_tokens).sum();
    // `as_millis()` already returns u128, so this cast is a no-op.
    let selection_time_ms = selection_start.elapsed().as_millis() as u128;

    // Coverage: fraction of discovered files that made the bundle (1.0 for an
    // empty repository, by convention).
    let coverage_score = if total_files_discovered > 0 {
        selected_files.len() as f64 / total_files_discovered as f64
    } else {
        1.0
    };

    // Relevance: mean importance score across the selected files.
    let relevance_score = if selected_files.is_empty() {
        0.0
    } else {
        selected_files
            .iter()
            .map(|f| f.importance_score)
            .sum::<f64>()
            / selected_files.len() as f64
    };

    // Label reflects both the configured algorithm name (if any) and whether
    // the budget was actually applied.
    let algorithm_label = match (&options.algorithm_name, unlimited_budget) {
        (Some(name), true) => format!("{} (unlimited)", name),
        (Some(name), false) => name.clone(),
        (None, true) => "Tiered (unlimited budget)".to_string(),
        (None, false) => "Tiered (token-budget)".to_string(),
    };

    let metrics = SelectionMetrics {
        total_files_discovered,
        files_selected: selected_files.len(),
        total_tokens_estimated,
        selection_time_ms,
        algorithm_used: algorithm_label,
        coverage_score,
        relevance_score,
    };

    Ok(SelectionOutcome {
        selected_files,
        selected_file_infos,
        metrics,
        eligible_file_count: filtered_infos.len(),
        unlimited_budget,
    })
}
293
294fn build_include_filter(patterns: &[String]) -> Option<GlobSet> {
295    if patterns.is_empty() {
296        return None;
297    }
298
299    let mut builder = GlobSetBuilder::new();
300    for pattern in patterns {
301        if let Ok(glob) = Glob::new(pattern) {
302            builder.add(glob);
303        }
304    }
305
306    builder.build().ok()
307}
308
309fn build_directory_map_for_analysis(repo_path: &Path, files: &[FileInfo]) -> Option<ReportFile> {
310    let inventory = gather_inventory_entries(repo_path, files);
311    if inventory.is_empty() {
312        return None;
313    }
314
315    let directory_map = build_directory_map(&inventory)?;
316    if directory_map.is_empty() {
317        return None;
318    }
319
320    let estimated_tokens = TokenCounter::global()
321        .count_tokens(&directory_map)
322        .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&directory_map));
323    let tokens = estimated_tokens.max(1);
324    let size = directory_map.len() as u64;
325
326    Some(ReportFile {
327        path: repo_path.join("DIRECTORY_MAP.txt"),
328        relative_path: "DIRECTORY_MAP.txt".to_string(),
329        content: directory_map,
330        size,
331        estimated_tokens: tokens,
332        importance_score: 1.0,
333        centrality_score: 0.0,
334        query_relevance_score: 0.0,
335        entry_point_proximity: 0.0,
336        content_quality_score: 0.0,
337        repository_role_score: 0.0,
338        recency_score: 0.0,
339        modified: None,
340    })
341}
342
343fn gather_inventory_entries(repo_path: &Path, files: &[FileInfo]) -> Vec<InventoryEntry> {
344    if files.is_empty() {
345        return Vec::new();
346    }
347
348    let mut entries = Vec::with_capacity(files.len() + 16);
349    entries.push(InventoryEntry {
350        path: String::new(),
351    });
352
353    let mut directories: HashSet<String> = HashSet::new();
354
355    for file in files {
356        let mut ancestor = Path::new(&file.relative_path).parent();
357        while let Some(parent) = ancestor {
358            let parent_str = parent.to_string_lossy().to_string();
359            if parent_str.is_empty() {
360                break;
361            }
362            directories.insert(parent_str.clone());
363            ancestor = parent.parent();
364        }
365    }
366
367    for dir in directories {
368        if dir.is_empty() {
369            continue;
370        }
371
372        let dir_path = repo_path.join(&dir);
373        let metadata = fs::metadata(dir_path).ok();
374        let modified = metadata.as_ref().and_then(|meta| meta.modified().ok());
375
376        entries.push(InventoryEntry { path: dir });
377    }
378
379    entries
380}
381
/// One row of the repository directory inventory; an empty `path` denotes the
/// repository root.
#[derive(Debug, Clone)]
struct InventoryEntry {
    path: String,
}

/// Render the inventory as a plain-text directory listing.
///
/// Paths are sorted lexicographically; the empty (root) path is printed as
/// ".". Returns `None` when there are no entries at all. The output ends with
/// a trailing newline.
fn build_directory_map(entries: &[InventoryEntry]) -> Option<String> {
    if entries.is_empty() {
        return None;
    }

    // Sort borrowed path slices instead of cloning whole entries; identical
    // strings compare equal, so unstable sorting yields the same listing.
    let mut paths: Vec<&str> = entries.iter().map(|entry| entry.path.as_str()).collect();
    paths.sort_unstable();

    let mut rendered = String::from(
        "Repository Directory Map\n========================\nDirectory\n---------\n",
    );
    for path in paths {
        rendered.push_str(if path.is_empty() { "." } else { path });
        rendered.push('\n');
    }

    Some(rendered)
}