use std::cmp::Ordering;
use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::{Instant, SystemTime};
use globset::{Glob, GlobSet, GlobSetBuilder};
use crate::report::SelectionMetrics;
use crate::{
analyze_repository, apply_token_budget_selection, format_timestamp, report::ReportFile, Config,
RepositoryAnalysis,
};
use scribe_core::tokenization::{utils as token_utils, TokenCounter};
use scribe_core::{FileInfo, Result};
/// Options controlling how files are chosen for inclusion in a report.
#[derive(Debug, Clone)]
pub struct SelectionOptions {
    /// Token budget for selected content; `0` disables the budget.
    pub token_target: usize,
    /// When `true`, bypass budget-based selection and include every
    /// eligible file (treated as an unlimited budget).
    pub force_traditional: bool,
    /// Optional label for the selection algorithm, surfaced in
    /// `SelectionMetrics::algorithm_used`.
    pub algorithm_name: Option<String>,
    /// Whether to prepend a synthetic `DIRECTORY_MAP.txt` entry to the
    /// selected files.
    pub include_directory_map: bool,
}
impl Default for SelectionOptions {
    /// Defaults: 128k-token budget, budgeted selection, no explicit
    /// algorithm label, directory map included.
    fn default() -> Self {
        Self {
            token_target: 128_000,
            force_traditional: false,
            algorithm_name: None,
            include_directory_map: true,
        }
    }
}
/// The result of file selection: the chosen report entries plus
/// bookkeeping about how the selection went.
#[derive(Debug, Clone)]
pub struct SelectionOutcome {
    /// Report entries in final order (may start with a synthetic
    /// directory-map entry).
    pub selected_files: Vec<ReportFile>,
    /// `FileInfo` records tracked alongside the selection; populated by
    /// `select_from_analysis`.
    pub selected_file_infos: Vec<FileInfo>,
    /// Timing, coverage, and token statistics for this selection.
    pub metrics: SelectionMetrics,
    /// Number of files that passed eligibility filtering, before any
    /// token-budget trimming.
    pub eligible_file_count: usize,
    /// Whether selection ran without a token budget.
    pub unlimited_budget: bool,
}
/// Bundles a full repository analysis with the file selection derived
/// from it.
#[derive(Debug, Clone)]
pub struct AnalysisOutcome {
    /// The raw repository analysis.
    pub analysis: RepositoryAnalysis,
    /// The selection computed from `analysis`.
    pub selection: SelectionOutcome,
}
/// Convenience wrapper: analyze `repo_path`, then immediately run file
/// selection on the result.
///
/// # Errors
///
/// Propagates failures from either `analyze_repository` or
/// `select_from_analysis`.
pub async fn analyze_and_select<P: AsRef<Path>>(
    repo_path: P,
    config: &Config,
    options: &SelectionOptions,
) -> Result<AnalysisOutcome> {
    let root = repo_path.as_ref();
    let analysis = analyze_repository(root, config).await?;
    let selection = select_from_analysis(root, config, &analysis, options).await?;
    Ok(AnalysisOutcome { analysis, selection })
}
/// Selects report files from a completed analysis, honoring the token
/// budget and ordering files by importance.
///
/// Selection proceeds in stages:
/// 1. keep only files the analyzer marked includable, optionally narrowed
///    by include globs from `config`;
/// 2. unless the budget is unlimited, run budget-aware pre-selection;
/// 3. sort candidates by importance score (descending, ties broken by
///    relative path);
/// 4. optionally prepend a synthetic directory map, whose tokens count
///    against the budget;
/// 5. walk the candidates in order, skipping any file whose estimated
///    tokens would overflow the remaining budget;
/// 6. if nothing at all was selected, fall back to the single first
///    eligible (or first discovered) file so the report is never empty.
///
/// # Errors
///
/// Propagates errors from `apply_token_budget_selection`.
pub async fn select_from_analysis(
    repo_path: &Path,
    config: &Config,
    analysis: &RepositoryAnalysis,
    options: &SelectionOptions,
) -> Result<SelectionOutcome> {
    let selection_start = Instant::now();
    let token_counter = TokenCounter::global();
    let total_files_discovered = analysis.files.len();

    // Stage 1: eligibility filtering.
    let include_filter = build_include_filter(&config.filtering.include_patterns);
    let filtered_infos: Vec<FileInfo> = analysis
        .files
        .iter()
        .filter(|info| info.decision.should_include())
        .filter(|info| match &include_filter {
            Some(filter) => filter.is_match(info.relative_path.as_str()),
            None => true,
        })
        .cloned()
        .collect();

    // A zero target or an explicit "traditional" run disables the budget.
    let unlimited_budget = options.force_traditional || options.token_target == 0;

    // Stage 2: budget-aware pre-selection.
    let mut selected_infos = if unlimited_budget {
        filtered_infos.clone()
    } else {
        apply_token_budget_selection(filtered_infos.clone(), options.token_target, config).await?
    };

    // Importance lookup keyed by the file's full path (lossy UTF-8), as
    // recorded in `analysis.final_scores`; unknown paths score 0.0.
    // Borrowed `&str` lookup avoids the per-comparison String allocation
    // the previous `.get(&key.to_string())` form paid.
    let score_of = |path: &Path| -> f64 {
        analysis
            .final_scores
            .get(&*path.to_string_lossy())
            .copied()
            .unwrap_or(0.0)
    };

    // Stage 3: highest importance first; relative path as a deterministic
    // tie-breaker (also covers NaN scores, which compare as equal here).
    selected_infos.sort_by(|a, b| {
        score_of(&b.path)
            .partial_cmp(&score_of(&a.path))
            .unwrap_or(Ordering::Equal)
            .then_with(|| a.relative_path.cmp(&b.relative_path))
    });

    // Fix: record only the infos that actually make it into the report.
    // Previously this was a clone of every candidate taken before the
    // budget loop, so budget-skipped files still appeared in
    // `selected_file_infos` — inconsistent with the fallback path below,
    // which keeps the two collections in sync.
    let mut selected_file_infos: Vec<FileInfo> = Vec::with_capacity(selected_infos.len());
    let mut selected_files = Vec::new();
    let mut budget_consumed = 0usize;

    // Stage 4: synthetic directory map. Its tokens count against the
    // budget, but the map itself is always included once built.
    if options.include_directory_map {
        if let Some(directory_map) = build_directory_map_for_analysis(repo_path, &analysis.files) {
            let map_tokens = directory_map.estimated_tokens;
            if !unlimited_budget {
                budget_consumed = budget_consumed.saturating_add(map_tokens);
                if map_tokens > options.token_target && std::env::var("SCRIBE_DEBUG").is_ok() {
                    eprintln!(
                        "Directory map ({} tokens) exceeds the token budget {}; proceeding regardless",
                        map_tokens, options.token_target
                    );
                }
            }
            selected_files.push(directory_map);
        }
    }

    // Stage 5: fill the budget in importance order.
    for info in selected_infos {
        // Prefer content captured during analysis; read text files from
        // disk on demand otherwise.
        let mut content = info.content.clone();
        if content.is_none() && !info.is_binary {
            if let Ok(read) = fs::read_to_string(&info.path) {
                content = Some(read);
            }
        }
        let text = content.unwrap_or_else(|| String::from("<binary or unavailable content>"));
        let estimated_tokens = info.token_estimate.unwrap_or_else(|| {
            token_counter
                .estimate_file_tokens(&text, &info.path)
                .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&text))
                .max(1)
        });
        if !unlimited_budget {
            // Skip (rather than stop at) oversized files so smaller,
            // lower-ranked files can still use the remaining budget.
            if budget_consumed.saturating_add(estimated_tokens) > options.token_target {
                continue;
            }
            budget_consumed = budget_consumed.saturating_add(estimated_tokens);
        }
        let display_path = info
            .path
            .strip_prefix(repo_path)
            .unwrap_or(&info.path)
            .to_string_lossy()
            .to_string();
        selected_files.push(ReportFile {
            path: info.path.clone(),
            relative_path: display_path,
            content: text,
            size: info.size,
            estimated_tokens,
            importance_score: score_of(&info.path),
            centrality_score: info.centrality_score.unwrap_or(0.0),
            query_relevance_score: 0.0,
            entry_point_proximity: 0.0,
            content_quality_score: 0.0,
            repository_role_score: 0.0,
            recency_score: 0.0,
            modified: info.modified,
        });
        selected_file_infos.push(info);
    }

    // Stage 6: never return an empty report — fall back to the first
    // eligible (or first discovered) file, ignoring the budget. Note: a
    // directory-map entry alone counts as non-empty, matching the
    // original behavior.
    if selected_files.is_empty() {
        if let Some(first) = filtered_infos.first().or_else(|| analysis.files.first()) {
            let fallback_content = fs::read_to_string(&first.path).unwrap_or_default();
            let estimated_tokens = token_counter
                .estimate_file_tokens(&fallback_content, &first.path)
                .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&fallback_content))
                .max(1);
            let fallback_display = first
                .path
                .strip_prefix(repo_path)
                .unwrap_or(&first.path)
                .to_string_lossy()
                .to_string();
            selected_files.push(ReportFile {
                path: first.path.clone(),
                relative_path: fallback_display,
                content: fallback_content,
                size: first.size,
                estimated_tokens,
                importance_score: score_of(&first.path),
                centrality_score: first.centrality_score.unwrap_or(0.0),
                query_relevance_score: 0.0,
                entry_point_proximity: 0.0,
                content_quality_score: 0.0,
                repository_role_score: 0.0,
                recency_score: 0.0,
                modified: first.modified,
            });
            selected_file_infos.push(first.clone());
        }
    }

    // Metrics are computed over everything pushed, including the
    // directory map.
    let total_tokens_estimated: usize = selected_files.iter().map(|f| f.estimated_tokens).sum();
    // `Duration::as_millis` already returns u128; the previous `as u128`
    // cast was redundant.
    let selection_time_ms = selection_start.elapsed().as_millis();
    let coverage_score = if total_files_discovered > 0 {
        selected_files.len() as f64 / total_files_discovered as f64
    } else {
        1.0
    };
    let relevance_score = if selected_files.is_empty() {
        0.0
    } else {
        selected_files.iter().map(|f| f.importance_score).sum::<f64>()
            / selected_files.len() as f64
    };
    let algorithm_label = match (&options.algorithm_name, unlimited_budget) {
        (Some(name), true) => format!("{} (unlimited)", name),
        (Some(name), false) => name.clone(),
        (None, true) => "Tiered (unlimited budget)".to_string(),
        (None, false) => "Tiered (token-budget)".to_string(),
    };
    let metrics = SelectionMetrics {
        total_files_discovered,
        files_selected: selected_files.len(),
        total_tokens_estimated,
        selection_time_ms,
        algorithm_used: algorithm_label,
        coverage_score,
        relevance_score,
    };
    Ok(SelectionOutcome {
        selected_files,
        selected_file_infos,
        metrics,
        eligible_file_count: filtered_infos.len(),
        unlimited_budget,
    })
}
/// Compiles user include patterns into a `GlobSet`.
///
/// Returns `None` when no patterns are configured (meaning "include
/// everything") or when the set fails to build. Individual patterns that
/// fail to parse are skipped silently, matching a best-effort filter.
fn build_include_filter(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }
    let mut builder = GlobSetBuilder::new();
    for glob in patterns.iter().filter_map(|pattern| Glob::new(pattern).ok()) {
        builder.add(glob);
    }
    builder.build().ok()
}
/// Builds a synthetic `ReportFile` containing a textual map of every
/// directory that holds at least one analyzed file.
///
/// Returns `None` when there is nothing to map or the rendered map is
/// empty.
fn build_directory_map_for_analysis(repo_path: &Path, files: &[FileInfo]) -> Option<ReportFile> {
    let inventory = gather_inventory_entries(repo_path, files);
    if inventory.is_empty() {
        return None;
    }
    let map_text = build_directory_map(&inventory).filter(|map| !map.is_empty())?;
    // Count tokens with the global counter, falling back to the legacy
    // heuristic; clamp to at least 1 token.
    let token_estimate = TokenCounter::global()
        .count_tokens(&map_text)
        .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(&map_text))
        .max(1);
    let byte_len = map_text.len() as u64;
    Some(ReportFile {
        path: repo_path.join("DIRECTORY_MAP.txt"),
        relative_path: "DIRECTORY_MAP.txt".to_string(),
        content: map_text,
        size: byte_len,
        estimated_tokens: token_estimate,
        importance_score: 1.0,
        centrality_score: 0.0,
        query_relevance_score: 0.0,
        entry_point_proximity: 0.0,
        content_quality_score: 0.0,
        repository_role_score: 0.0,
        recency_score: 0.0,
        modified: None,
    })
}
/// Collects one inventory entry for the repository root plus every
/// ancestor directory of the analyzed files (deduplicated).
///
/// The empty-path entry denotes the repository root and is rendered as
/// "." by `build_directory_map`. Returns an empty vec when there are no
/// files.
///
/// Fix: the previous version called `fs::metadata` on every directory
/// and computed its mtime, then discarded both — dead per-directory
/// syscalls with no effect on the result. With that gone, `repo_path`
/// is no longer read, hence the underscore.
fn gather_inventory_entries(_repo_path: &Path, files: &[FileInfo]) -> Vec<InventoryEntry> {
    if files.is_empty() {
        return Vec::new();
    }
    // Deduplicate every ancestor directory of every file's relative path.
    let mut directories: HashSet<String> = HashSet::new();
    for file in files {
        let mut ancestor = Path::new(&file.relative_path).parent();
        while let Some(parent) = ancestor {
            let parent_str = parent.to_string_lossy();
            if parent_str.is_empty() {
                break;
            }
            directories.insert(parent_str.into_owned());
            ancestor = parent.parent();
        }
    }
    let mut entries = Vec::with_capacity(directories.len() + 1);
    entries.push(InventoryEntry {
        path: String::new(),
    });
    // HashSet order is arbitrary; build_directory_map sorts before
    // rendering, so the final output stays deterministic.
    entries.extend(directories.into_iter().map(|path| InventoryEntry { path }));
    entries
}
/// A single row in the generated directory map.
#[derive(Debug, Clone)]
struct InventoryEntry {
    // Directory path relative to the repository root; an empty string
    // stands for the root itself and is rendered as ".".
    path: String,
}
/// Renders the inventory as a sorted, human-readable directory listing.
///
/// Returns `None` when there are no entries to render. Output is a
/// fixed four-line header followed by one line per entry (sorted), with
/// a trailing blank line.
fn build_directory_map(entries: &[InventoryEntry]) -> Option<String> {
    if entries.is_empty() {
        return None;
    }
    // Sort borrowed path slices instead of cloning whole entries.
    let mut paths: Vec<&str> = entries.iter().map(|entry| entry.path.as_str()).collect();
    paths.sort_unstable();
    let header = [
        "Repository Directory Map",
        "========================",
        "Directory",
        "---------",
    ];
    let rows = paths
        .iter()
        .map(|path| if path.is_empty() { "." } else { path });
    let mut lines: Vec<String> = header
        .iter()
        .copied()
        .chain(rows)
        .map(|line| line.to_string())
        .collect();
    lines.push(String::new());
    Some(lines.join("\n"))
}