use clap::{Arg, ArgAction, Command, ValueEnum};
use git2::{DiffOptions, Repository};
use serde_json::{self, json};
use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};
use std::process;
use std::sync::Arc;
use tempfile::TempDir;
use tracing::{error, info, warn};
use tracing_subscriber::{fmt, EnvFilter};
use url::Url;
#[cfg(feature = "web")]
use async_trait::async_trait;
#[cfg(feature = "web")]
use scribe_webservice::{
AnalysisOutput, AnalysisProvider, WebReportFile, WebSelectionMetrics, WebService,
WebServiceConfig, WebServiceError,
};
use scribe::{
analyze_and_select, format_bytes, format_timestamp, generate_report, get_file_icon, Config,
ReportFile, ReportFormat, SelectionMetrics, SelectionOptions,
};
/// Clone a remote repository (by URL) into a fresh temporary directory.
///
/// Returns the checkout path together with the `TempDir` guard. The caller
/// must keep the guard alive for as long as the path is used — dropping it
/// deletes the clone from disk.
async fn clone_github_repo(
    url: &str,
) -> Result<(PathBuf, Option<TempDir>), Box<dyn std::error::Error>> {
    let workdir = TempDir::new()?;
    let checkout = workdir.path().to_path_buf();
    Repository::clone(url, &checkout)?;
    Ok((checkout, Some(workdir)))
}
async fn run_covering_set_mode(
repo_dir: &Path,
entity_name: &str,
entity_type: Option<&str>,
exact_match: bool,
include_dependents: bool,
max_depth: Option<usize>,
max_files: Option<usize>,
verbose_level: u8,
) -> Result<(), Box<dyn std::error::Error>> {
use scribe_selection::{CoveringSetComputer, CoveringSetOptions, EntityQuery, EntityType};
use scribe::analyze_and_select;
use std::collections::HashMap;
if verbose_level > 0 {
info!("🎯 Covering set mode: finding '{}'", entity_name);
} else {
println!("🎯 Finding covering set for: {}", entity_name);
}
let mut config = Config::default();
config.general.working_dir = Some(repo_dir.to_path_buf());
config.analysis.token_budget = None;
let selection_options = SelectionOptions {
token_target: 0, force_traditional: false,
algorithm_name: Some("covering-set".to_string()),
include_directory_map: false,
};
if verbose_level > 0 {
info!("📊 Scanning repository...");
} else {
println!("📊 Scanning repository...");
}
let analysis_outcome = analyze_and_select(repo_dir, &config, &selection_options).await?;
let mut file_contents = HashMap::new();
for file_info in &analysis_outcome.analysis.files {
if let Ok(content) = std::fs::read_to_string(&file_info.path) {
file_contents.insert(file_info.path.display().to_string(), content);
}
}
if verbose_level > 0 {
info!("📁 Loaded {} files", file_contents.len());
}
if verbose_level > 0 {
info!("🔗 Preparing dependency graph...");
}
use scribe_graph::DependencyGraph;
let graph = DependencyGraph::new();
let parsed_entity_type = entity_type.and_then(|t| match t.to_lowercase().as_str() {
"function" => Some(EntityType::Function),
"class" => Some(EntityType::Class),
"module" => Some(EntityType::Module),
"interface" => Some(EntityType::Interface),
"constant" => Some(EntityType::Constant),
_ => None,
});
let query = EntityQuery {
entity_type: parsed_entity_type,
name_pattern: Some(entity_name.to_string()),
exact_match,
public_only: None,
};
let options = CoveringSetOptions {
include_dependencies: true,
include_dependents,
max_depth,
max_files,
min_importance: None,
};
if verbose_level > 0 {
info!("🔍 Computing covering set...");
} else {
println!("🔍 Computing covering set...");
}
let mut computer = CoveringSetComputer::new()?;
let result = computer.compute_covering_set(
&query,
&file_contents,
&graph,
&options,
)?;
if let Some(target) = &result.target_entity {
println!("\n✅ Found target entity:");
println!(" • File : {}", target.file_path);
println!(" • Type : {}", target.entity_type);
println!(" • Name : {}", target.entity_name);
println!(" • Lines : {}-{}", target.start_line, target.end_line);
println!(" • Public : {}", if target.is_public { "yes" } else { "no" });
} else {
println!("\n❌ Entity '{}' not found", entity_name);
println!(" Try:");
println!(" - Using a different name pattern");
println!(" - Removing --exact-match flag for fuzzy search");
println!(" - Specifying --entity-type (function, class, module, etc.)");
return Ok(());
}
println!("\n📦 Covering set ({} files):", result.files.len());
for (idx, file) in result.files.iter().enumerate() {
let explanation = result
.inclusion_reasons
.get(&file.path)
.map(|s| s.as_str())
.unwrap_or("Included");
println!(
" {}. {} (distance: {}, reason: {})",
idx + 1,
file.path,
file.distance,
explanation
);
}
println!("\n📊 Statistics:");
println!(" • Files examined : {}", result.statistics.files_examined);
println!(" • Files selected : {}", result.statistics.files_selected);
println!(" • Files excluded : {}", result.statistics.files_excluded);
println!(" • Max depth : {}", result.statistics.max_depth_reached);
println!(" • Limits reached : {}", if result.statistics.limits_reached { "yes" } else { "no" });
if verbose_level > 0 {
info!("✨ Covering set computation complete");
}
Ok(())
}
/// Compute and print the covering set for the files touched by a git diff.
///
/// Pipeline: (1) detect changed files via libgit2, (2) run the normal
/// repository analysis to enumerate candidate files, (3) build a lightweight
/// line-based import graph over those files, (4) expand the changed set
/// through that graph with `CoveringSetComputer`.
///
/// * `repo_dir` - repository root (must contain a git repository).
/// * `diff_against` - optional git ref to diff against; `None` diffs the
///   index against the working tree.
/// * `include_dependents` - also include files that depend on changed ones.
/// * `max_depth` / `max_files` - optional traversal limits.
/// * `verbose_level` - 0 prints progress to stdout, >0 logs via `tracing`.
async fn run_covering_set_diff_mode(
    repo_dir: &Path,
    diff_against: Option<&str>,
    include_dependents: bool,
    max_depth: Option<usize>,
    max_files: Option<usize>,
    verbose_level: u8,
) -> Result<(), Box<dyn std::error::Error>> {
    use scribe_graph::centrality::{ImportDetector, ImportResolutionConfig};
    use scribe_graph::DependencyGraph;
    use scribe_selection::{CoveringSetComputer, CoveringSetOptions};
    use scribe_analysis::heuristics::{DocumentAnalysis, ScanResult};
    use scribe_core::file::{is_entrypoint_path, is_test_path, FileType};
    use scribe_core::Language;
    /// Minimal `ScanResult` carrier for one analyzed file, just rich enough
    /// for `ImportDetector` to index the repository and resolve imports.
    #[derive(Debug, Clone)]
    struct DiffScanFile {
        path: String,
        // Path relative to the repository root.
        relative_path: String,
        // Directory nesting depth (count of '/' in the relative path).
        depth: usize,
        is_docs: bool,
        is_readme: bool,
        is_test: bool,
        is_entrypoint: bool,
        has_examples: bool,
        // Always 0.0 in diff mode — no priority/churn data is computed here.
        priority_boost: f64,
        churn_score: f64,
        // Import strings extracted by `extract_imports` below.
        imports: Vec<String>,
    }
    impl ScanResult for DiffScanFile {
        fn path(&self) -> &str {
            &self.path
        }
        fn relative_path(&self) -> &str {
            &self.relative_path
        }
        fn depth(&self) -> usize {
            self.depth
        }
        fn is_docs(&self) -> bool {
            self.is_docs
        }
        fn is_readme(&self) -> bool {
            self.is_readme
        }
        fn is_test(&self) -> bool {
            self.is_test
        }
        fn is_entrypoint(&self) -> bool {
            self.is_entrypoint
        }
        fn has_examples(&self) -> bool {
            self.has_examples
        }
        fn priority_boost(&self) -> f64 {
            self.priority_boost
        }
        fn churn_score(&self) -> f64 {
            self.churn_score
        }
        // Centrality is not computed for this lightweight scan; always 0.0.
        fn centrality_in(&self) -> f64 {
            0.0
        }
        fn imports(&self) -> Option<&[String]> {
            Some(&self.imports)
        }
        // No document analysis is performed in diff mode.
        fn doc_analysis(&self) -> Option<&DocumentAnalysis> {
            None
        }
    }
    /// Best-effort, line-based import extraction for a handful of languages.
    /// Deduplicates via a set, returns a sorted list capped at 64 entries
    /// for determinism. Unknown languages yield an empty list.
    /// Limitations: single-line statements only (no multi-line imports, no
    /// JS template-string specifiers, no comment awareness).
    fn extract_imports(content: &str, language: &Language) -> Vec<String> {
        use std::collections::HashSet;
        let mut imports = HashSet::new();
        match language {
            Language::Rust => {
                for line in content.lines() {
                    let trimmed = line.trim();
                    if trimmed.starts_with("use ") {
                        // Keep only the first path token, e.g. `use foo::bar;` -> `foo::bar`.
                        let statement = trimmed
                            .trim_start_matches("use ")
                            .trim_end_matches(';')
                            .split_whitespace()
                            .next()
                            .unwrap_or_default()
                            .trim_end_matches("::");
                        if !statement.is_empty() {
                            imports.insert(statement.to_string());
                        }
                    } else if trimmed.starts_with("mod ") {
                        // Module declarations also create a file dependency.
                        let module = trimmed
                            .trim_start_matches("mod ")
                            .trim_end_matches(';')
                            .trim();
                        if !module.is_empty() {
                            imports.insert(module.to_string());
                        }
                    }
                }
            }
            Language::Python => {
                for line in content.lines() {
                    let trimmed = line.trim();
                    if trimmed.starts_with("import ") {
                        // Handle `import a, b as c` — keep the module name only.
                        for module in trimmed.trim_start_matches("import ").split(',') {
                            let module = module.trim().split_whitespace().next().unwrap_or("");
                            if !module.is_empty() {
                                imports.insert(module.to_string());
                            }
                        }
                    } else if trimmed.starts_with("from ") && trimmed.contains(" import ") {
                        let module = trimmed
                            .trim_start_matches("from ")
                            .split(" import ")
                            .next()
                            .unwrap_or("")
                            .trim();
                        if !module.is_empty() {
                            imports.insert(module.to_string());
                        }
                    }
                }
            }
            Language::JavaScript | Language::TypeScript => {
                for line in content.lines() {
                    let trimmed = line.trim();
                    if trimmed.starts_with("import ") {
                        // Module specifier is the first quoted string
                        // (double- or single-quoted).
                        if let Some(start) = trimmed.find('"') {
                            if let Some(end) = trimmed[start + 1..].find('"') {
                                imports.insert(trimmed[start + 1..start + 1 + end].to_string());
                            }
                        } else if let Some(start) = trimmed.find('\'') {
                            if let Some(end) = trimmed[start + 1..].find('\'') {
                                imports.insert(trimmed[start + 1..start + 1 + end].to_string());
                            }
                        }
                    } else if trimmed.contains("require(") {
                        // CommonJS: take the argument of the first `require(...)`.
                        if let Some(start) = trimmed.find("require(") {
                            let start = start + "require(".len();
                            let slice = &trimmed[start..];
                            if let Some(end_idx) = slice.find(')') {
                                let inner = &slice[..end_idx];
                                let inner = inner.trim_matches(&['\'', '"'][..]);
                                if !inner.is_empty() {
                                    imports.insert(inner.to_string());
                                }
                            }
                        }
                    }
                }
            }
            Language::Go => {
                // Track whether we are inside a grouped `import ( ... )` block.
                let mut in_block = false;
                for line in content.lines() {
                    let trimmed = line.trim();
                    if trimmed == "import (" {
                        in_block = true;
                        continue;
                    }
                    if in_block {
                        if trimmed == ")" {
                            in_block = false;
                            continue;
                        }
                        let import_path = trimmed.trim_matches(&['"', '`'][..]);
                        if !import_path.is_empty() {
                            imports.insert(import_path.to_string());
                        }
                    } else if trimmed.starts_with("import ") {
                        // Single-line form: `import "path"`.
                        let import_path = trimmed
                            .trim_start_matches("import ")
                            .trim_matches(&['"', '`'][..]);
                        if !import_path.is_empty() {
                            imports.insert(import_path.to_string());
                        }
                    }
                }
            }
            _ => {}
        }
        let mut ordered: Vec<String> = imports.into_iter().collect();
        ordered.sort();
        // Cap the import list to keep graph construction bounded.
        ordered.truncate(64);
        ordered
    }
    if verbose_level > 0 {
        info!("🎯 Covering set (diff) mode");
    } else {
        println!("🎯 Computing covering set for git diff");
    }
    // --- Step 1: collect changed files from git. Untracked files are
    // included so brand-new files count as "changed".
    let repo = Repository::open(repo_dir)?;
    let mut diff_opts = DiffOptions::new();
    diff_opts.include_untracked(true).recurse_untracked_dirs(true);
    let workdir = repo.workdir().unwrap_or(repo_dir);
    let mut changed_files = std::collections::HashSet::new();
    if let Some(reference) = diff_against {
        // Diff the given ref's tree against the working tree + index.
        let obj = repo.revparse_single(reference)?;
        let commit = obj.peel_to_commit()?;
        let tree = commit.tree()?;
        let diff = repo.diff_tree_to_workdir_with_index(Some(&tree), Some(&mut diff_opts))?;
        for delta in diff.deltas() {
            // Prefer the post-change path; fall back to the old path for deletions.
            if let Some(path) = delta.new_file().path().or_else(|| delta.old_file().path()) {
                changed_files.insert(workdir.join(path).to_string_lossy().to_string());
            }
        }
    } else {
        // No ref given: diff the index against the working tree.
        let diff = repo.diff_index_to_workdir(None, Some(&mut diff_opts))?;
        for delta in diff.deltas() {
            if let Some(path) = delta.new_file().path().or_else(|| delta.old_file().path()) {
                changed_files.insert(workdir.join(path).to_string_lossy().to_string());
            }
        }
    }
    if changed_files.is_empty() {
        println!("❌ No changes detected in the diff");
        return Ok(());
    }
    let changed_files: Vec<String> = changed_files.into_iter().collect();
    if verbose_level > 0 {
        info!("📁 {} changed files detected", changed_files.len());
    } else {
        println!("📁 {} changed files detected", changed_files.len());
    }
    // --- Step 2: enumerate the repository's files via the standard
    // analysis pipeline (traditional filtering, no token budget).
    let mut config = Config::default();
    config.general.working_dir = Some(repo_dir.to_path_buf());
    config.analysis.token_budget = None;
    let selection_options = SelectionOptions {
        token_target: 0,
        force_traditional: true,
        algorithm_name: Some("covering-set-diff".to_string()),
        include_directory_map: false,
    };
    let analysis_outcome = analyze_and_select(repo_dir, &config, &selection_options).await?;
    // --- Step 3: wrap each analyzed file as a DiffScanFile, extracting its
    // imports so the dependency graph can be built below.
    let diff_scan_files: Vec<DiffScanFile> = analysis_outcome
        .analysis
        .files
        .iter()
        .map(|file| {
            let extension = file
                .path
                .extension()
                .and_then(|ext| ext.to_str())
                .unwrap_or("");
            let language = Language::from_extension(extension);
            // Reuse already-loaded content when available; otherwise read
            // from disk. Binary files get no content and no imports.
            let content = if file.is_binary {
                String::new()
            } else {
                file.content
                    .clone()
                    .or_else(|| std::fs::read_to_string(&file.path).ok())
                    .unwrap_or_default()
            };
            let imports = if file.is_binary {
                Vec::new()
            } else {
                extract_imports(&content, &language)
            };
            let relative_path = file.relative_path.clone();
            let depth = relative_path.matches('/').count();
            let path_lower = relative_path.to_lowercase();
            DiffScanFile {
                path: file.path.to_string_lossy().to_string(),
                relative_path,
                depth,
                is_docs: matches!(file.file_type, FileType::Documentation { .. }),
                is_readme: path_lower.contains("readme"),
                is_test: is_test_path(&file.path),
                is_entrypoint: is_entrypoint_path(&file.path, &language),
                has_examples: path_lower.contains("example"),
                priority_boost: 0.0,
                churn_score: 0.0,
                imports,
            }
        })
        .collect();
    // Build the dependency graph: one node per file, one edge per import
    // that the detector can resolve to another file in the repository.
    let mut graph = DependencyGraph::with_capacity(diff_scan_files.len());
    for file in &diff_scan_files {
        graph.add_node(file.path.clone())?;
    }
    let detector =
        ImportDetector::with_file_index(ImportResolutionConfig::default(), &diff_scan_files);
    let file_map: std::collections::HashMap<&str, &DiffScanFile> = diff_scan_files
        .iter()
        .map(|f| (f.path.as_str(), f))
        .collect();
    for file in &diff_scan_files {
        if let Some(imports) = file.imports() {
            for import_str in imports {
                // Unresolvable imports (stdlib, third-party) are skipped.
                if let Some(resolved) = detector.resolve_import(import_str, &file.path, &file_map) {
                    graph.add_edge(file.path.clone(), resolved)?;
                }
            }
        }
    }
    // --- Step 4: expand the changed set through the graph.
    let options = CoveringSetOptions {
        include_dependencies: true,
        include_dependents,
        max_depth,
        max_files,
        min_importance: None,
    };
    let computer = CoveringSetComputer::new()?;
    let result =
        computer.compute_covering_set_for_files(&changed_files, &graph, None, &options)?;
    println!("\n📦 Covering set for diff ({} files):", result.files.len());
    for (idx, file) in result.files.iter().enumerate() {
        let explanation = result
            .inclusion_reasons
            .get(&file.path)
            .map(|s| s.as_str())
            .unwrap_or("Included");
        println!(
            " {}. {} (distance: {}, reason: {})",
            idx + 1,
            file.path,
            file.distance,
            explanation
        );
    }
    println!("\n📊 Statistics:");
    println!(" • Files examined : {}", result.statistics.files_examined);
    println!(" • Files selected : {}", result.statistics.files_selected);
    println!(" • Files excluded : {}", result.statistics.files_excluded);
    println!(
        " • Max depth : {}",
        result.statistics.max_depth_reached
    );
    println!(
        " • Limits reached : {}",
        if result.statistics.limits_reached {
            "yes"
        } else {
            "no"
        }
    );
    if verbose_level > 0 {
        info!("✨ Diff covering set computation complete");
    }
    Ok(())
}
#[cfg(feature = "web")]
/// Adapter that lets the embedded web service drive the core
/// analyze-and-select pipeline.
struct CliAnalysisProvider;
#[cfg(feature = "web")]
#[async_trait]
impl AnalysisProvider for CliAnalysisProvider {
    /// Run a full analysis pass configured from the web service settings and
    /// translate the outcome into the web-facing types.
    async fn analyze(
        &self,
        config: &WebServiceConfig,
    ) -> std::result::Result<AnalysisOutput, WebServiceError> {
        // Mirror the web configuration onto a core Config. A token budget of
        // zero means "no intelligent selection": force traditional filtering.
        let mut core_config = Config::default();
        core_config.general.working_dir = Some(config.repo_path.clone());
        core_config.analysis.token_budget = None;
        core_config.filtering.max_file_size = config.max_file_size as u64;
        core_config.features.auto_exclude_tests = config.auto_exclude_tests;
        let selection_options = SelectionOptions {
            token_target: config.token_budget,
            force_traditional: config.token_budget == 0,
            algorithm_name: Some("web-service".to_string()),
            include_directory_map: true,
        };
        let outcome = analyze_and_select(&config.repo_path, &core_config, &selection_options)
            .await
            .map_err(|err| WebServiceError::ScribeCore(err.to_string()))?;
        // Convert core result types into their web counterparts.
        let metrics = convert_selection_metrics(outcome.selection.metrics);
        let selected_files = outcome
            .selection
            .selected_files
            .into_iter()
            .map(convert_report_file)
            .collect();
        Ok(AnalysisOutput {
            selected_files,
            selected_file_infos: outcome.selection.selected_file_infos,
            metrics,
            repository_files: outcome.analysis.files,
            token_budget: config.token_budget,
        })
    }
}
#[cfg(feature = "web")]
/// Translate a core `ReportFile` into the web service's `WebReportFile`.
/// Pure field-for-field move, except `modified`, which is rendered to a
/// display string via `format_timestamp`.
fn convert_report_file(file: ReportFile) -> WebReportFile {
    let modified = format_timestamp(file.modified);
    WebReportFile {
        modified,
        path: file.path,
        relative_path: file.relative_path,
        content: file.content,
        size: file.size,
        estimated_tokens: file.estimated_tokens,
        importance_score: file.importance_score,
        centrality_score: file.centrality_score,
        query_relevance_score: file.query_relevance_score,
        entry_point_proximity: file.entry_point_proximity,
        content_quality_score: file.content_quality_score,
        repository_role_score: file.repository_role_score,
        recency_score: file.recency_score,
    }
}
#[cfg(feature = "web")]
/// Translate core `SelectionMetrics` into the web-facing metrics struct.
/// Straight field-for-field copy; no values are recomputed.
fn convert_selection_metrics(metrics: SelectionMetrics) -> WebSelectionMetrics {
    let m = metrics;
    WebSelectionMetrics {
        algorithm_used: m.algorithm_used,
        total_files_discovered: m.total_files_discovered,
        files_selected: m.files_selected,
        total_tokens_estimated: m.total_tokens_estimated,
        selection_time_ms: m.selection_time_ms,
        coverage_score: m.coverage_score,
        relevance_score: m.relevance_score,
    }
}
#[cfg(feature = "web")]
/// Start the embedded web editor for `repo_dir`, probing localhost ports
/// 5000-5999 for a free one. Runs until the web service session ends.
async fn launch_editor_mode(
    repo_dir: &Path,
    token_budget: usize,
    max_bytes: usize,
    no_exclude_tests: bool,
) -> std::result::Result<(), Box<dyn std::error::Error>> {
    use std::net::TcpListener;
    info!("Launching embedded web editor for {}", repo_dir.display());
    let host = "127.0.0.1";
    // Probe each candidate port by binding it; the successful listener is
    // kept so the port stays reserved until we hand it to the service.
    let mut probe = None;
    for candidate in 5000u16..6000 {
        if let Ok(listener) = TcpListener::bind((host, candidate)) {
            probe = Some((candidate, listener));
            break;
        }
    }
    let (port, listener) = match probe {
        Some(found) => found,
        None => return Err("No available ports in range 5000-5999".into()),
    };
    // Release the probe socket so the web service can bind the same port.
    // NOTE(review): small TOCTOU window between drop and service start.
    drop(listener);
    let config = WebServiceConfig {
        port,
        host: host.to_string(),
        repo_path: repo_dir.to_path_buf(),
        token_budget,
        auto_open_browser: true,
        max_file_size: max_bytes,
        auto_exclude_tests: !no_exclude_tests,
        ..WebServiceConfig::default()
    };
    info!(
        "Starting web editor at http://{}:{} (token budget: {}, max bytes: {})",
        config.host, config.port, token_budget, max_bytes
    );
    let provider = Arc::new(CliAnalysisProvider);
    let mut service = WebService::new(config, provider)?;
    service.start().await?;
    info!("Web editor session finished");
    Ok(())
}
#[cfg(not(feature = "web"))]
/// Stub used when Scribe is built without the `web` feature: always fails
/// with a message explaining how to enable the editor.
async fn launch_editor_mode(
    _repo_dir: &Path,
    _token_budget: usize,
    _max_bytes: usize,
    _no_exclude_tests: bool,
) -> std::result::Result<(), Box<dyn std::error::Error>> {
    let message =
        "The --editor option requires the `web` feature. Rebuild Scribe with --features web.";
    Err(message.into())
}
/// Report formats accepted by the `--output-format` CLI flag.
/// Each variant maps one-to-one onto a core `ReportFormat`.
#[derive(Debug, Clone, Copy, ValueEnum)]
enum OutputFormat {
    /// HTML web page (the CLI default per the flag's help text).
    Html,
    /// CXML format for LLM consumption.
    Cxml,
    /// Repomix-style bundle format.
    Repomix,
    /// Standard XML.
    Xml,
    /// JSON output.
    Json,
    /// Plain text.
    Text,
    /// Markdown.
    Markdown,
}
/// Map the CLI-facing `OutputFormat` onto the core `ReportFormat`
/// one-to-one. Exhaustive by construction: adding a CLI variant without a
/// core counterpart is a compile error.
impl From<OutputFormat> for ReportFormat {
    fn from(value: OutputFormat) -> Self {
        match value {
            OutputFormat::Html => ReportFormat::Html,
            OutputFormat::Cxml => ReportFormat::Cxml,
            OutputFormat::Repomix => ReportFormat::Repomix,
            OutputFormat::Xml => ReportFormat::Xml,
            OutputFormat::Json => ReportFormat::Json,
            OutputFormat::Text => ReportFormat::Text,
            OutputFormat::Markdown => ReportFormat::Markdown,
        }
    }
}
/// Selection algorithm variants for the `--algorithm` CLI flag. The
/// `#[value(name = ...)]` attributes fix the user-visible spellings
/// (e.g. `v5-integrated`, the flag's default).
/// NOTE(review): variant semantics are defined in the selection crate, not
/// visible here — see that crate for what each version actually does.
#[derive(Debug, Clone, Copy, ValueEnum)]
enum Algorithm {
    #[value(name = "v1-baseline")]
    V1Baseline,
    #[value(name = "v3-centrality")]
    V3Centrality,
    #[value(name = "v4-demotion")]
    V4Demotion,
    #[value(name = "v5-integrated")]
    V5Integrated,
}
#[tokio::main]
async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
if std::env::var("SCRIBE_DEBUG").is_ok() {
info!("CLI main started in debug mode");
}
tracing_subscriber::fmt()
.with_env_filter(EnvFilter::from_default_env())
.init();
let app = Command::new("scribe")
.version(env!("CARGO_PKG_VERSION"))
.author("Nathan Rice <nathan@sibylline.dev>")
.about("Scribe: Intelligent repository tool")
.long_about("Scribe is a comprehensive tool that intelligently selects and processes repository files for AI consumption. It provides multiple output formats and uses advanced algorithms to optimize file selection within token budgets.")
.arg(
Arg::new("repo_path")
.help("Repository path to analyze (local directory or GitHub URL)")
.value_name("PATH_OR_URL")
.default_value(".")
.index(1),
)
.arg(
Arg::new("output")
.short('o')
.long("out")
.alias("output")
.help("Output file path (auto-generated if not specified)")
.value_name("FILE"),
)
.arg(
Arg::new("output_format")
.long("output-format")
.help("Output format: html for web page, cxml for LLM, repomix for repomix format, xml for standard XML (default: html)")
.value_parser(clap::value_parser!(OutputFormat))
.default_value("html"),
)
.arg(
Arg::new("line_numbers")
.long("line-numbers")
.help("Prefix each line of bundled files with its line number")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("token_target")
.long("token-target")
.alias("token-budget")
.help("Target token count for intelligent selection (default: 128000)")
.value_name("TOKENS")
.default_value("128000")
.value_parser(clap::value_parser!(usize)),
)
.arg(
Arg::new("max_bytes")
.long("max-bytes")
.help("Maximum file size to consider (in bytes)")
.value_name("BYTES")
.default_value("204800") .value_parser(clap::value_parser!(usize)),
)
.arg(
Arg::new("include")
.long("include")
.help("Comma-separated glob patterns for files to include")
.value_name("PATTERNS"),
)
.arg(
Arg::new("exclude")
.long("exclude")
.help("Comma-separated glob patterns for files to exclude")
.value_name("PATTERNS"),
)
.arg(
Arg::new("exclude_tests")
.long("exclude-tests")
.help("Exclude test files from selection (tests/, *_test.*, *.test.*, *.spec.*)")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("no_exclude_tests")
.long("no-exclude-tests")
.help("Include test files even when they would normally be excluded")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("ignore")
.long("ignore")
.help("Comma-separated glob patterns for files to ignore")
.value_name("PATTERNS"),
)
.arg(
Arg::new("no_gitignore")
.long("no-gitignore")
.help("Disable .gitignore handling during scanning")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("no_default_patterns")
.long("no-default-patterns")
.help("Disable built-in ignore patterns like node_modules or target")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("verbose")
.short('v')
.long("verbose")
.help("Enable verbose output")
.action(ArgAction::Count),
)
.arg(
Arg::new("force_traditional")
.long("force-traditional")
.help("Force traditional file filtering instead of intelligent selection")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("editor")
.long("editor")
.help("Launch interactive bundle editor in browser")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("algorithm")
.long("algorithm")
.alias("variant")
.help("Selection algorithm")
.value_parser(clap::value_parser!(Algorithm))
.default_value("v5-integrated"),
)
.arg(
Arg::new("query_hint")
.long("query-hint")
.help("Query hint to guide file selection (e.g., authentication, database)")
.value_name("HINT"),
)
.arg(
Arg::new("show_metrics")
.long("show-metrics")
.help("Show detailed performance and quality metrics")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("entry_points")
.long("entry-points")
.help("Focus on specific entry point files")
.value_name("FILES")
.num_args(0..),
)
.arg(
Arg::new("entry_functions")
.long("entry-functions")
.help("Focus on specific functions (format: file.py:function_name)")
.value_name("FUNCTIONS")
.num_args(0..),
)
.arg(
Arg::new("personalization_alpha")
.long("personalization-alpha")
.help("Entry point focus strength (0.0-1.0)")
.value_name("ALPHA")
.default_value("0.15")
.value_parser(clap::value_parser!(f64)),
)
.arg(
Arg::new("include_diffs")
.long("include-diffs")
.help("Include relevant Git diffs")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("diff_commits")
.long("diff-commits")
.help("Number of recent commits to analyze")
.value_name("COUNT")
.default_value("1")
.value_parser(clap::value_parser!(usize)),
)
.arg(
Arg::new("diff_branch")
.long("diff-branch")
.help("Compare with specific branch")
.value_name("BRANCH"),
)
.arg(
Arg::new("diff_relevance_threshold")
.long("diff-relevance-threshold")
.help("Minimum relevance score for including diffs")
.value_name("THRESHOLD")
.default_value("0.1")
.value_parser(clap::value_parser!(f64)),
)
.arg(
Arg::new("scaling")
.long("scaling")
.help("Enable advanced scaling optimizations for large repositories")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("covering_set")
.long("covering-set")
.help("Find covering set for a specific entity (function, class, module)")
.value_name("ENTITY_NAME"),
)
.arg(
Arg::new("covering_set_diff")
.long("covering-set-diff")
.help("Compute covering set for the current git diff")
.action(ArgAction::SetTrue)
.conflicts_with("covering_set"),
)
.arg(
Arg::new("diff_against")
.long("diff-against")
.help("Git ref to diff against (defaults to HEAD)")
.value_name("REF")
.requires("covering_set_diff"),
)
.arg(
Arg::new("entity_type")
.long("entity-type")
.help("Type of entity to find: function, class, module, interface, constant")
.value_name("TYPE")
.requires("covering_set"),
)
.arg(
Arg::new("exact_match")
.long("exact-match")
.help("Match entity name exactly (vs substring match)")
.action(ArgAction::SetTrue)
.requires("covering_set"),
)
.arg(
Arg::new("include_dependents")
.long("include-dependents")
.help("Include files that depend on the target (for impact analysis)")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("max_depth")
.long("max-depth")
.help("Maximum dependency traversal depth")
.value_name("DEPTH")
.value_parser(clap::value_parser!(usize)),
)
.arg(
Arg::new("max_files_covering")
.long("max-files")
.help("Maximum number of files in covering set")
.value_name("COUNT")
.value_parser(clap::value_parser!(usize)),
);
let matches = app.get_matches();
let repo_path_or_url = matches.get_one::<String>("repo_path").unwrap();
let output_format = matches.get_one::<OutputFormat>("output_format").unwrap();
let report_format: ReportFormat = (*output_format).into();
let token_target = *matches.get_one::<usize>("token_target").unwrap();
let max_bytes = *matches.get_one::<usize>("max_bytes").unwrap();
let verbose_level = matches.get_count("verbose");
let include_line_numbers = matches.get_flag("line_numbers");
if std::env::var("SCRIBE_DEBUG").is_ok() {
info!("Verbose level set to {}", verbose_level);
}
let (repo_dir, _temp_repo_guard) =
if repo_path_or_url.starts_with("http://") || repo_path_or_url.starts_with("https://") {
info!("🌐 Detected GitHub URL: {}", repo_path_or_url);
clone_github_repo(repo_path_or_url).await?
} else {
let path = PathBuf::from(repo_path_or_url);
if !path.exists() {
error!("Repository path does not exist: {}", repo_path_or_url);
process::exit(1);
}
if !path.is_dir() {
error!("Repository path is not a directory: {}", repo_path_or_url);
process::exit(1);
}
(path.canonicalize()?, None)
};
let editor_mode = matches.get_flag("editor");
if std::env::var("SCRIBE_DEBUG").is_ok() {
info!("Editor mode flag: {}", editor_mode);
}
if editor_mode {
return launch_editor_mode(
&repo_dir,
token_target,
max_bytes,
matches.get_flag("no_exclude_tests"),
)
.await;
}
if matches.get_flag("covering_set_diff") {
return run_covering_set_diff_mode(
&repo_dir,
matches.get_one::<String>("diff_against").map(|s| s.as_str()),
matches.get_flag("include_dependents"),
matches.get_one::<usize>("max_depth").copied(),
matches.get_one::<usize>("max_files_covering").copied(),
verbose_level,
)
.await;
}
if let Some(entity_name) = matches.get_one::<String>("covering_set") {
return run_covering_set_mode(
&repo_dir,
entity_name,
matches.get_one::<String>("entity_type").map(|s| s.as_str()),
matches.get_flag("exact_match"),
matches.get_flag("include_dependents"),
matches.get_one::<usize>("max_depth").copied(),
matches.get_one::<usize>("max_files_covering").copied(),
verbose_level,
)
.await;
}
let force_traditional = matches.get_flag("force_traditional");
let algorithm = matches.get_one::<Algorithm>("algorithm").unwrap();
let query_hint = matches.get_one::<String>("query_hint").cloned();
let show_metrics = matches.get_flag("show_metrics");
let entry_points: Vec<String> = matches
.get_many::<String>("entry_points")
.map(|vals| vals.cloned().collect())
.unwrap_or_default();
let entry_functions: Vec<String> = matches
.get_many::<String>("entry_functions")
.map(|vals| vals.cloned().collect())
.unwrap_or_default();
let personalization_alpha = *matches.get_one::<f64>("personalization_alpha").unwrap();
let include_diffs = matches.get_flag("include_diffs");
let diff_commits = *matches.get_one::<usize>("diff_commits").unwrap();
let diff_branch = matches.get_one::<String>("diff_branch").cloned();
let diff_relevance_threshold = *matches.get_one::<f64>("diff_relevance_threshold").unwrap();
let use_scaling = matches.get_flag("scaling");
let exclude_tests = matches.get_flag("exclude_tests");
let include_tests_override = matches.get_flag("no_exclude_tests");
let include_patterns_cli = matches
.get_one::<String>("include")
.map(|value| normalize_patterns(parse_pattern_list(value)));
let exclude_patterns_cli = matches
.get_one::<String>("exclude")
.map(|value| normalize_patterns(parse_pattern_list(value)));
let ignore_patterns_cli = matches
.get_one::<String>("ignore")
.map(|value| normalize_patterns(parse_pattern_list(value)));
let disable_gitignore = matches.get_flag("no_gitignore");
let disable_default_patterns = matches.get_flag("no_default_patterns");
if verbose_level > 0 {
std::env::set_var("SCRIBE_DEBUG", "1");
info!("Verbose mode enabled (level: {})", verbose_level);
}
let mut config = load_repository_config(&repo_dir);
let repo_ignore_patterns = load_ignore_patterns(&repo_dir);
if verbose_level > 0 {
info!("Analyzing repository: {}", repo_dir.display());
}
let output_path = if let Some(output) = matches.get_one::<String>("output") {
PathBuf::from(output)
} else if let Some(config_path) = &config.output.file_path {
let path = PathBuf::from(config_path);
if path.is_absolute() {
path
} else {
repo_dir.join(path)
}
} else {
let base_name = repo_dir
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("repository");
let extension = match report_format {
ReportFormat::Html => "html",
ReportFormat::Cxml => "cxml",
ReportFormat::Repomix => "repomix",
ReportFormat::Xml => "xml",
ReportFormat::Json => "json",
ReportFormat::Text => "txt",
ReportFormat::Markdown => "md",
};
PathBuf::from(format!("{}.{}", base_name, extension))
};
config.filtering.max_file_size = max_bytes as u64;
config.analysis.token_budget = None;
config.features.scaling_enabled = use_scaling;
config.filtering.include_patterns =
normalize_patterns(std::mem::take(&mut config.filtering.include_patterns));
let mut exclude_patterns =
normalize_patterns(std::mem::take(&mut config.filtering.exclude_patterns));
if disable_default_patterns {
exclude_patterns.clear();
}
if !repo_ignore_patterns.is_empty() {
exclude_patterns.extend(normalize_patterns(repo_ignore_patterns));
}
if let Some(patterns) = exclude_patterns_cli {
exclude_patterns.extend(patterns);
}
if let Some(patterns) = ignore_patterns_cli {
exclude_patterns.extend(patterns);
}
config.filtering.exclude_patterns = normalize_patterns(exclude_patterns);
if disable_gitignore {
config.filtering.respect_gitignore = false;
}
if let Some(patterns) = include_patterns_cli {
if !patterns.is_empty() {
config.filtering.include_patterns = patterns;
}
}
config.features.auto_exclude_tests = if include_tests_override {
false
} else if exclude_tests {
true
} else {
config.features.auto_exclude_tests
};
if std::env::var("SCRIBE_DEBUG").is_ok() {
eprintln!("Include patterns: {:?}", config.filtering.include_patterns);
eprintln!("Exclude patterns: {:?}", config.filtering.exclude_patterns);
}
if verbose_level > 0 {
info!("🎯 Token budget configured: {} tokens", token_target);
info!("📏 Max file size limit: {} bytes", max_bytes);
}
let algorithm_name = match algorithm {
Algorithm::V1Baseline => "v1-baseline",
Algorithm::V3Centrality => "v3-centrality",
Algorithm::V4Demotion => "v4-demotion",
Algorithm::V5Integrated => "v5-integrated",
}
.to_string();
if verbose_level > 0 {
info!("Algorithm: {}", algorithm_name);
info!("Force traditional: {}", force_traditional);
if let Some(hint) = &query_hint {
info!("Query hint: {}", hint);
}
if !entry_points.is_empty() {
info!("Entry points: {:?}", entry_points);
}
if !entry_functions.is_empty() {
info!("Entry functions: {:?}", entry_functions);
}
if include_diffs {
info!("Including diffs from {} commits", diff_commits);
if let Some(branch) = &diff_branch {
info!("Diff branch: {}", branch);
}
}
if use_scaling {
info!("Scaling optimizations: ENABLED");
}
if exclude_tests {
info!("Auto-exclude tests: ENABLED");
}
}
let selection_options = SelectionOptions {
token_target,
force_traditional,
algorithm_name: Some(algorithm_name.clone()),
include_directory_map: true,
};
let analysis_outcome = analyze_and_select(&repo_dir, &config, &selection_options).await?;
let mut selected_files = analysis_outcome.selection.selected_files;
let metrics = analysis_outcome.selection.metrics;
let eligible_file_count = analysis_outcome.selection.eligible_file_count;
let unlimited_budget = analysis_outcome.selection.unlimited_budget;
let total_files_discovered = metrics.total_files_discovered;
if verbose_level > 0 {
info!(
"Selected {} files ({} tokens)",
metrics.files_selected, metrics.total_tokens_estimated
);
} else {
println!("📊 Selection summary");
println!(" • Files scanned : {}", total_files_discovered);
println!(" • Eligible files : {}", eligible_file_count);
println!(
" • Files selected : {} ({} tokens)",
metrics.files_selected, metrics.total_tokens_estimated
);
println!(
" • Files excluded : {}",
eligible_file_count.saturating_sub(metrics.files_selected)
);
println!(
" • Coverage : {:.1}%",
metrics.coverage_score * 100.0
);
if unlimited_budget || token_target == 0 {
println!(" • Token usage : unlimited");
} else {
println!(
" • Token usage : {} / {}",
metrics.total_tokens_estimated, token_target
);
}
}
if show_metrics {
if verbose_level > 0 {
info!("Enhanced Selection Metrics:");
} else {
println!(
"
📈 Additional metrics"
);
}
let repository_complexity_factor = if total_files_discovered > 0 {
eligible_file_count as f64 / total_files_discovered as f64
} else {
0.0
};
if verbose_level > 0 {
info!(" - Algorithm: {}", metrics.algorithm_used);
info!(
" - Files: {} / {}",
metrics.files_selected, metrics.total_files_discovered
);
info!(" - Tokens: {}", metrics.total_tokens_estimated);
info!(" - Coverage: {:.1}%", metrics.coverage_score * 100.0);
info!(" - Relevance: {:.2}", metrics.relevance_score);
info!(" - Selection time: {}ms", metrics.selection_time_ms);
info!(
" - Repository complexity: {:.2}",
repository_complexity_factor
);
} else {
println!(" • Algorithm : {}", metrics.algorithm_used);
println!(
" • Coverage : {:.1}%",
metrics.coverage_score * 100.0
);
println!(" • Relevance score : {:.2}", metrics.relevance_score);
}
if !entry_points.is_empty() {
let avg_entry_proximity = selected_files
.iter()
.map(|f| f.entry_point_proximity)
.sum::<f64>()
/ selected_files.len().max(1) as f64;
info!(" - Entry point influence: {:.2}", avg_entry_proximity);
}
if query_hint.is_some() {
let avg_query_relevance = selected_files
.iter()
.map(|f| f.query_relevance_score)
.sum::<f64>()
/ selected_files.len().max(1) as f64;
info!(" - Query relevance: {:.2}", avg_query_relevance);
}
if include_diffs {
let avg_recency = selected_files.iter().map(|f| f.recency_score).sum::<f64>()
/ selected_files.len().max(1) as f64;
info!(" - Recency score: {:.2}", avg_recency);
}
let avg_content_quality = selected_files
.iter()
.map(|f| f.content_quality_score)
.sum::<f64>()
/ selected_files.len().max(1) as f64;
let avg_centrality = selected_files
.iter()
.map(|f| f.centrality_score)
.sum::<f64>()
/ selected_files.len().max(1) as f64;
info!(" - Content quality: {:.2}", avg_content_quality);
info!(" - Centrality: {:.2}", avg_centrality);
}
let format_label = match report_format {
ReportFormat::Html => "HTML",
ReportFormat::Cxml => "CXML",
ReportFormat::Repomix => "Repomix",
ReportFormat::Xml => "XML",
ReportFormat::Json => "JSON",
ReportFormat::Text => "Text",
ReportFormat::Markdown => "Markdown",
};
if verbose_level == 0 {
println!("📝 Generating {} output...", format_label);
} else {
info!("📝 Generating {} output", format_label);
}
let mut selected_files = selected_files;
if include_line_numbers {
apply_line_numbers_to_files(&mut selected_files);
}
let report_content = generate_report(report_format, &selected_files, &metrics)?;
fs::write(&output_path, report_content)?;
if verbose_level > 0 {
info!(
"🎉 Analysis complete! Output saved to: {}",
output_path.display()
);
} else {
println!(" • Output location : {}", output_path.display());
println!(
"
🎉 Analysis complete"
);
}
if config.output.file_path.is_some() && matches.get_one::<String>("output").is_none() {
info!("📋 Output path from configuration file");
}
Ok(())
}
/// Look for a repository-local Scribe configuration file and load it.
///
/// Probes `.scribe.json` and then `scribe.config.json` inside `repo_dir`;
/// the first candidate that exists and parses successfully wins. Parse or
/// I/O failures are logged as warnings and the search continues. Returns
/// `Config::default()` when no candidate loads.
fn load_repository_config(repo_dir: &Path) -> Config {
    for name in [".scribe.json", "scribe.config.json"] {
        let path = repo_dir.join(name);
        if !path.exists() {
            continue;
        }
        match Config::load_from_file(&path) {
            Ok(loaded) => {
                info!(
                    "📋 Loaded repository configuration from: {}",
                    path.display()
                );
                return loaded;
            }
            Err(err) => {
                warn!(
                    "Failed to load configuration from {}: {}",
                    path.display(),
                    err
                );
            }
        }
    }
    Config::default()
}
/// Read deny-patterns from a `.scribeignore` file in `repo_dir`.
///
/// Blank lines, `#` comment lines, and negation (`!`-prefixed) entries are
/// skipped; every other trimmed line is returned verbatim, in file order.
/// A missing file yields an empty list; a read failure is logged as a
/// warning and likewise yields an empty list.
fn load_ignore_patterns(repo_dir: &Path) -> Vec<String> {
    let ignore_file = repo_dir.join(".scribeignore");
    if !ignore_file.exists() {
        return Vec::new();
    }
    match fs::read_to_string(&ignore_file) {
        Ok(content) => {
            info!("📋 Loaded ignore patterns from: {}", ignore_file.display());
            content
                .lines()
                .map(str::trim)
                .filter(|line| {
                    !line.is_empty() && !line.starts_with('#') && !line.starts_with('!')
                })
                .map(str::to_string)
                .collect()
        }
        Err(err) => {
            warn!("Failed to read {}: {}", ignore_file.display(), err);
            Vec::new()
        }
    }
}
/// Split a comma- and/or whitespace-separated CLI value into individual
/// glob patterns.
///
/// Accepts inputs like `"src/**,*.rs docs"` and returns the non-empty
/// tokens in order; empty segments (`"a,,b"`, stray spaces) contribute
/// nothing.
fn parse_pattern_list(value: &str) -> Vec<String> {
    // `split_whitespace` never yields empty or whitespace-padded tokens,
    // so the trim and is-empty filter steps previously applied here were
    // no-ops and have been dropped.
    value
        .split(',')
        .flat_map(str::split_whitespace)
        .map(str::to_string)
        .collect()
}
/// Canonicalize glob patterns and drop duplicates while preserving the
/// first-seen order.
///
/// Each trimmed, non-empty pattern is rewritten as follows:
/// * a trailing `/` (directory shorthand) becomes `dir/**`;
/// * a bare name containing no path separator (`/` or `\`) and no `**`
///   is anchored anywhere in the tree as `**/name`;
/// * anything else passes through unchanged.
fn normalize_patterns(patterns: Vec<String>) -> Vec<String> {
    let mut seen = HashSet::new();
    let mut unique = Vec::new();
    for raw in patterns {
        let pattern = raw.trim();
        if pattern.is_empty() {
            continue;
        }
        let normalized = if pattern.ends_with('/') {
            format!("{}**", pattern)
        } else if pattern.contains('/') || pattern.contains('\\') || pattern.contains("**") {
            pattern.to_string()
        } else {
            format!("**/{}", pattern)
        };
        if seen.insert(normalized.clone()) {
            unique.push(normalized);
        }
    }
    unique
}
/// Rewrite every report file's content in place so each line carries a
/// line-number gutter (see `add_line_numbers`).
fn apply_line_numbers_to_files(files: &mut [ReportFile]) {
    files
        .iter_mut()
        .for_each(|file| file.content = add_line_numbers(&file.content));
}
/// Prefix each line of `content` with a right-aligned line number and a
/// ` | ` separator, e.g. `"  1 | fn main() {"`.
///
/// The gutter width is the digit count of the total line count, with a
/// minimum of 3 columns so short files still align. Note that a trailing
/// `\n` in `content` produces a final empty element from `split('\n')`,
/// which is numbered like any other line — this preserves the original
/// behavior.
fn add_line_numbers(content: &str) -> String {
    // `write!` through `fmt::Write` appends straight into the output
    // buffer instead of allocating a temporary String per line, as the
    // previous per-line `format!` did.
    use std::fmt::Write as _;

    let lines: Vec<&str> = content.split('\n').collect();
    let width = lines.len().max(1).to_string().len().max(3);
    let mut numbered = String::with_capacity(content.len() + lines.len() * (width + 3));
    for (idx, line) in lines.iter().enumerate() {
        write!(numbered, "{:width$} | {}", idx + 1, line, width = width)
            .expect("writing to a String cannot fail");
        if idx + 1 < lines.len() {
            numbered.push('\n');
        }
    }
    numbered
}