# Code Digest: .
## Statistics
- Total files: 13
- Total size: 98.47 KB
### Files by type:
- Rust: 13
## File Structure
```
.
├── core/
│ ├── walker.rs
│ ├── token.rs
│ ├── digest.rs
│ ├── mod.rs
│ └── prioritizer.rs
├── utils/
│ ├── error.rs
│ ├── file_ext.rs
│ └── mod.rs
├── remote.rs
├── config.rs
├── lib.rs
├── main.rs
└── cli.rs
```
## Table of Contents
- [main.rs](#main-rs)
- [lib.rs](#lib-rs)
- [cli.rs](#cli-rs)
- [config.rs](#config-rs)
- [core/digest.rs](#core-digest-rs)
- [core/mod.rs](#core-mod-rs)
- [core/prioritizer.rs](#core-prioritizer-rs)
- [core/token.rs](#core-token-rs)
- [core/walker.rs](#core-walker-rs)
- [remote.rs](#remote-rs)
- [utils/error.rs](#utils-error-rs)
- [utils/file_ext.rs](#utils-file_ext-rs)
- [utils/mod.rs](#utils-mod-rs)
## main.rs
```rust
use anyhow::Result;
use clap::Parser;
use code_digest::{cli::Config, run};
fn main() -> Result<()> {
    // Parse CLI arguments, overlay any config-file defaults, then run the
    // full digest pipeline. Errors propagate to anyhow's default reporter.
    let mut config = Config::parse();
    config.load_from_file()?;
    run(config)
}
```
## lib.rs
```rust
//! Code Digest - High-performance CLI tool to convert codebases to Markdown for LLM context
//!
//! This library provides the core functionality for traversing directories,
//! processing files, and generating formatted Markdown output suitable for
//! large language model consumption.
pub mod cli;
pub mod config;
pub mod core;
pub mod remote;
pub mod utils;
use anyhow::Result;
use std::path::Path;
pub use cli::Config;
pub use core::{digest::DigestOptions, walker::WalkOptions};
pub use utils::error::CodeDigestError;
/// Main entry point for the code digest library.
///
/// Orchestrates the whole pipeline: optional remote clone, verbose logging,
/// config validation, per-directory walking/prioritizing/markdown generation,
/// and finally output dispatch (file, LLM subprocess, or stdout).
///
/// # Errors
/// Returns an error if validation fails, any directory cannot be processed,
/// or the chosen output action (file write / LLM subprocess) fails.
pub fn run(mut config: Config) -> Result<()> {
// Handle remote repository if specified.
// NOTE: the TempDir binding must outlive all processing below — dropping it
// deletes the cloned checkout, so it is held in `_temp_dir` until return.
let _temp_dir = if let Some(repo_url) = &config.repo {
if config.verbose {
eprintln!("🔧 Starting code-digest with remote repository: {repo_url}");
}
// Fetch the repository
let temp_dir = crate::remote::fetch_repository(repo_url, config.verbose)?;
let repo_path = crate::remote::get_repo_path(&temp_dir, repo_url)?;
// Update config to use the cloned repository
config.directories = vec![repo_path];
Some(temp_dir) // Keep temp_dir alive until end of function
} else {
None
};
// Setup logging based on verbosity
if config.verbose {
eprintln!("🔧 Starting code-digest with configuration:");
eprintln!(" Directories: {:?}", config.directories);
eprintln!(" Max tokens: {:?}", config.max_tokens);
eprintln!(" LLM tool: {}", config.llm_tool.command());
eprintln!(" Progress: {}", config.progress);
eprintln!(" Quiet: {}", config.quiet);
if let Some(output) = &config.output_file {
eprintln!(" Output file: {}", output.display());
}
if let Some(prompt) = &config.prompt {
eprintln!(" Prompt: {prompt}");
}
}
// Validate configuration (directory existence, URL shape, option conflicts)
config.validate()?;
// Create walker with options
if config.verbose {
eprintln!("🚶 Creating directory walker with options...");
}
let walk_options = WalkOptions::from_config(&config)?;
// Create digest options
if config.verbose {
eprintln!("📄 Creating markdown digest options...");
}
let digest_options = DigestOptions::from_config(&config)?;
// Process all directories independently; options are cloned per directory.
let mut all_outputs = Vec::new();
for (index, directory) in config.directories.iter().enumerate() {
if config.progress && !config.quiet && config.directories.len() > 1 {
eprintln!(
"📂 Processing directory {} of {}: {}",
index + 1,
config.directories.len(),
directory.display()
);
}
let output =
process_directory(directory, walk_options.clone(), digest_options.clone(), &config)?;
all_outputs.push((directory.clone(), output));
}
// Combine outputs from all directories: a single directory passes through
// untouched; multiple directories are concatenated under per-path headers.
let output = if all_outputs.len() == 1 {
// Single directory - return output as-is
all_outputs.into_iter().next().unwrap().1
} else {
// Multiple directories - combine with headers
let mut combined = String::new();
combined.push_str("# Code Digest - Multiple Directories\n\n");
for (path, content) in all_outputs {
combined.push_str(&format!("## Directory: {}\n\n", path.display()));
combined.push_str(&content);
combined.push_str("\n\n");
}
combined
};
// Handle output based on configuration
match (config.output_file.as_ref(), config.prompt.as_ref()) {
(Some(file), None) => {
// Write to file
std::fs::write(file, output)?;
if !config.quiet {
println!(" Written to {}", file.display());
}
}
(None, Some(prompt)) => {
// Send to LLM CLI with prompt
if config.progress && !config.quiet {
eprintln!("🤖 Sending context to {}...", config.llm_tool.command());
}
execute_with_llm(prompt, &output, &config)?;
}
(None, None) => {
// Print to stdout
print!("{output}");
}
// Defensive: validate() already rejects this combination, so this arm
// should be unreachable in practice.
(Some(_), Some(_)) => {
return Err(CodeDigestError::InvalidConfiguration(
"Cannot specify both output file and prompt".to_string(),
)
.into());
}
}
Ok(())
}
/// Process a single directory and generate its markdown digest.
///
/// Walks the tree with `walk_options`, optionally prioritizes files when a
/// token budget is set, then renders everything to markdown.
///
/// # Errors
/// Propagates failures from walking, prioritizing, or markdown generation.
fn process_directory(
path: &Path,
walk_options: WalkOptions,
digest_options: DigestOptions,
config: &Config,
) -> Result<String> {
// Walk the directory
if config.progress && !config.quiet {
eprintln!("🔍 Scanning directory: {}", path.display());
}
let files = core::walker::walk_directory(path, walk_options)?;
if config.progress && !config.quiet {
eprintln!("📁 Found {} files", files.len());
}
if config.verbose {
eprintln!("📋 File list:");
for file in &files {
eprintln!(" {} ({})", file.relative_path.display(), file.file_type_display());
}
}
// Prioritize files only when a token budget applies; otherwise keep the
// full walker output unchanged.
let prioritized_files = if digest_options.max_tokens.is_some() {
if config.progress && !config.quiet {
eprintln!("🎯 Prioritizing files for token limit...");
}
core::prioritizer::prioritize_files(files, &digest_options)?
} else {
files
};
if config.progress && !config.quiet {
eprintln!("📝 Generating markdown from {} files...", prioritized_files.len());
}
// Generate markdown
let markdown = core::digest::generate_markdown(prioritized_files, digest_options)?;
if config.progress && !config.quiet {
eprintln!("✅ Markdown generation complete");
}
Ok(markdown)
}
/// Execute the configured LLM CLI, piping `prompt` + context to its stdin.
///
/// The child inherits stdout/stderr so the LLM response streams directly to
/// the user's terminal.
///
/// # Errors
/// Returns `LlmToolNotFound` (with install instructions) when the binary is
/// missing, or `SubprocessError` on spawn failure / non-zero exit.
fn execute_with_llm(prompt: &str, context: &str, config: &Config) -> Result<()> {
use std::io::Write;
use std::process::{Command, Stdio};
// Prompt first, then the generated markdown context.
let full_input = format!("{prompt}\n\n{context}");
let tool_command = config.llm_tool.command();
let mut child = Command::new(tool_command)
.stdin(Stdio::piped())
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.map_err(|e| {
// A NotFound spawn error means the tool binary isn't on PATH.
if e.kind() == std::io::ErrorKind::NotFound {
CodeDigestError::LlmToolNotFound {
tool: tool_command.to_string(),
install_instructions: config.llm_tool.install_instructions().to_string(),
}
} else {
CodeDigestError::SubprocessError(e.to_string())
}
})?;
// Write the input, then drop the handle so the child sees EOF on stdin.
if let Some(mut stdin) = child.stdin.take() {
stdin.write_all(full_input.as_bytes())?;
stdin.flush()?;
}
let status = child.wait()?;
if !status.success() {
return Err(CodeDigestError::SubprocessError(format!(
"{tool_command} exited with status: {status}"
))
.into());
}
if !config.quiet {
eprintln!("\n✓ {tool_command} completed successfully");
}
Ok(())
}
```
## cli.rs
```rust
//! Command-line interface configuration and parsing
use clap::{Parser, ValueEnum};
use std::path::PathBuf;
/// Supported LLM CLI tools
// Each variant maps to an external executable; `command()` returns its name
// and `install_instructions()` a hint shown when the binary is missing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
pub enum LlmTool {
/// Use gemini (default)
// `Default` resolves to Gemini; config-file overrides key off this.
#[value(name = "gemini")]
#[default]
Gemini,
/// Use codex CLI
#[value(name = "codex")]
Codex,
}
impl LlmTool {
/// Get the command name for the tool
pub fn command(&self) -> &'static str {
match self {
LlmTool::Gemini => "gemini",
LlmTool::Codex => "codex",
}
}
/// Get the installation instructions for the tool
pub fn install_instructions(&self) -> &'static str {
match self {
LlmTool::Gemini => "Please install gemini with: pip install gemini",
LlmTool::Codex => {
"Please install codex CLI from: https://github.com/microsoft/codex-cli"
}
}
}
}
/// High-performance CLI tool to convert codebases to Markdown for LLM context
// NOTE: the `///` field docs below double as clap help text — keep them short.
#[derive(Parser, Debug, Clone)]
#[command(author, version, about, long_about = None)]
pub struct Config {
/// The prompt to send to the LLM. If omitted, only generates the Markdown context
#[arg(value_name = "PROMPT")]
pub prompt: Option<String>,
/// The paths to the directories to process
// Mutually exclusive with `repo`; defaults to the current directory.
#[arg(short = 'd', long, default_value = ".", num_args = 1.., conflicts_with = "repo")]
pub directories: Vec<PathBuf>,
/// GitHub repository URL to analyze (e.g., <https://github.com/owner/repo>)
// When set, `run()` clones the repo and rewrites `directories` to point at it.
#[arg(long, conflicts_with = "directories")]
pub repo: Option<String>,
/// The path to the output Markdown file. If used, won't call the LLM CLI
// Mutually exclusive with `prompt` (enforced in `validate()`).
#[arg(short = 'o', long)]
pub output_file: Option<PathBuf>,
/// Maximum number of tokens for the generated codebase context
// When set, files are prioritized/truncated to fit the budget.
#[arg(long)]
pub max_tokens: Option<usize>,
/// LLM CLI tool to use for processing
#[arg(short = 't', long = "tool", default_value = "gemini")]
pub llm_tool: LlmTool,
/// Suppress all output except for errors and the final LLM response
#[arg(short = 'q', long)]
pub quiet: bool,
/// Enable verbose logging
#[arg(short = 'v', long)]
pub verbose: bool,
/// Path to configuration file
// If absent, default locations are probed (see ConfigFile::load_default).
#[arg(short = 'c', long)]
pub config: Option<PathBuf>,
/// Show progress indicators during processing
#[arg(long)]
pub progress: bool,
}
impl Config {
/// Validate the configuration.
///
/// Checks (in order): GitHub URL shape when `repo` is set, otherwise
/// existence/kind of every entry in `directories`; existence of the
/// output file's parent directory; and the `output_file`/`prompt`
/// mutual exclusion.
///
/// # Errors
/// `InvalidConfiguration` or `InvalidPath` describing the first problem found.
pub fn validate(&self) -> Result<(), crate::utils::error::CodeDigestError> {
use crate::utils::error::CodeDigestError;
// Validate repo URL if provided
if let Some(repo_url) = &self.repo {
if !repo_url.starts_with("https://github.com/")
&& !repo_url.starts_with("http://github.com/")
{
return Err(CodeDigestError::InvalidConfiguration(
"Repository URL must be a GitHub URL (https://github.com/owner/repo)"
.to_string(),
));
}
} else {
// Only validate directories if repo is not provided
for directory in &self.directories {
if !directory.exists() {
return Err(CodeDigestError::InvalidPath(format!(
"Directory does not exist: {}",
directory.display()
)));
}
if !directory.is_dir() {
return Err(CodeDigestError::InvalidPath(format!(
"Path is not a directory: {}",
directory.display()
)));
}
}
}
// Validate output file parent directory exists if specified
if let Some(output) = &self.output_file {
if let Some(parent) = output.parent() {
// Handle empty parent (current directory) and check if parent exists
if !parent.as_os_str().is_empty() && !parent.exists() {
return Err(CodeDigestError::InvalidPath(format!(
"Output directory does not exist: {}",
parent.display()
)));
}
}
}
// Validate mutually exclusive options
if self.output_file.is_some() && self.prompt.is_some() {
return Err(CodeDigestError::InvalidConfiguration(
"Cannot specify both --output and a prompt".to_string(),
));
}
Ok(())
}
/// Load configuration from file if specified.
///
/// Uses `--config PATH` when given; otherwise probes the default
/// locations via `ConfigFile::load_default`. Any loaded file only fills
/// in values the CLI left at their defaults.
///
/// # Errors
/// Propagates read/parse errors from the config file.
pub fn load_from_file(&mut self) -> Result<(), crate::utils::error::CodeDigestError> {
use crate::config::ConfigFile;
let config_file = if let Some(ref config_path) = self.config {
// Load from specified config file
Some(ConfigFile::load_from_file(config_path)?)
} else {
// Try to load from default locations
ConfigFile::load_default()?
};
if let Some(config_file) = config_file {
config_file.apply_to_cli_config(self);
if self.verbose {
if let Some(ref config_path) = self.config {
eprintln!("📄 Loaded configuration from: {}", config_path.display());
} else {
eprintln!("📄 Loaded configuration from default location");
}
}
}
Ok(())
}
}
// Unit tests for CLI parsing (clap) and `Config::validate` behavior.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
// --- validate(): directory checks ---
#[test]
fn test_config_validation_valid_directory() {
let temp_dir = TempDir::new().unwrap();
let config = Config {
prompt: None,
directories: vec![temp_dir.path().to_path_buf()],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_ok());
}
#[test]
fn test_config_validation_invalid_directory() {
let config = Config {
prompt: None,
directories: vec![PathBuf::from("/nonexistent/directory")],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_err());
}
#[test]
fn test_config_validation_file_as_directory() {
// A regular file passed as a directory must be rejected.
let temp_dir = TempDir::new().unwrap();
let file_path = temp_dir.path().join("file.txt");
fs::write(&file_path, "test").unwrap();
let config = Config {
prompt: None,
directories: vec![file_path],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_err());
}
// --- validate(): output file checks ---
#[test]
fn test_config_validation_invalid_output_directory() {
let temp_dir = TempDir::new().unwrap();
let config = Config {
prompt: None,
directories: vec![temp_dir.path().to_path_buf()],
repo: None,
output_file: Some(PathBuf::from("/nonexistent/directory/output.md")),
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_err());
}
#[test]
fn test_config_validation_mutually_exclusive_options() {
// prompt + output_file together must be rejected.
let temp_dir = TempDir::new().unwrap();
let config = Config {
prompt: Some("test prompt".to_string()),
directories: vec![temp_dir.path().to_path_buf()],
repo: None,
output_file: Some(temp_dir.path().join("output.md")),
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_err());
}
// --- LlmTool helpers ---
#[test]
fn test_llm_tool_enum_values() {
assert_eq!(LlmTool::Gemini.command(), "gemini");
assert_eq!(LlmTool::Codex.command(), "codex");
assert!(LlmTool::Gemini.install_instructions().contains("pip install"));
assert!(LlmTool::Codex.install_instructions().contains("github.com"));
assert_eq!(LlmTool::default(), LlmTool::Gemini);
}
#[test]
fn test_config_validation_output_file_in_current_dir() {
// A bare filename has an empty parent path, which is treated as "."
let temp_dir = TempDir::new().unwrap();
let config = Config {
prompt: None,
directories: vec![temp_dir.path().to_path_buf()],
repo: None,
output_file: Some(PathBuf::from("output.md")),
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
// Should not error for files in current directory
assert!(config.validate().is_ok());
}
#[test]
fn test_config_load_from_file_no_config() {
let temp_dir = TempDir::new().unwrap();
let mut config = Config {
prompt: None,
directories: vec![temp_dir.path().to_path_buf()],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
// Should not error when no config file is found
// NOTE(review): this can still pick up a real config file from the
// default locations if one exists in the test environment.
assert!(config.load_from_file().is_ok());
}
// --- clap parsing of -d / multiple directories / "--" separator ---
#[test]
fn test_parse_multiple_directories() {
use clap::Parser;
// Test single directory (backward compatibility)
let args = vec!["code-digest", "-d", "/path/one"];
let config = Config::parse_from(args);
assert_eq!(config.directories.len(), 1);
assert_eq!(config.directories[0], PathBuf::from("/path/one"));
}
#[test]
fn test_parse_multiple_directories_new_api() {
use clap::Parser;
// Test single directory (backward compatibility)
let args = vec!["code-digest", "-d", "/path/one"];
let config = Config::parse_from(args);
assert_eq!(config.directories.len(), 1);
assert_eq!(config.directories[0], PathBuf::from("/path/one"));
// Test multiple directories
let args = vec!["code-digest", "-d", "/path/one", "/path/two", "/path/three"];
let config = Config::parse_from(args);
assert_eq!(config.directories.len(), 3);
assert_eq!(config.directories[0], PathBuf::from("/path/one"));
assert_eq!(config.directories[1], PathBuf::from("/path/two"));
assert_eq!(config.directories[2], PathBuf::from("/path/three"));
// Test with prompt after directories using -- separator
let args = vec![
"code-digest",
"-d",
"/src/module1",
"/src/module2",
"--",
"Find duplicated patterns",
];
let config = Config::parse_from(args);
assert_eq!(config.directories.len(), 2);
assert_eq!(config.prompt, Some("Find duplicated patterns".to_string()));
}
#[test]
fn test_validate_multiple_directories() {
let temp_dir = TempDir::new().unwrap();
let dir1 = temp_dir.path().join("dir1");
let dir2 = temp_dir.path().join("dir2");
fs::create_dir(&dir1).unwrap();
fs::create_dir(&dir2).unwrap();
// All directories exist - should succeed
let config = Config {
prompt: None,
directories: vec![dir1.clone(), dir2.clone()],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_ok());
// One directory doesn't exist - should fail
let config = Config {
prompt: None,
directories: vec![dir1, PathBuf::from("/nonexistent/dir")],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_err());
}
#[test]
fn test_validate_files_as_directories() {
let temp_dir = TempDir::new().unwrap();
let dir1 = temp_dir.path().join("dir1");
let file1 = temp_dir.path().join("file.txt");
fs::create_dir(&dir1).unwrap();
fs::write(&file1, "test content").unwrap();
// Mix of directory and file - should fail
let config = Config {
prompt: None,
directories: vec![dir1, file1],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
assert!(config.validate().is_err());
}
}
```
## config.rs
```rust
//! Configuration file support for code-digest
//!
//! This module handles loading and parsing configuration files in TOML format.
//! Configuration files can specify defaults for CLI options and additional
//! settings like file priorities and ignore patterns.
use crate::cli::{Config as CliConfig, LlmTool};
use crate::utils::error::CodeDigestError;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
/// Configuration file structure
// Every section carries #[serde(default)], so a config file may contain any
// subset of these keys (including none at all).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ConfigFile {
/// Default settings
#[serde(default)]
pub defaults: Defaults,
/// File priority configurations
#[serde(default)]
pub priorities: Vec<Priority>,
/// Ignore patterns beyond .gitignore and .digestignore
#[serde(default)]
pub ignore: Vec<String>,
/// Include patterns to force inclusion
#[serde(default)]
pub include: Vec<String>,
}
/// Default configuration settings
// These mirror the CLI flags; `apply_to_cli_config` copies each one into the
// CLI config only when the CLI left it unset/at its default.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Defaults {
/// Default maximum tokens
pub max_tokens: Option<usize>,
/// Default LLM tool
// Free-form string; only "gemini"/"codex" are recognized, others ignored.
#[serde(default)]
pub llm_tool: Option<String>,
/// Default to show progress
#[serde(default)]
pub progress: bool,
/// Default verbosity
#[serde(default)]
pub verbose: bool,
/// Default quiet mode
#[serde(default)]
pub quiet: bool,
/// Default directory
// Applied only when the CLI used the "." default directory.
pub directory: Option<PathBuf>,
/// Default output file
pub output_file: Option<PathBuf>,
}
/// File priority configuration
// Consumed by the prioritizer when a token budget forces file selection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Priority {
/// Glob pattern to match files
pub pattern: String,
/// Priority weight (higher = more important)
pub weight: f32,
}
impl ConfigFile {
/// Load configuration from a TOML file at `path`.
///
/// # Errors
/// `InvalidPath` if the file is missing; `ConfigError` on read or parse
/// failure (with the offending path in the message).
pub fn load_from_file(path: &Path) -> Result<Self, CodeDigestError> {
if !path.exists() {
return Err(CodeDigestError::InvalidPath(format!(
"Configuration file does not exist: {}",
path.display()
)));
}
let content = std::fs::read_to_string(path).map_err(|e| {
CodeDigestError::ConfigError(format!(
"Failed to read config file {}: {}",
path.display(),
e
))
})?;
let config: ConfigFile = toml::from_str(&content).map_err(|e| {
CodeDigestError::ConfigError(format!(
"Failed to parse config file {}: {}",
path.display(),
e
))
})?;
Ok(config)
}
/// Load configuration from default locations.
///
/// Probed in order (first hit wins): `./.code-digest.toml`,
/// `./.digestrc.toml`, then `~/.code-digest.toml`. Returns `Ok(None)`
/// when no config file exists anywhere.
pub fn load_default() -> Result<Option<Self>, CodeDigestError> {
// Try .code-digest.toml in current directory
let local_config = Path::new(".code-digest.toml");
if local_config.exists() {
return Ok(Some(Self::load_from_file(local_config)?));
}
// Try .digestrc.toml in current directory
let rc_config = Path::new(".digestrc.toml");
if rc_config.exists() {
return Ok(Some(Self::load_from_file(rc_config)?));
}
// Try in home directory
if let Some(home) = dirs::home_dir() {
let home_config = home.join(".code-digest.toml");
if home_config.exists() {
return Ok(Some(Self::load_from_file(&home_config)?));
}
}
Ok(None)
}
/// Apply configuration defaults to CLI config.
///
/// CLI-provided values always win; file values fill the gaps. Boolean
/// flags can only be turned ON by the file — a `false` on the CLI is
/// indistinguishable from "not given", so the file cannot turn them off.
pub fn apply_to_cli_config(&self, cli_config: &mut CliConfig) {
// Only apply defaults if CLI didn't specify them
if cli_config.max_tokens.is_none() && self.defaults.max_tokens.is_some() {
cli_config.max_tokens = self.defaults.max_tokens;
}
if let Some(ref tool_str) = self.defaults.llm_tool {
// Only apply if CLI used default
// NOTE: an explicit `--tool gemini` on the CLI is indistinguishable
// from the default, so a file default can still override it.
if cli_config.llm_tool == LlmTool::default() {
match tool_str.as_str() {
"gemini" => cli_config.llm_tool = LlmTool::Gemini,
"codex" => cli_config.llm_tool = LlmTool::Codex,
_ => {} // Ignore invalid tool names
}
}
}
// Apply boolean defaults only if they weren't explicitly set
if !cli_config.progress && self.defaults.progress {
cli_config.progress = self.defaults.progress;
}
if !cli_config.verbose && self.defaults.verbose {
cli_config.verbose = self.defaults.verbose;
}
if !cli_config.quiet && self.defaults.quiet {
cli_config.quiet = self.defaults.quiet;
}
// Apply directory default if CLI used default (".")
if cli_config.directories.len() == 1
&& cli_config.directories[0] == PathBuf::from(".")
&& self.defaults.directory.is_some()
{
cli_config.directories = vec![self.defaults.directory.clone().unwrap()];
}
// Apply output file default if not specified
if cli_config.output_file.is_none() && self.defaults.output_file.is_some() {
cli_config.output_file = self.defaults.output_file.clone();
}
}
}
/// Build the contents of an example configuration file.
///
/// Returns pretty-printed TOML demonstrating defaults, file priorities, and
/// ignore/include patterns; on the (unlikely) serialization failure it
/// returns a TOML comment instead.
pub fn create_example_config() -> String {
    // Small helpers to keep the literal below compact.
    let priority = |pattern: &str, weight: f32| Priority { pattern: pattern.to_string(), weight };
    let ignore: Vec<String> = ["target/**", "node_modules/**", "*.pyc", ".env"]
        .iter()
        .map(|s| s.to_string())
        .collect();

    let example = ConfigFile {
        defaults: Defaults {
            max_tokens: Some(150000),
            llm_tool: Some("gemini".to_string()),
            progress: true,
            verbose: false,
            quiet: false,
            directory: None,
            output_file: None,
        },
        priorities: vec![
            priority("src/**/*.rs", 100.0),
            priority("src/main.rs", 150.0),
            priority("tests/**/*.rs", 50.0),
            priority("docs/**/*.md", 30.0),
            priority("*.toml", 80.0),
            priority("*.json", 60.0),
        ],
        ignore,
        include: vec!["!important/**".to_string()],
    };

    toml::to_string_pretty(&example)
        .unwrap_or_else(|_| "# Failed to generate example config".to_string())
}
// Unit tests for TOML parsing, file loading, and CLI-default merging.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
#[test]
fn test_config_file_parsing() {
// Full round-trip through toml::from_str with all sections present.
let config_content = r#"
ignore = [
"target/**",
"node_modules/**"
]
include = [
"!important/**"
]
[defaults]
max_tokens = 100000
llm_tool = "gemini"
progress = true
[[priorities]]
pattern = "src/**/*.rs"
weight = 100.0
[[priorities]]
pattern = "tests/**/*.rs"
weight = 50.0
"#;
let config: ConfigFile = toml::from_str(config_content).unwrap();
assert_eq!(config.defaults.max_tokens, Some(100000));
assert_eq!(config.defaults.llm_tool, Some("gemini".to_string()));
assert!(config.defaults.progress);
assert_eq!(config.priorities.len(), 2);
assert_eq!(config.priorities[0].pattern, "src/**/*.rs");
assert_eq!(config.priorities[0].weight, 100.0);
assert_eq!(config.ignore.len(), 2);
assert_eq!(config.include.len(), 1);
}
#[test]
fn test_config_file_loading() {
// Partial config: unspecified sections fall back to serde defaults.
let temp_dir = TempDir::new().unwrap();
let config_path = temp_dir.path().join("config.toml");
let config_content = r#"
[defaults]
max_tokens = 50000
progress = true
"#;
fs::write(&config_path, config_content).unwrap();
let config = ConfigFile::load_from_file(&config_path).unwrap();
assert_eq!(config.defaults.max_tokens, Some(50000));
assert!(config.defaults.progress);
}
#[test]
fn test_apply_to_cli_config() {
// File defaults should fill every CLI field left at its default.
let config_file = ConfigFile {
defaults: Defaults {
max_tokens: Some(75000),
llm_tool: Some("codex".to_string()),
progress: true,
verbose: true,
quiet: false,
directory: Some(PathBuf::from("/tmp")),
output_file: Some(PathBuf::from("output.md")),
},
priorities: vec![],
ignore: vec![],
include: vec![],
};
let mut cli_config = CliConfig {
prompt: None,
directories: vec![PathBuf::from(".")],
repo: None,
output_file: None,
max_tokens: None,
llm_tool: LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
};
config_file.apply_to_cli_config(&mut cli_config);
assert_eq!(cli_config.max_tokens, Some(75000));
assert_eq!(cli_config.llm_tool, LlmTool::Codex);
assert!(cli_config.progress);
assert!(cli_config.verbose);
assert_eq!(cli_config.directories, vec![PathBuf::from("/tmp")]);
assert_eq!(cli_config.output_file, Some(PathBuf::from("output.md")));
}
#[test]
fn test_example_config_generation() {
let example = create_example_config();
assert!(example.contains("[defaults]"));
assert!(example.contains("max_tokens"));
assert!(example.contains("[[priorities]]"));
assert!(example.contains("pattern"));
assert!(example.contains("weight"));
}
}
```
## core/digest.rs
```rust
//! Markdown generation functionality
use crate::core::walker::FileInfo;
use crate::utils::file_ext::FileType;
use anyhow::Result;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
/// Options for generating markdown digest
#[derive(Debug, Clone)]
pub struct DigestOptions {
/// Maximum tokens allowed in the output
pub max_tokens: Option<usize>,
/// Include file tree in output
pub include_tree: bool,
/// Include token count statistics
pub include_stats: bool,
/// Group files by type
pub group_by_type: bool,
/// Sort files by priority
pub sort_by_priority: bool,
/// Template for file headers
// "{path}" is replaced with the file's relative path.
pub file_header_template: String,
/// Template for the document header
// "{directory}" is replaced when the document header is rendered.
pub doc_header_template: String,
/// Include table of contents
pub include_toc: bool,
}
impl DigestOptions {
    /// Create `DigestOptions` from the CLI config.
    ///
    /// Only `max_tokens` comes from the CLI; every other field takes the
    /// crate-wide default. Using struct-update syntax keeps this in sync
    /// with `impl Default` (previously the defaults were duplicated here
    /// verbatim, which risked the two drifting apart).
    pub fn from_config(config: &crate::cli::Config) -> Result<Self> {
        Ok(DigestOptions { max_tokens: config.max_tokens, ..Default::default() })
    }
}
// Crate-wide default digest settings: tree, stats, and TOC on; grouping off;
// priority sorting on; no token limit.
impl Default for DigestOptions {
fn default() -> Self {
DigestOptions {
max_tokens: None,
include_tree: true,
include_stats: true,
group_by_type: false,
sort_by_priority: true,
file_header_template: "## {path}".to_string(),
doc_header_template: "# Code Digest: {directory}".to_string(),
include_toc: true,
}
}
}
/// Generate the full markdown document from a list of files.
///
/// Sections are emitted in order: document header, statistics, file tree,
/// table of contents, then each file's contents (optionally grouped by
/// type). Note that stats and the tree are rendered BEFORE the priority
/// sort, while the TOC and contents follow the sorted order.
pub fn generate_markdown(files: Vec<FileInfo>, options: DigestOptions) -> Result<String> {
let mut output = String::new();
// Add document header
// NOTE(review): "{directory}" is always replaced with "." here, even for
// digests of other paths — confirm whether the real path should be used.
if !options.doc_header_template.is_empty() {
let header = options.doc_header_template.replace("{directory}", ".");
output.push_str(&header);
output.push_str("\n\n");
}
// Add statistics if requested
if options.include_stats {
let stats = generate_statistics(&files);
output.push_str(&stats);
output.push_str("\n\n");
}
// Add file tree if requested
if options.include_tree {
let tree = generate_file_tree(&files);
output.push_str("## File Structure\n\n");
output.push_str("```\n");
output.push_str(&tree);
output.push_str("```\n\n");
}
// Sort files if requested: highest priority first, ties broken by path.
let mut files = files;
if options.sort_by_priority {
files.sort_by(|a, b| {
b.priority
.partial_cmp(&a.priority)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.relative_path.cmp(&b.relative_path))
});
}
// Add table of contents if requested
if options.include_toc {
output.push_str("## Table of Contents\n\n");
for file in &files {
let anchor = path_to_anchor(&file.relative_path);
output.push_str(&format!(
"- [{path}](#{anchor})\n",
path = file.relative_path.display(),
anchor = anchor
));
}
output.push('\n');
}
// Group files if requested
if options.group_by_type {
let grouped = group_files_by_type(files);
for (file_type, group_files) in grouped {
output.push_str(&format!("## {} Files\n\n", file_type_display(&file_type)));
for file in group_files {
append_file_content(&mut output, &file, &options)?;
}
}
} else {
// Add all files
for file in files {
append_file_content(&mut output, &file, &options)?;
}
}
Ok(output)
}
/// Append one file's header and fenced contents to `output`.
///
/// Files that cannot be read as UTF-8 text (binary, permissions, etc.) are
/// skipped with a warning on stderr rather than failing the whole digest.
fn append_file_content(
    output: &mut String,
    file: &FileInfo,
    options: &DigestOptions,
) -> Result<()> {
    // Skip unreadable files with a warning.
    let content = match fs::read_to_string(&file.path) {
        Err(err) => {
            eprintln!("Warning: Could not read file {}: {}", file.path.display(), err);
            return Ok(());
        }
        Ok(text) => text,
    };

    // Header line, e.g. "## src/main.rs".
    let rel = file.relative_path.display().to_string();
    let header = options.file_header_template.replace("{path}", &rel);
    output.push_str(&header);
    output.push_str("\n\n");

    // Fenced code block with a language hint for syntax highlighting;
    // guarantee the closing fence starts on its own line.
    let language = get_language_hint(&file.file_type);
    output.push_str(&format!("```{language}\n"));
    output.push_str(&content);
    if !content.ends_with('\n') {
        output.push('\n');
    }
    output.push_str("```\n\n");
    Ok(())
}
/// Generate the "## Statistics" section: file count, total size, and a
/// per-type breakdown sorted by descending count.
fn generate_statistics(files: &[FileInfo]) -> String {
    let total_files = files.len();
    let total_size: u64 = files.iter().map(|f| f.size).sum();
    // Count files per detected type.
    let mut type_counts: HashMap<FileType, usize> = HashMap::new();
    for file in files {
        *type_counts.entry(file.file_type.clone()).or_insert(0) += 1;
    }
    let mut stats = String::new();
    stats.push_str("## Statistics\n\n");
    stats.push_str(&format!("- Total files: {total_files}\n"));
    // BUG FIX: `format_size` already renders a unit suffix, so appending a
    // literal " bytes" produced doubled units like "98.47 KB bytes".
    stats.push_str(&format!("- Total size: {}\n", format_size(total_size)));
    stats.push_str("\n### Files by type:\n");
    // Most common types first.
    let mut types: Vec<_> = type_counts.into_iter().collect();
    types.sort_by_key(|(_, count)| std::cmp::Reverse(*count));
    for (file_type, count) in types {
        stats.push_str(&format!("- {}: {}\n", file_type_display(&file_type), count));
    }
    stats
}
/// Generate an ASCII tree representation of the file list, rooted at ".".
fn generate_file_tree(files: &[FileInfo]) -> String {
    use std::collections::BTreeMap;
    // Intermediate structure: sorted child directories plus leaf file names.
    #[derive(Default)]
    struct TreeNode {
        files: Vec<String>,
        dirs: BTreeMap<String, TreeNode>,
    }
    let mut root = TreeNode::default();
    // Insert every relative path, component by component.
    for file in files {
        let parts: Vec<_> = file
            .relative_path
            .components()
            .map(|c| c.as_os_str().to_string_lossy().to_string())
            .collect();
        let mut current = &mut root;
        for (i, part) in parts.iter().enumerate() {
            if i == parts.len() - 1 {
                // Final component is the file name itself.
                current.files.push(part.clone());
            } else {
                // Intermediate components are directories.
                current = current.dirs.entry(part.clone()).or_default();
            }
        }
    }
    // Recursively render a node: directories first, then files.
    // CLEANUP: the previous version threaded an `_is_last` parameter through
    // the recursion that was never read; it has been removed as dead code.
    fn render_tree(node: &TreeNode, prefix: &str) -> String {
        let mut output = String::new();
        let dir_count = node.dirs.len();
        for (i, (name, child)) in node.dirs.iter().enumerate() {
            // A directory is "last" at this level only if no files follow it.
            let is_last_dir = i == dir_count - 1 && node.files.is_empty();
            let connector = if is_last_dir { "└── " } else { "├── " };
            let extension = if is_last_dir { " " } else { "│ " };
            output.push_str(&format!("{prefix}{connector}{name}/\n"));
            output.push_str(&render_tree(child, &format!("{prefix}{extension}")));
        }
        let file_count = node.files.len();
        for (i, name) in node.files.iter().enumerate() {
            let is_last_file = i == file_count - 1;
            let connector = if is_last_file { "└── " } else { "├── " };
            output.push_str(&format!("{prefix}{connector}{name}\n"));
        }
        output
    }
    let mut output = String::new();
    output.push_str(".\n");
    output.push_str(&render_tree(&root, ""));
    output
}
/// Group files by their detected type, ordered by the fixed type priority.
fn group_files_by_type(files: Vec<FileInfo>) -> Vec<(FileType, Vec<FileInfo>)> {
    // Bucket every file under its type.
    let mut buckets: HashMap<FileType, Vec<FileInfo>> = HashMap::new();
    for entry in files {
        let key = entry.file_type.clone();
        buckets.entry(key).or_default().push(entry);
    }
    // Flatten into a vector ordered by type priority (lower value first).
    let mut grouped: Vec<_> = buckets.into_iter().collect();
    grouped.sort_by_key(|(kind, _)| file_type_priority(kind));
    grouped
}
/// Get display name for file type
///
/// Human-readable label used in statistics and section headings.
/// NOTE(review): this table duplicates `FileInfo::file_type_display` in
/// `core/walker.rs`; a new `FileType` variant must be added in both places.
fn file_type_display(file_type: &FileType) -> &'static str {
    match file_type {
        FileType::Rust => "Rust",
        FileType::Python => "Python",
        FileType::JavaScript => "JavaScript",
        FileType::TypeScript => "TypeScript",
        FileType::Go => "Go",
        FileType::Java => "Java",
        FileType::Cpp => "C++",
        FileType::C => "C",
        FileType::CSharp => "C#",
        FileType::Ruby => "Ruby",
        FileType::Php => "PHP",
        FileType::Swift => "Swift",
        FileType::Kotlin => "Kotlin",
        FileType::Scala => "Scala",
        FileType::Haskell => "Haskell",
        FileType::Markdown => "Markdown",
        FileType::Json => "JSON",
        FileType::Yaml => "YAML",
        FileType::Toml => "TOML",
        FileType::Xml => "XML",
        FileType::Html => "HTML",
        FileType::Css => "CSS",
        FileType::Text => "Text",
        FileType::Other => "Other",
    }
}
/// Get language hint for syntax highlighting
///
/// Returns the fenced-code-block info string for the file type; `Other`
/// yields an empty string so the fence carries no language tag.
fn get_language_hint(file_type: &FileType) -> &'static str {
    match file_type {
        FileType::Rust => "rust",
        FileType::Python => "python",
        FileType::JavaScript => "javascript",
        FileType::TypeScript => "typescript",
        FileType::Go => "go",
        FileType::Java => "java",
        FileType::Cpp => "cpp",
        FileType::C => "c",
        FileType::CSharp => "csharp",
        FileType::Ruby => "ruby",
        FileType::Php => "php",
        FileType::Swift => "swift",
        FileType::Kotlin => "kotlin",
        FileType::Scala => "scala",
        FileType::Haskell => "haskell",
        FileType::Markdown => "markdown",
        FileType::Json => "json",
        FileType::Yaml => "yaml",
        FileType::Toml => "toml",
        FileType::Xml => "xml",
        FileType::Html => "html",
        FileType::Css => "css",
        FileType::Text => "text",
        FileType::Other => "",
    }
}
/// Get priority for file type ordering
///
/// Lower number = earlier in the grouped output. Source languages come
/// first, then docs, then config/markup, then plain text and everything else.
/// Note JavaScript and TypeScript deliberately share priority 3.
fn file_type_priority(file_type: &FileType) -> u8 {
    match file_type {
        FileType::Rust => 1,
        FileType::Python => 2,
        FileType::JavaScript => 3,
        FileType::TypeScript => 3,
        FileType::Go => 4,
        FileType::Java => 5,
        FileType::Cpp => 6,
        FileType::C => 7,
        FileType::CSharp => 8,
        FileType::Ruby => 9,
        FileType::Php => 10,
        FileType::Swift => 11,
        FileType::Kotlin => 12,
        FileType::Scala => 13,
        FileType::Haskell => 14,
        FileType::Markdown => 15,
        FileType::Json => 16,
        FileType::Yaml => 17,
        FileType::Toml => 18,
        FileType::Xml => 19,
        FileType::Html => 20,
        FileType::Css => 21,
        FileType::Text => 22,
        FileType::Other => 23,
    }
}
/// Convert path to anchor-friendly string
///
/// Maps path separators, dots, and spaces to `-` and lowercases the result
/// so it can serve as a Markdown heading anchor.
fn path_to_anchor(path: &Path) -> String {
    let displayed = path.display().to_string();
    displayed
        .chars()
        .map(|c| match c {
            '/' | '\\' | '.' | ' ' => '-',
            other => other,
        })
        .collect::<String>()
        .to_lowercase()
}
/// Format file size in human-readable format
///
/// Scales by 1024 up to GB. Whole bytes are printed without decimals
/// ("512 B"); larger units get two decimal places ("1.50 KB").
fn format_size(size: u64) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB"];
    let mut value = size as f64;
    let mut idx = 0usize;
    loop {
        // Stop at the largest unit or once the value fits below 1024.
        if value < 1024.0 || idx + 1 >= UNITS.len() {
            break;
        }
        value /= 1024.0;
        idx += 1;
    }
    match idx {
        0 => format!("{} {}", value as u64, UNITS[0]),
        _ => format!("{value:.2} {}", UNITS[idx]),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // Size formatting across unit boundaries.
    #[test]
    fn test_format_size() {
        assert_eq!(format_size(512), "512 B");
        assert_eq!(format_size(1024), "1.00 KB");
        assert_eq!(format_size(1536), "1.50 KB");
        assert_eq!(format_size(1048576), "1.00 MB");
    }

    // Separators, dots and spaces all collapse to `-`, lowercased.
    #[test]
    fn test_path_to_anchor() {
        assert_eq!(path_to_anchor(Path::new("src/main.rs")), "src-main-rs");
        assert_eq!(path_to_anchor(Path::new("test file.txt")), "test-file-txt");
    }

    #[test]
    fn test_file_type_display() {
        assert_eq!(file_type_display(&FileType::Rust), "Rust");
        assert_eq!(file_type_display(&FileType::Python), "Python");
    }

    // Statistics report counts, total size and a per-type breakdown.
    #[test]
    fn test_generate_statistics() {
        let files = vec![
            FileInfo {
                path: PathBuf::from("test1.rs"),
                relative_path: PathBuf::from("test1.rs"),
                size: 100,
                file_type: FileType::Rust,
                priority: 1.0,
            },
            FileInfo {
                path: PathBuf::from("test2.py"),
                relative_path: PathBuf::from("test2.py"),
                size: 200,
                file_type: FileType::Python,
                priority: 0.9,
            },
        ];
        let stats = generate_statistics(&files);
        assert!(stats.contains("Total files: 2"));
        assert!(stats.contains("Total size: 300 B"));
        assert!(stats.contains("Rust: 1"));
        assert!(stats.contains("Python: 1"));
    }

    // Empty input should still produce a well-formed statistics block.
    #[test]
    fn test_generate_statistics_empty() {
        let files = vec![];
        let stats = generate_statistics(&files);
        assert!(stats.contains("Total files: 0"));
        assert!(stats.contains("Total size: 0 B"));
    }

    // Multi-megabyte totals should be reported in MB.
    #[test]
    fn test_generate_statistics_large_files() {
        let files = vec![
            FileInfo {
                path: PathBuf::from("large.rs"),
                relative_path: PathBuf::from("large.rs"),
                size: 2_000_000, // 2MB
                file_type: FileType::Rust,
                priority: 1.0,
            },
            FileInfo {
                path: PathBuf::from("huge.py"),
                relative_path: PathBuf::from("huge.py"),
                size: 50_000_000, // 50MB
                file_type: FileType::Python,
                priority: 0.9,
            },
        ];
        let stats = generate_statistics(&files);
        assert!(stats.contains("Total files: 2"));
        assert!(stats.contains("MB bytes")); // Just check that it's in MB
        assert!(stats.contains("Python: 1"));
        assert!(stats.contains("Rust: 1"));
    }

    // Nested paths produce directory nodes plus their leaf file names.
    #[test]
    fn test_generate_file_tree_with_grouping() {
        let files = vec![
            FileInfo {
                path: PathBuf::from("src/main.rs"),
                relative_path: PathBuf::from("src/main.rs"),
                size: 1000,
                file_type: FileType::Rust,
                priority: 1.5,
            },
            FileInfo {
                path: PathBuf::from("src/lib.rs"),
                relative_path: PathBuf::from("src/lib.rs"),
                size: 2000,
                file_type: FileType::Rust,
                priority: 1.2,
            },
            FileInfo {
                path: PathBuf::from("tests/test.rs"),
                relative_path: PathBuf::from("tests/test.rs"),
                size: 500,
                file_type: FileType::Rust,
                priority: 0.8,
            },
        ];
        let tree = generate_file_tree(&files);
        assert!(tree.contains("src/"));
        assert!(tree.contains("tests/"));
        assert!(tree.contains("main.rs"));
        assert!(tree.contains("lib.rs"));
        assert!(tree.contains("test.rs"));
    }

    // DigestOptions should pick up the token limit from the CLI config.
    #[test]
    fn test_digest_options_from_config() {
        use crate::cli::Config;
        use tempfile::TempDir;
        let temp_dir = TempDir::new().unwrap();
        let config = Config {
            prompt: None,
            directories: vec![temp_dir.path().to_path_buf()],
            output_file: None,
            max_tokens: Some(100000),
            llm_tool: crate::cli::LlmTool::default(),
            quiet: false,
            verbose: false,
            config: None,
            progress: false,
            repo: None,
        };
        let options = DigestOptions::from_config(&config).unwrap();
        assert_eq!(options.max_tokens, Some(100000));
        assert!(options.include_tree);
        assert!(options.include_stats);
        assert!(!options.group_by_type); // Default is false according to implementation
    }

    // The document skeleton should render even with zero files.
    #[test]
    fn test_generate_markdown_structure_headers() {
        let files = vec![];
        let options = DigestOptions {
            max_tokens: None,
            include_tree: true,
            include_stats: true,
            group_by_type: true,
            sort_by_priority: true,
            file_header_template: "## {path}".to_string(),
            doc_header_template: "# Code Digest".to_string(),
            include_toc: true,
        };
        let markdown = generate_markdown(files, options).unwrap();
        // Check that main structure is present even with no files
        assert!(markdown.contains("# Code Digest"));
        assert!(markdown.contains("## Statistics"));
        // File tree might be skipped if there are no files
        assert!(markdown.contains("## Files"));
    }
}
```
## core/mod.rs
```rust
//! Core functionality modules
pub mod digest;
pub mod prioritizer;
pub mod token;
pub mod walker;
```
## core/prioritizer.rs
```rust
//! File prioritization based on token limits
use crate::core::digest::DigestOptions;
use crate::core::token::{would_exceed_limit, TokenCounter};
use crate::core::walker::FileInfo;
use anyhow::Result;
use std::fs;
/// Prioritize files based on their importance and token limits
///
/// Files are always sorted by descending priority, with ties broken by
/// relative path so output is deterministic. When `options.max_tokens` is
/// `None`, every file is returned; otherwise files are greedily selected in
/// priority order until the token budget (minus the estimated markdown
/// structure overhead) is exhausted. Files that would overflow the budget
/// are skipped — a later, smaller file may still fit.
///
/// # Errors
///
/// Returns an error if the token counter cannot be constructed or token
/// counting fails. Unreadable files are warned about and skipped, not fatal.
pub fn prioritize_files(
    mut files: Vec<FileInfo>,
    options: &DigestOptions,
) -> Result<Vec<FileInfo>> {
    // Sort once up front — the previous implementation duplicated this exact
    // comparator in both the limited and unlimited branches.
    files.sort_by(|a, b| {
        b.priority
            .partial_cmp(&a.priority)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.relative_path.cmp(&b.relative_path))
    });

    // Without a token limit, all files are kept.
    let max_tokens = match options.max_tokens {
        Some(limit) => limit,
        None => return Ok(files),
    };

    // Create token counter
    let counter = TokenCounter::new()?;
    let mut selected_files = Vec::new();
    let mut total_tokens = 0;

    // Reserve budget for the surrounding markdown structure (header, tree,
    // statistics, table of contents).
    let structure_overhead = calculate_structure_overhead(options, &files)?;
    total_tokens += structure_overhead;

    // Add files until we hit the token limit
    for file in files {
        // Read file content
        let content = match fs::read_to_string(&file.path) {
            Ok(content) => content,
            Err(e) => {
                eprintln!("Warning: Could not read file {}: {}", file.path.display(), e);
                continue;
            }
        };
        // Count tokens for this file (content + markdown wrapper overhead)
        let file_tokens =
            counter.count_file_tokens(&content, &file.relative_path.to_string_lossy())?;
        // Skip files that would blow the budget; keep scanning for smaller ones.
        if would_exceed_limit(total_tokens, file_tokens.total_tokens, max_tokens) {
            continue;
        }
        // Add the file
        total_tokens += file_tokens.total_tokens;
        selected_files.push(file);
    }

    // Log statistics
    if options.include_stats {
        eprintln!("Token limit: {max_tokens}");
        eprintln!("Structure overhead: {structure_overhead} tokens");
        eprintln!(
            "Selected {} files with approximately {} tokens",
            selected_files.len(),
            total_tokens
        );
    }
    Ok(selected_files)
}
/// Calculate token overhead for markdown structure
///
/// Estimates (rather than exactly measures) how many tokens the non-content
/// parts of the digest consume: document header, statistics section, file
/// tree, and table of contents. The flat 200-token buffer and ~20 tokens per
/// tree entry are deliberate rough over-estimates.
fn calculate_structure_overhead(options: &DigestOptions, files: &[FileInfo]) -> Result<usize> {
    // NOTE(review): builds its own TokenCounter, so its cache is not shared
    // with the caller's counter in `prioritize_files`.
    let counter = TokenCounter::new()?;
    let mut overhead = 0;
    // Document header
    if !options.doc_header_template.is_empty() {
        let header = options.doc_header_template.replace("{directory}", ".");
        overhead += counter.count_tokens(&format!("{header}\n\n"))?;
    }
    // Statistics section
    if options.include_stats {
        // Estimate statistics section size
        let stats_estimate = format!(
            "## Statistics\n\n- Total files: {}\n- Total size: X bytes\n\n### Files by type:\n",
            files.len()
        );
        overhead += counter.count_tokens(&stats_estimate)?;
        overhead += 200; // Buffer for file type list
    }
    // File tree
    if options.include_tree {
        overhead += counter.count_tokens("## File Structure\n\n```\n")?;
        // Estimate tree size (rough approximation)
        overhead += files.len() * 20; // ~20 tokens per file in tree
        overhead += counter.count_tokens("```\n\n")?;
    }
    // Table of contents
    if options.include_toc {
        overhead += counter.count_tokens("## Table of Contents\n\n")?;
        for file in files {
            let toc_line = format!("- [{}](#anchor)\n", file.relative_path.display());
            overhead += counter.count_tokens(&toc_line)?;
        }
        overhead += counter.count_tokens("\n")?;
    }
    Ok(overhead)
}
/// Group files by directory for better organization
///
/// Buckets files by the parent of their relative path (`.` for paths with no
/// parent), sorts the buckets by directory name, and sorts each bucket by
/// descending priority with a path tie-break.
pub fn group_by_directory(files: Vec<FileInfo>) -> Vec<(String, Vec<FileInfo>)> {
    use std::collections::HashMap;
    let mut by_dir: HashMap<String, Vec<FileInfo>> = HashMap::new();
    for info in files {
        let key = match info.relative_path.parent() {
            Some(parent) => parent.to_string_lossy().to_string(),
            None => ".".to_string(),
        };
        by_dir.entry(key).or_default().push(info);
    }
    let mut grouped: Vec<(String, Vec<FileInfo>)> = by_dir.into_iter().collect();
    grouped.sort_by(|left, right| left.0.cmp(&right.0));
    // Highest-priority files first within each directory.
    for (_, dir_files) in grouped.iter_mut() {
        dir_files.sort_by(|a, b| {
            b.priority
                .partial_cmp(&a.priority)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.relative_path.cmp(&b.relative_path))
        });
    }
    grouped
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::file_ext::FileType;
    use std::path::PathBuf;

    // With no token limit, all files come back sorted by priority.
    #[test]
    fn test_prioritize_without_limit() {
        let files = vec![
            FileInfo {
                path: PathBuf::from("low.txt"),
                relative_path: PathBuf::from("low.txt"),
                size: 100,
                file_type: FileType::Text,
                priority: 0.3,
            },
            FileInfo {
                path: PathBuf::from("high.rs"),
                relative_path: PathBuf::from("high.rs"),
                size: 100,
                file_type: FileType::Rust,
                priority: 1.0,
            },
        ];
        let options = DigestOptions::default();
        let result = prioritize_files(files, &options).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].relative_path, PathBuf::from("high.rs"));
        assert_eq!(result[1].relative_path, PathBuf::from("low.txt"));
    }

    // Files sharing a parent directory land in the same (sorted) group.
    #[test]
    fn test_group_by_directory() {
        let files = vec![
            FileInfo {
                path: PathBuf::from("src/main.rs"),
                relative_path: PathBuf::from("src/main.rs"),
                size: 100,
                file_type: FileType::Rust,
                priority: 1.0,
            },
            FileInfo {
                path: PathBuf::from("src/lib.rs"),
                relative_path: PathBuf::from("src/lib.rs"),
                size: 100,
                file_type: FileType::Rust,
                priority: 1.0,
            },
            FileInfo {
                path: PathBuf::from("tests/test.rs"),
                relative_path: PathBuf::from("tests/test.rs"),
                size: 100,
                file_type: FileType::Rust,
                priority: 0.8,
            },
        ];
        let groups = group_by_directory(files);
        assert_eq!(groups.len(), 2);
        assert_eq!(groups[0].0, "src");
        assert_eq!(groups[0].1.len(), 2);
        assert_eq!(groups[1].0, "tests");
        assert_eq!(groups[1].1.len(), 1);
    }

    // Ordering is strictly by descending priority, regardless of input order.
    #[test]
    fn test_prioritize_algorithm_ordering() {
        let files = vec![
            FileInfo {
                path: PathBuf::from("test.rs"),
                relative_path: PathBuf::from("test.rs"),
                size: 500,
                file_type: FileType::Rust,
                priority: 0.8,
            },
            FileInfo {
                path: PathBuf::from("main.rs"),
                relative_path: PathBuf::from("main.rs"),
                size: 1000,
                file_type: FileType::Rust,
                priority: 1.5,
            },
            FileInfo {
                path: PathBuf::from("lib.rs"),
                relative_path: PathBuf::from("lib.rs"),
                size: 800,
                file_type: FileType::Rust,
                priority: 1.2,
            },
        ];
        let options = DigestOptions::default();
        let result = prioritize_files(files, &options).unwrap();
        // Should return all files when no limit
        assert_eq!(result.len(), 3);
        // Files should be sorted by priority (highest first)
        assert_eq!(result[0].relative_path, PathBuf::from("main.rs"));
        assert_eq!(result[1].relative_path, PathBuf::from("lib.rs"));
        assert_eq!(result[2].relative_path, PathBuf::from("test.rs"));
    }

    // Overhead estimate should be positive but bounded for a tiny input.
    #[test]
    fn test_calculate_structure_overhead() {
        let files = vec![FileInfo {
            path: PathBuf::from("main.rs"),
            relative_path: PathBuf::from("main.rs"),
            size: 1000,
            file_type: FileType::Rust,
            priority: 1.5,
        }];
        let options = DigestOptions {
            max_tokens: None,
            include_tree: true,
            include_stats: true,
            group_by_type: true,
            sort_by_priority: true,
            file_header_template: "## {path}".to_string(),
            doc_header_template: "# Code Digest".to_string(),
            include_toc: true,
        };
        let overhead = calculate_structure_overhead(&options, &files).unwrap();
        // Should account for headers, tree, stats, TOC
        assert!(overhead > 0);
        assert!(overhead < 10000); // Reasonable upper bound
    }

    // Sanity-check the raw comparator used throughout this module.
    #[test]
    fn test_priority_ordering() {
        let mut files = [
            FileInfo {
                path: PathBuf::from("test.rs"),
                relative_path: PathBuf::from("test.rs"),
                size: 500,
                file_type: FileType::Rust,
                priority: 0.8,
            },
            FileInfo {
                path: PathBuf::from("main.rs"),
                relative_path: PathBuf::from("main.rs"),
                size: 1000,
                file_type: FileType::Rust,
                priority: 1.5,
            },
            FileInfo {
                path: PathBuf::from("lib.rs"),
                relative_path: PathBuf::from("lib.rs"),
                size: 800,
                file_type: FileType::Rust,
                priority: 1.2,
            },
        ];
        // Sort by priority (highest first)
        files.sort_by(|a, b| b.priority.partial_cmp(&a.priority).unwrap());
        assert_eq!(files[0].relative_path, PathBuf::from("main.rs"));
        assert_eq!(files[1].relative_path, PathBuf::from("lib.rs"));
        assert_eq!(files[2].relative_path, PathBuf::from("test.rs"));
    }

    // Mixed depths: root-level files plus nested directories.
    #[test]
    fn test_group_by_directory_complex() {
        let files = vec![
            FileInfo {
                path: PathBuf::from("src/core/mod.rs"),
                relative_path: PathBuf::from("src/core/mod.rs"),
                size: 500,
                file_type: FileType::Rust,
                priority: 1.0,
            },
            FileInfo {
                path: PathBuf::from("src/utils/helpers.rs"),
                relative_path: PathBuf::from("src/utils/helpers.rs"),
                size: 300,
                file_type: FileType::Rust,
                priority: 0.9,
            },
            FileInfo {
                path: PathBuf::from("tests/integration.rs"),
                relative_path: PathBuf::from("tests/integration.rs"),
                size: 200,
                file_type: FileType::Rust,
                priority: 0.8,
            },
            FileInfo {
                path: PathBuf::from("main.rs"),
                relative_path: PathBuf::from("main.rs"),
                size: 1000,
                file_type: FileType::Rust,
                priority: 1.5,
            },
        ];
        let grouped = group_by_directory(files);
        // Should have at least 3 groups
        assert!(grouped.len() >= 3);
        // Check that files are correctly grouped by directory
        let has_root_or_main = grouped.iter().any(|(dir, files)| {
            (dir == "." || dir.is_empty())
                && files.iter().any(|f| f.relative_path == PathBuf::from("main.rs"))
        });
        assert!(has_root_or_main);
        let has_src_core = grouped.iter().any(|(dir, files)| {
            dir == "src/core"
                && files.iter().any(|f| f.relative_path == PathBuf::from("src/core/mod.rs"))
        });
        assert!(has_src_core);
    }
}
```
## core/token.rs
```rust
//! Token counting functionality using tiktoken-rs
use anyhow::Result;
use rayon::prelude::*;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tiktoken_rs::{cl100k_base, CoreBPE};
/// Token counter with caching support
///
/// Wraps a tiktoken `CoreBPE` encoder and memoizes token counts keyed by a
/// hash of the input text, so repeated counts of identical content are free.
/// Both fields are behind `Arc`, making the counter cheap to clone and share
/// across threads.
pub struct TokenCounter {
    /// The tiktoken encoder
    encoder: Arc<CoreBPE>,
    /// Cache of token counts for content hashes
    cache: Arc<Mutex<HashMap<u64, usize>>>,
}
impl TokenCounter {
    /// Create a new token counter with cl100k_base encoding (GPT-4)
    ///
    /// # Errors
    ///
    /// Returns an error if the cl100k_base vocabulary cannot be loaded.
    pub fn new() -> Result<Self> {
        let encoder = cl100k_base()?;
        Ok(TokenCounter { encoder: Arc::new(encoder), cache: Arc::new(Mutex::new(HashMap::new())) })
    }
    /// Count tokens in a single text
    ///
    /// Results are cached by content hash. A poisoned cache mutex is silently
    /// ignored on both read and write: counting then proceeds uncached rather
    /// than panicking.
    pub fn count_tokens(&self, text: &str) -> Result<usize> {
        // Calculate hash for caching
        let hash = calculate_hash(text);
        // Check cache first
        if let Ok(cache) = self.cache.lock() {
            if let Some(&count) = cache.get(&hash) {
                return Ok(count);
            }
        }
        // Count tokens
        let tokens = self.encoder.encode_with_special_tokens(text);
        let count = tokens.len();
        // Store in cache
        if let Ok(mut cache) = self.cache.lock() {
            cache.insert(hash, count);
        }
        Ok(count)
    }
    /// Count tokens in multiple texts in parallel
    ///
    /// Uses rayon. The shared cache is locked per lookup, so heavy contention
    /// is possible when counting many small texts.
    pub fn count_tokens_parallel(&self, texts: &[String]) -> Result<Vec<usize>> {
        texts.par_iter().map(|text| self.count_tokens(text)).collect()
    }
    /// Count tokens for a file's content with metadata
    ///
    /// Adds the markdown header/footer overhead that the digest emits around
    /// each file, so `total_tokens` reflects the real cost of inclusion.
    pub fn count_file_tokens(&self, content: &str, path: &str) -> Result<FileTokenCount> {
        let content_tokens = self.count_tokens(content)?;
        // Count tokens in the file path/header that will be included in markdown
        let header = format!("## {path}\n\n```\n");
        let footer = "\n```\n\n";
        let header_tokens = self.count_tokens(&header)?;
        let footer_tokens = self.count_tokens(footer)?;
        Ok(FileTokenCount {
            content_tokens,
            overhead_tokens: header_tokens + footer_tokens,
            total_tokens: content_tokens + header_tokens + footer_tokens,
        })
    }
    /// Estimate tokens for multiple files
    ///
    /// `files` is a slice of `(path, content)` pairs; per-file counts are
    /// returned alongside the aggregated totals.
    pub fn estimate_total_tokens(&self, files: &[(String, String)]) -> Result<TotalTokenEstimate> {
        let mut total_content = 0;
        let mut total_overhead = 0;
        let mut file_counts = Vec::new();
        for (path, content) in files {
            let count = self.count_file_tokens(content, path)?;
            total_content += count.content_tokens;
            total_overhead += count.overhead_tokens;
            file_counts.push((path.clone(), count));
        }
        Ok(TotalTokenEstimate {
            total_tokens: total_content + total_overhead,
            content_tokens: total_content,
            overhead_tokens: total_overhead,
            file_counts,
        })
    }
}
impl Default for TokenCounter {
    /// Panics if the encoder cannot be constructed; prefer `new()` where a
    /// recoverable error is needed.
    fn default() -> Self {
        Self::new().expect("Failed to create token counter")
    }
}
/// Token count for a single file
///
/// Invariant: `total_tokens == content_tokens + overhead_tokens`
/// (maintained by `TokenCounter::count_file_tokens`).
#[derive(Debug, Clone)]
pub struct FileTokenCount {
    /// Tokens in the file content
    pub content_tokens: usize,
    /// Tokens in markdown formatting overhead
    pub overhead_tokens: usize,
    /// Total tokens (content + overhead)
    pub total_tokens: usize,
}
/// Total token estimate for multiple files
///
/// Aggregate of per-file counts; `total_tokens` is the sum of
/// `content_tokens` and `overhead_tokens`.
#[derive(Debug)]
pub struct TotalTokenEstimate {
    /// Total tokens across all files
    pub total_tokens: usize,
    /// Total content tokens
    pub content_tokens: usize,
    /// Total overhead tokens
    pub overhead_tokens: usize,
    /// Individual file counts, keyed by path
    pub file_counts: Vec<(String, FileTokenCount)>,
}
/// Calculate a hash for content caching
///
/// Uses the standard library's `DefaultHasher` (SipHash); stable within a
/// process, which is all the in-memory cache needs.
fn calculate_hash(text: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut state = DefaultHasher::default();
    Hash::hash(text, &mut state);
    Hasher::finish(&state)
}
/// Check if adding a file would exceed token limit
///
/// Uses `checked_sub` so the comparison cannot overflow: the previous
/// `current_tokens + file_tokens > max_tokens` wraps in release builds when
/// the sum exceeds `usize::MAX`, silently returning the wrong answer.
/// If the budget is already exhausted (`current_tokens > max_tokens`), any
/// addition is reported as exceeding.
pub fn would_exceed_limit(current_tokens: usize, file_tokens: usize, max_tokens: usize) -> bool {
    match max_tokens.checked_sub(current_tokens) {
        Some(remaining) => file_tokens > remaining,
        None => true, // already over budget
    }
}
/// Calculate remaining token budget
///
/// Never underflows: a budget that is already spent (or overspent) reports
/// zero remaining tokens.
pub fn remaining_tokens(current_tokens: usize, max_tokens: usize) -> usize {
    if current_tokens >= max_tokens {
        0
    } else {
        max_tokens - current_tokens
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Basic counting, empty input, and cache determinism.
    #[test]
    fn test_token_counting() {
        let counter = TokenCounter::new().unwrap();
        // Test simple text
        let count = counter.count_tokens("Hello, world!").unwrap();
        assert!(count > 0);
        // Test empty text
        let count = counter.count_tokens("").unwrap();
        assert_eq!(count, 0);
        // Test caching
        let text = "This is a test text for caching";
        let count1 = counter.count_tokens(text).unwrap();
        let count2 = counter.count_tokens(text).unwrap();
        assert_eq!(count1, count2);
    }

    // Per-file counts include markdown wrapper overhead.
    #[test]
    fn test_file_token_counting() {
        let counter = TokenCounter::new().unwrap();
        let content = "fn main() {\n    println!(\"Hello, world!\");\n}";
        let path = "src/main.rs";
        let count = counter.count_file_tokens(content, path).unwrap();
        assert!(count.content_tokens > 0);
        assert!(count.overhead_tokens > 0);
        assert_eq!(count.total_tokens, count.content_tokens + count.overhead_tokens);
    }

    // Parallel counting yields one positive count per input.
    #[test]
    fn test_parallel_counting() {
        let counter = TokenCounter::new().unwrap();
        let texts =
            vec!["First text".to_string(), "Second text".to_string(), "Third text".to_string()];
        let counts = counter.count_tokens_parallel(&texts).unwrap();
        assert_eq!(counts.len(), 3);
        assert!(counts.iter().all(|&c| c > 0));
    }

    // Budget helpers at and around the boundary.
    #[test]
    fn test_token_limit_checks() {
        assert!(would_exceed_limit(900, 200, 1000));
        assert!(!would_exceed_limit(800, 200, 1000));
        assert_eq!(remaining_tokens(300, 1000), 700);
        assert_eq!(remaining_tokens(1100, 1000), 0);
    }

    // Aggregate estimate over multiple files.
    #[test]
    fn test_total_estimation() {
        let counter = TokenCounter::new().unwrap();
        let files = vec![
            ("file1.rs".to_string(), "content1".to_string()),
            ("file2.rs".to_string(), "content2".to_string()),
        ];
        let estimate = counter.estimate_total_tokens(&files).unwrap();
        assert!(estimate.total_tokens > 0);
        assert_eq!(estimate.file_counts.len(), 2);
    }
}
```
## core/walker.rs
```rust
//! Directory walking functionality with .gitignore and .digestignore support
use crate::utils::error::CodeDigestError;
use crate::utils::file_ext::FileType;
use anyhow::Result;
use ignore::{Walk, WalkBuilder};
use rayon::prelude::*;
use std::path::{Path, PathBuf};
use std::sync::Arc;
/// Options for walking directories
///
/// Controls filtering (size cap, hidden files, ignore files and patterns),
/// symlink handling, and whether files are processed in parallel. See the
/// `Default` impl for the library defaults.
#[derive(Debug, Clone)]
pub struct WalkOptions {
    /// Maximum file size in bytes
    pub max_file_size: Option<usize>,
    /// Follow symbolic links
    pub follow_links: bool,
    /// Include hidden files
    pub include_hidden: bool,
    /// Use parallel processing
    pub parallel: bool,
    /// Custom ignore file name (default: .digestignore)
    pub ignore_file: String,
    /// Additional glob patterns to ignore
    pub ignore_patterns: Vec<String>,
    /// Only include files matching these patterns
    pub include_patterns: Vec<String>,
}
impl WalkOptions {
    /// Create WalkOptions from CLI config
    ///
    /// The CLI config currently carries no walker-related settings (the
    /// parameter is intentionally unused), so this returns the library
    /// defaults. Delegating to `Self::default()` removes the previous
    /// field-by-field duplication of every default value, which could
    /// silently drift from the `Default` impl.
    pub fn from_config(_config: &crate::cli::Config) -> Result<Self> {
        Ok(Self::default())
    }
}
impl Default for WalkOptions {
    /// Library defaults: 10MB size cap, no symlink following, no hidden
    /// files, parallel walking, `.digestignore` as the custom ignore file,
    /// and no extra ignore/include patterns.
    fn default() -> Self {
        WalkOptions {
            max_file_size: Some(10 * 1024 * 1024), // 10MB
            follow_links: false,
            include_hidden: false,
            parallel: true,
            ignore_file: ".digestignore".to_string(),
            ignore_patterns: vec![],
            include_patterns: vec![],
        }
    }
}
/// Information about a file found during walking
///
/// Produced by `process_file`; `priority` is the heuristic score computed by
/// `calculate_priority` (higher is more important, capped at 2.0).
#[derive(Debug, Clone)]
pub struct FileInfo {
    /// Absolute path to the file
    pub path: PathBuf,
    /// Relative path from the root directory
    pub relative_path: PathBuf,
    /// File size in bytes
    pub size: u64,
    /// File type based on extension
    pub file_type: FileType,
    /// Priority score (higher is more important)
    pub priority: f32,
}
impl FileInfo {
/// Get a display string for the file type
pub fn file_type_display(&self) -> &'static str {
use crate::utils::file_ext::FileType;
match self.file_type {
FileType::Rust => "Rust",
FileType::Python => "Python",
FileType::JavaScript => "JavaScript",
FileType::TypeScript => "TypeScript",
FileType::Go => "Go",
FileType::Java => "Java",
FileType::Cpp => "C++",
FileType::C => "C",
FileType::CSharp => "C#",
FileType::Ruby => "Ruby",
FileType::Php => "PHP",
FileType::Swift => "Swift",
FileType::Kotlin => "Kotlin",
FileType::Scala => "Scala",
FileType::Haskell => "Haskell",
FileType::Markdown => "Markdown",
FileType::Json => "JSON",
FileType::Yaml => "YAML",
FileType::Toml => "TOML",
FileType::Xml => "XML",
FileType::Html => "HTML",
FileType::Css => "CSS",
FileType::Text => "Text",
FileType::Other => "Other",
}
}
}
/// Walk a directory and collect file information
///
/// Validates that `root` exists and is a directory, canonicalizes it, then
/// dispatches to the parallel or sequential walker per `options.parallel`.
///
/// # Errors
///
/// Returns `CodeDigestError::InvalidPath` for a missing or non-directory
/// root, or any error from canonicalization and the walk itself.
pub fn walk_directory(root: &Path, options: WalkOptions) -> Result<Vec<FileInfo>> {
    // Guard clauses: reject paths we cannot possibly walk.
    if !root.exists() {
        return Err(CodeDigestError::InvalidPath(format!(
            "Directory does not exist: {}",
            root.display()
        ))
        .into());
    }
    if !root.is_dir() {
        return Err(CodeDigestError::InvalidPath(format!(
            "Path is not a directory: {}",
            root.display()
        ))
        .into());
    }
    let canonical = root.canonicalize()?;
    let walker = build_walker(&canonical, &options);
    match options.parallel {
        true => walk_parallel(walker, &canonical, &options),
        false => walk_sequential(walker, &canonical, &options),
    }
}
/// Build the ignore walker with configured options
///
/// Respects .gitignore (local, global, exclude), generic `.ignore` files,
/// parent-directory ignore files, and the custom ignore file name from
/// `options` (default `.digestignore`).
///
/// NOTE(review): `WalkBuilder::add_ignore` expects a *path to an ignore
/// file*, not a glob pattern, so passing `ignore_patterns` and `!`-prefixed
/// `include_patterns` here likely has no filtering effect (the unit tests
/// hedge about exactly this). The `ignore` crate's `OverrideBuilder` is the
/// intended API for per-pattern filtering — TODO confirm against the crate
/// docs and migrate. Errors from `add_ignore` are deliberately discarded.
fn build_walker(root: &Path, options: &WalkOptions) -> Walk {
    let mut builder = WalkBuilder::new(root);
    // Configure the walker
    builder
        .follow_links(options.follow_links)
        .hidden(!options.include_hidden)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .ignore(true)
        .parents(true)
        .add_custom_ignore_filename(&options.ignore_file);
    // Add custom ignore patterns
    for pattern in &options.ignore_patterns {
        let _ = builder.add_ignore(pattern);
    }
    // Add include patterns (as negative ignore patterns)
    for pattern in &options.include_patterns {
        let _ = builder.add_ignore(format!("!{pattern}"));
    }
    builder.build()
}
/// Walk directory sequentially
///
/// Iterates the walker on the current thread, skipping directories and
/// propagating both walker errors and `process_file` errors to the caller.
fn walk_sequential(walker: Walk, root: &Path, options: &WalkOptions) -> Result<Vec<FileInfo>> {
    let mut collected = Vec::new();
    for entry_result in walker {
        let dir_entry = entry_result?;
        let entry_path = dir_entry.path();
        // Only files are of interest; directories are traversal scaffolding.
        if entry_path.is_dir() {
            continue;
        }
        if let Some(info) = process_file(entry_path, root, options)? {
            collected.push(info);
        }
    }
    Ok(collected)
}
/// Walk directory in parallel
///
/// Entries are gathered sequentially (the `ignore` walker itself is serial),
/// then processed with rayon. Unlike the previous implementation — which
/// discarded `process_file` errors via `.ok().flatten()` — errors are now
/// propagated to the caller, matching `walk_sequential`'s behavior.
/// Walker-level entry errors are still skipped, as before.
fn walk_parallel(walker: Walk, root: &Path, options: &WalkOptions) -> Result<Vec<FileInfo>> {
    let root = Arc::new(root.to_path_buf());
    let options = Arc::new(options.clone());
    // Collect entries first
    let entries: Vec<_> = walker.filter_map(|e| e.ok()).filter(|e| !e.path().is_dir()).collect();
    // Process in parallel, short-circuiting on the first error.
    let processed: Result<Vec<Option<FileInfo>>> = entries
        .into_par_iter()
        .map(|entry| process_file(entry.path(), &root, &options))
        .collect();
    Ok(processed?.into_iter().flatten().collect())
}
/// Process a single file
///
/// Returns `Ok(None)` for files that should be skipped (unreadable metadata
/// or over the configured size limit); otherwise builds a `FileInfo` with a
/// heuristic priority score.
fn process_file(path: &Path, root: &Path, options: &WalkOptions) -> Result<Option<FileInfo>> {
    // Unreadable files are skipped rather than treated as errors.
    let Ok(metadata) = std::fs::metadata(path) else {
        return Ok(None);
    };
    let size = metadata.len();
    // Enforce the configured size cap, if any.
    if matches!(options.max_file_size, Some(max) if size > max as u64) {
        return Ok(None);
    }
    // Fall back to the absolute path if `path` is not under `root`.
    let relative_path = path.strip_prefix(root).unwrap_or(path).to_path_buf();
    let file_type = FileType::from_path(path);
    let priority = calculate_priority(&file_type, &relative_path);
    Ok(Some(FileInfo {
        path: path.to_path_buf(),
        relative_path,
        size,
        file_type,
        priority,
    }))
}
/// Calculate priority score for a file
///
/// Starts from a per-type base score, then applies multiplicative boosts for
/// entry points and core sources, demotions for tests/examples, and a bonus
/// for config files in the project root. The result is capped at 2.0.
fn calculate_priority(file_type: &FileType, relative_path: &Path) -> f32 {
    // Base score from the file's language/type (arms grouped by value).
    let base: f32 = match file_type {
        FileType::Rust => 1.0,
        FileType::TypeScript => 0.95,
        FileType::Python | FileType::JavaScript | FileType::Go => 0.9,
        FileType::Java
        | FileType::Cpp
        | FileType::CSharp
        | FileType::Swift
        | FileType::Kotlin => 0.85,
        FileType::C | FileType::Ruby | FileType::Scala => 0.8,
        FileType::Php | FileType::Haskell => 0.75,
        FileType::Markdown => 0.6,
        FileType::Json | FileType::Yaml | FileType::Toml => 0.5,
        FileType::Xml | FileType::Html | FileType::Css => 0.4,
        FileType::Text => 0.3,
        FileType::Other => 0.2,
    };
    let mut score = base;
    let lower = relative_path.to_string_lossy().to_lowercase();
    // Entry points and core source paths get boosted.
    if lower.contains("main") || lower.contains("index") {
        score *= 1.5;
    }
    if lower.contains("lib") || lower.contains("src") {
        score *= 1.2;
    }
    // Tests and examples are demoted.
    if lower.contains("test") || lower.contains("spec") {
        score *= 0.8;
    }
    if lower.contains("example") || lower.contains("sample") {
        score *= 0.7;
    }
    // Config files sitting in the project root are extra relevant.
    let in_root =
        relative_path.parent().is_none() || relative_path.parent() == Some(Path::new(""));
    if in_root && matches!(file_type, FileType::Toml | FileType::Yaml | FileType::Json) {
        score *= 1.3;
    }
    score.min(2.0) // Cap maximum score
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::{self, File};
use tempfile::TempDir;
#[test]
fn test_walk_directory_basic() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
// Create test files
File::create(root.join("main.rs")).unwrap();
File::create(root.join("lib.rs")).unwrap();
fs::create_dir(root.join("src")).unwrap();
File::create(root.join("src/utils.rs")).unwrap();
let options = WalkOptions::default();
let files = walk_directory(root, options).unwrap();
assert_eq!(files.len(), 3);
assert!(files.iter().any(|f| f.relative_path == PathBuf::from("main.rs")));
assert!(files.iter().any(|f| f.relative_path == PathBuf::from("lib.rs")));
assert!(files.iter().any(|f| f.relative_path == PathBuf::from("src/utils.rs")));
}
#[test]
fn test_walk_with_digestignore() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
// Create test files
File::create(root.join("main.rs")).unwrap();
File::create(root.join("ignored.rs")).unwrap();
// Create .digestignore
fs::write(root.join(".digestignore"), "ignored.rs").unwrap();
let options = WalkOptions::default();
let files = walk_directory(root, options).unwrap();
assert_eq!(files.len(), 1);
assert_eq!(files[0].relative_path, PathBuf::from("main.rs"));
}
#[test]
fn test_priority_calculation() {
let rust_priority = calculate_priority(&FileType::Rust, Path::new("src/main.rs"));
let test_priority = calculate_priority(&FileType::Rust, Path::new("tests/test.rs"));
let doc_priority = calculate_priority(&FileType::Markdown, Path::new("README.md"));
assert!(rust_priority > doc_priority);
assert!(rust_priority > test_priority);
}
#[test]
fn test_file_size_limit() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
// Create a large file
let large_file = root.join("large.txt");
let data = vec![0u8; 1024 * 1024]; // 1MB
fs::write(&large_file, &data).unwrap();
// Create a small file
File::create(root.join("small.txt")).unwrap();
let options = WalkOptions {
max_file_size: Some(512 * 1024), // 512KB limit
..Default::default()
};
let files = walk_directory(root, options).unwrap();
assert_eq!(files.len(), 1);
assert_eq!(files[0].relative_path, PathBuf::from("small.txt"));
}
#[test]
fn test_walk_empty_directory() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
let options = WalkOptions::default();
let files = walk_directory(root, options).unwrap();
assert_eq!(files.len(), 0);
}
#[test]
fn test_walk_options_from_config() {
use crate::cli::Config;
use tempfile::TempDir;
let temp_dir = TempDir::new().unwrap();
let config = Config {
prompt: None,
directories: vec![temp_dir.path().to_path_buf()],
output_file: None,
max_tokens: None,
llm_tool: crate::cli::LlmTool::default(),
quiet: false,
verbose: false,
config: None,
progress: false,
repo: None,
};
let options = WalkOptions::from_config(&config).unwrap();
assert_eq!(options.max_file_size, Some(10 * 1024 * 1024));
assert!(!options.follow_links);
assert!(!options.include_hidden);
assert!(options.parallel);
assert_eq!(options.ignore_file, ".digestignore");
}
#[test]
fn test_walk_with_custom_options() {
    let temp_dir = TempDir::new().unwrap();
    let root = temp_dir.path();

    for name in ["main.rs", "test.rs", "readme.md"] {
        File::create(root.join(name)).unwrap();
    }

    let options =
        WalkOptions { ignore_patterns: vec!["*.md".to_string()], ..Default::default() };
    let files = walk_directory(root, options).unwrap();

    // Should find all files (ignore patterns may not work exactly as expected in this test environment)
    assert!(files.len() >= 2);
    for expected in ["main.rs", "test.rs"] {
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from(expected)));
    }
}
#[test]
fn test_walk_with_include_patterns() {
    let temp_dir = TempDir::new().unwrap();
    let root = temp_dir.path();

    for name in ["main.rs", "lib.rs", "README.md"] {
        File::create(root.join(name)).unwrap();
    }

    let options =
        WalkOptions { include_patterns: vec!["*.rs".to_string()], ..Default::default() };
    let files = walk_directory(root, options).unwrap();

    // Should include all files since include patterns are implemented as negative ignore patterns
    assert!(files.len() >= 2);
    for expected in ["main.rs", "lib.rs"] {
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from(expected)));
    }
}
#[test]
fn test_walk_subdirectories() {
    let temp_dir = TempDir::new().unwrap();
    let root = temp_dir.path();

    // Build a nested tree: main.rs, src/lib.rs, src/utils/helpers.rs.
    fs::create_dir_all(root.join("src").join("utils")).unwrap();
    File::create(root.join("main.rs")).unwrap();
    File::create(root.join("src").join("lib.rs")).unwrap();
    File::create(root.join("src").join("utils").join("helpers.rs")).unwrap();

    let files = walk_directory(root, WalkOptions::default()).unwrap();

    // All three files are discovered regardless of nesting depth.
    assert_eq!(files.len(), 3);
    for expected in ["main.rs", "src/lib.rs", "src/utils/helpers.rs"] {
        assert!(files.iter().any(|f| f.relative_path == PathBuf::from(expected)));
    }
}
#[test]
fn test_priority_edge_cases() {
    // Entry points outrank library files, even when nested one level down.
    let p_main = calculate_priority(&FileType::Rust, Path::new("main.rs"));
    let p_lib = calculate_priority(&FileType::Rust, Path::new("lib.rs"));
    let p_nested_main = calculate_priority(&FileType::Rust, Path::new("src/main.rs"));
    assert!(p_main > p_lib);
    assert!(p_nested_main > p_lib);

    // A root-level manifest outranks a nested config file of the same type.
    let p_root_toml = calculate_priority(&FileType::Toml, Path::new("Cargo.toml"));
    let p_nested_toml = calculate_priority(&FileType::Toml, Path::new("config/app.toml"));
    assert!(p_root_toml > p_nested_toml);
}
#[test]
fn test_file_info_file_type_display() {
    // A Rust source file displays as "Rust".
    let rust_info = FileInfo {
        path: PathBuf::from("test.rs"),
        relative_path: PathBuf::from("test.rs"),
        size: 1000,
        file_type: FileType::Rust,
        priority: 1.0,
    };
    assert_eq!(rust_info.file_type_display(), "Rust");

    // A Markdown file displays as "Markdown".
    let md_info = FileInfo {
        path: PathBuf::from("README.md"),
        relative_path: PathBuf::from("README.md"),
        size: 500,
        file_type: FileType::Markdown,
        priority: 0.6,
    };
    assert_eq!(md_info.file_type_display(), "Markdown");
}
}
```
## remote.rs
```rust
//! Remote repository fetching functionality
use crate::utils::error::CodeDigestError;
use std::path::PathBuf;
use std::process::Command;
use tempfile::TempDir;
#[cfg(unix)]
use std::fs;
/// Check if gh CLI is available
///
/// Runs `gh --version`; a failure to spawn the process (e.g. the binary
/// is not on PATH) is treated the same as a non-zero exit: not available.
pub fn gh_available() -> bool {
    match Command::new("gh").arg("--version").output() {
        Ok(output) => output.status.success(),
        Err(_) => false,
    }
}
/// Check if git is available
///
/// Runs `git --version`; spawn failures and non-zero exits both report
/// the tool as unavailable.
pub fn git_available() -> bool {
    Command::new("git")
        .arg("--version")
        .output()
        .map_or(false, |output| output.status.success())
}
/// Parse GitHub URL to extract owner and repo
///
/// Accepts `https://github.com/<owner>/<repo>` and the `http://` variant,
/// with or without a trailing slash. Extra path segments after the repo
/// (e.g. `/tree/main`) are ignored, matching the previous behavior.
///
/// # Errors
///
/// Returns [`CodeDigestError::InvalidConfiguration`] when the URL does not
/// use a github.com prefix, or when the owner or repository component is
/// missing or empty (e.g. `https://github.com//repo`), which previously
/// slipped through and produced broken clone specs downstream.
pub fn parse_github_url(url: &str) -> Result<(String, String), CodeDigestError> {
    let url = url.trim_end_matches('/');

    // Both schemes share one parsing path; strip whichever prefix matches.
    let rest = url
        .strip_prefix("https://github.com/")
        .or_else(|| url.strip_prefix("http://github.com/"))
        .ok_or_else(|| {
            CodeDigestError::InvalidConfiguration(
                "URL must start with https://github.com/ or http://github.com/".to_string(),
            )
        })?;

    let parts: Vec<&str> = rest.split('/').collect();

    // Require two non-empty leading components: owner and repo.
    if parts.len() < 2 || parts[0].is_empty() || parts[1].is_empty() {
        return Err(CodeDigestError::InvalidConfiguration(
            "GitHub URL must contain owner and repository name".to_string(),
        ));
    }

    Ok((parts[0].to_string(), parts[1].to_string()))
}
/// Fetch a repository from GitHub
///
/// Creates a fresh temp directory (owner-only permissions on Unix) and
/// shallow-clones the repository into it, preferring the gh CLI and
/// falling back to plain git. The returned `TempDir` owns the checkout;
/// dropping it removes the clone.
pub fn fetch_repository(repo_url: &str, verbose: bool) -> Result<TempDir, CodeDigestError> {
    let (owner, repo) = parse_github_url(repo_url)?;

    let temp_dir = TempDir::new().map_err(|e| {
        CodeDigestError::RemoteFetchError(format!("Failed to create temp directory: {e}"))
    })?;

    // Set secure permissions on temp directory (0700)
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let metadata = fs::metadata(temp_dir.path()).map_err(|e| {
            CodeDigestError::RemoteFetchError(format!("Failed to get temp directory metadata: {e}"))
        })?;
        let mut perms = metadata.permissions();
        perms.set_mode(0o700);
        fs::set_permissions(temp_dir.path(), perms).map_err(|e| {
            CodeDigestError::RemoteFetchError(format!(
                "Failed to set temp directory permissions: {e}"
            ))
        })?;
    }

    if verbose {
        eprintln!("📥 Fetching repository: {owner}/{repo}");
    }

    // Prefer gh when present; otherwise fall back to plain git.
    let cloned = if gh_available() {
        if verbose {
            eprintln!("🔧 Using gh CLI for optimal performance");
        }
        clone_with_gh(&owner, &repo, temp_dir.path(), verbose)?
    } else if git_available() {
        if verbose {
            eprintln!("🔧 Using git clone (gh CLI not available)");
        }
        clone_with_git(repo_url, temp_dir.path(), verbose)?
    } else {
        return Err(CodeDigestError::RemoteFetchError(
            "Neither gh CLI nor git is available. Please install one of them.".to_string(),
        ));
    };

    if !cloned {
        return Err(CodeDigestError::RemoteFetchError("Failed to clone repository".to_string()));
    }

    if verbose {
        eprintln!("✅ Repository fetched successfully");
    }
    Ok(temp_dir)
}
/// Clone repository using gh CLI
///
/// Shallow-clones `<owner>/<repo>` into `<target_dir>/<repo>`. Arguments
/// after `--` are forwarded by gh to the underlying `git clone`.
/// Returns `Ok(true)` on a zero exit status.
fn clone_with_gh(
    owner: &str,
    repo: &str,
    target_dir: &std::path::Path,
    verbose: bool,
) -> Result<bool, CodeDigestError> {
    let repo_spec = format!("{owner}/{repo}");

    if verbose {
        eprintln!("🔄 Running: gh repo clone {repo_spec} --depth 1");
    }

    let output = Command::new("gh")
        .arg("repo")
        .arg("clone")
        .arg(&repo_spec)
        .arg(target_dir.join(repo))
        .arg("--")
        .arg("--depth")
        .arg("1")
        .output()
        .map_err(|e| CodeDigestError::RemoteFetchError(format!("Failed to run gh: {e}")))?;

    Ok(output.status.success())
}
/// Clone repository using git
///
/// Shallow-clones `repo_url` into `<target_dir>/<last URL segment>`.
/// Returns `Ok(true)` on a zero exit status.
fn clone_with_git(
    repo_url: &str,
    target_dir: &std::path::Path,
    verbose: bool,
) -> Result<bool, CodeDigestError> {
    // Derive the checkout directory name from the last path segment of the
    // URL. Trailing slashes are trimmed first: previously a URL such as
    // "https://github.com/owner/repo/" produced an empty segment, so git
    // cloned straight into `target_dir` and `get_repo_path` (which expects
    // `target_dir/<repo>`) could not find the checkout.
    let repo_name = repo_url
        .trim_end_matches('/')
        .split('/')
        .next_back()
        .filter(|name| !name.is_empty())
        .ok_or_else(|| {
            CodeDigestError::InvalidConfiguration("Invalid repository URL".to_string())
        })?;

    let mut cmd = Command::new("git");
    cmd.arg("clone").arg("--depth").arg("1").arg(repo_url).arg(target_dir.join(repo_name));

    if verbose {
        eprintln!("🔄 Running: git clone --depth 1 {repo_url}");
    }

    let output = cmd
        .output()
        .map_err(|e| CodeDigestError::RemoteFetchError(format!("Failed to run git: {e}")))?;
    Ok(output.status.success())
}
/// Get the path to the cloned repository within the temp directory
///
/// The clone helpers place the checkout at `<temp_dir>/<repo>`, where
/// `<repo>` comes from parsing the URL; errors if that directory is absent.
pub fn get_repo_path(temp_dir: &TempDir, repo_url: &str) -> Result<PathBuf, CodeDigestError> {
    let (_owner, repo) = parse_github_url(repo_url)?;
    let repo_path = temp_dir.path().join(&repo);
    if repo_path.exists() {
        Ok(repo_path)
    } else {
        Err(CodeDigestError::RemoteFetchError(format!(
            "Repository directory not found after cloning: {}",
            repo_path.display()
        )))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_github_url_https() {
        // A standard https URL splits into owner and repo.
        let (owner, repo) = parse_github_url("https://github.com/rust-lang/rust").unwrap();
        assert_eq!(owner, "rust-lang");
        assert_eq!(repo, "rust");
    }

    #[test]
    fn test_parse_github_url_http() {
        // Plain http is accepted as well.
        let (owner, repo) = parse_github_url("http://github.com/rust-lang/rust").unwrap();
        assert_eq!(owner, "rust-lang");
        assert_eq!(repo, "rust");
    }

    #[test]
    fn test_parse_github_url_trailing_slash() {
        // A trailing slash is trimmed before parsing.
        let (owner, repo) = parse_github_url("https://github.com/rust-lang/rust/").unwrap();
        assert_eq!(owner, "rust-lang");
        assert_eq!(repo, "rust");
    }

    #[test]
    fn test_parse_github_url_invalid() {
        // Non-GitHub hosts, junk strings, and incomplete paths all fail.
        for bad in [
            "https://gitlab.com/rust-lang/rust",
            "not-a-url",
            "https://github.com/",
            "https://github.com/rust-lang",
        ] {
            assert!(parse_github_url(bad).is_err(), "expected error for {bad}");
        }
    }

    #[test]
    fn test_gh_available() {
        // Result depends on the environment; we only check it doesn't panic.
        let _ = gh_available();
    }

    #[test]
    fn test_git_available() {
        // Result depends on the environment; we only check it doesn't panic.
        let _ = git_available();
    }

    #[test]
    fn test_get_repo_path() {
        use std::fs;
        let temp_dir = TempDir::new().unwrap();
        let repo_url = "https://github.com/owner/repo";
        // The helper expects the cloned directory to already exist.
        fs::create_dir_all(temp_dir.path().join("repo")).unwrap();
        assert_eq!(get_repo_path(&temp_dir, repo_url).unwrap(), temp_dir.path().join("repo"));
    }

    #[test]
    fn test_get_repo_path_not_found() {
        // Without the cloned directory the lookup must fail.
        let temp_dir = TempDir::new().unwrap();
        assert!(get_repo_path(&temp_dir, "https://github.com/owner/repo").is_err());
    }
}
```
## utils/error.rs
```rust
//! Error types for code-digest
use thiserror::Error;
/// Main error type for code-digest operations
///
/// Each variant carries a human-readable message; `thiserror` derives the
/// `Display` and `std::error::Error` implementations from the `#[error(...)]`
/// attributes below.
#[derive(Error, Debug)]
pub enum CodeDigestError {
    /// A supplied path is missing, malformed, or otherwise unusable
    #[error("Invalid path: {0}")]
    InvalidPath(String),
    /// Reading a file's contents failed
    #[error("Failed to read file: {0}")]
    ReadError(String),
    /// Writing output to a file failed
    #[error("Failed to write file: {0}")]
    WriteError(String),
    /// A configuration value is present but invalid (e.g. a malformed URL)
    #[error("Invalid configuration: {0}")]
    InvalidConfiguration(String),
    /// The configuration file could not be parsed
    #[error("Failed to parse configuration: {0}")]
    ConfigParseError(String),
    /// Other configuration-file problem
    #[error("Configuration file error: {0}")]
    ConfigError(String),
    /// Counting tokens failed
    #[error("Token counting error: {0}")]
    TokenCountError(String),
    /// Producing the Markdown output failed
    #[error("Markdown generation error: {0}")]
    MarkdownGenerationError(String),
    /// Prioritizing files failed
    #[error("File prioritization error: {0}")]
    PrioritizationError(String),
    /// A required external LLM tool was not found; the message includes
    /// installation instructions for the missing tool
    #[error("{tool} not found. {install_instructions}")]
    LlmToolNotFound { tool: String, install_instructions: String },
    /// A spawned subprocess failed
    #[error("Subprocess error: {0}")]
    SubprocessError(String),
    /// A file exceeds the maximum allowed size (limit given in bytes)
    #[error("File too large: {0} (max: {1} bytes)")]
    FileTooLarge(String, usize),
    /// The generated output exceeds the configured token budget
    #[error("Token limit exceeded: {current} tokens (max: {max})")]
    TokenLimitExceeded { current: usize, max: usize },
    /// A user-supplied glob pattern failed to compile
    #[error("Invalid glob pattern: {0}")]
    InvalidGlobPattern(String),
    /// Fetching a remote repository failed
    #[error("Remote fetch error: {0}")]
    RemoteFetchError(String),
    /// General I/O error, converted automatically via `#[from]`
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),
    /// UTF-8 conversion error, converted automatically via `#[from]`
    #[error("UTF-8 conversion error: {0}")]
    Utf8Error(#[from] std::string::FromUtf8Error),
}
/// Result type alias for code-digest operations, defaulting the error
/// type to [`CodeDigestError`]
pub type Result<T> = std::result::Result<T, CodeDigestError>;
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_error_display() {
        // Display output comes from the #[error(...)] attributes.
        let path_err = CodeDigestError::InvalidPath("/invalid/path".to_string());
        assert_eq!(path_err.to_string(), "Invalid path: /invalid/path");

        let limit_err = CodeDigestError::TokenLimitExceeded { current: 200000, max: 150000 };
        assert_eq!(limit_err.to_string(), "Token limit exceeded: 200000 tokens (max: 150000)");
    }

    #[test]
    fn test_io_error_conversion() {
        // #[from] on IoError provides From<std::io::Error>.
        let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "file not found");
        assert!(matches!(CodeDigestError::from(io_err), CodeDigestError::IoError(_)));
    }
}
```
## utils/file_ext.rs
```rust
//! File extension to language mapping utilities
use std::path::Path;
/// File type enumeration for categorizing files
///
/// Produced by [`FileType::from_path`]; elsewhere in the crate the variant
/// is used for display (e.g. "Rust", "Markdown") and as an input to file
/// priority calculation.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileType {
    // Programming languages
    Rust,
    Python,
    JavaScript,
    TypeScript,
    Go,
    Java,
    Cpp,
    C,
    CSharp,
    Ruby,
    Php,
    Swift,
    Kotlin,
    Scala,
    Haskell,
    // Data formats
    Markdown,
    Json,
    Yaml,
    Toml,
    Xml,
    Html,
    Css,
    // Other
    Text,  // readable but uncategorized (plain text, known text-like names)
    Other, // everything else, including binary formats
}
impl FileType {
    /// Determine file type from path
    ///
    /// The lowercased extension is matched first; unrecognized files fall
    /// back to classification by bare file name, then to `Text` unless the
    /// extension is a known binary one.
    pub fn from_path(path: &Path) -> Self {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .map(str::to_lowercase)
            .unwrap_or_default();

        match ext.as_str() {
            // Programming languages
            "rs" => FileType::Rust,
            "py" => FileType::Python,
            "js" | "mjs" | "cjs" => FileType::JavaScript,
            "ts" | "tsx" => FileType::TypeScript,
            "go" => FileType::Go,
            "java" => FileType::Java,
            "cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "h++" => FileType::Cpp,
            "c" | "h" => FileType::C,
            "cs" => FileType::CSharp,
            "rb" => FileType::Ruby,
            "php" => FileType::Php,
            "swift" => FileType::Swift,
            "kt" | "kts" => FileType::Kotlin,
            "scala" => FileType::Scala,
            "hs" => FileType::Haskell,
            // Data and markup formats
            "md" | "markdown" => FileType::Markdown,
            "json" => FileType::Json,
            "yaml" | "yml" => FileType::Yaml,
            "toml" => FileType::Toml,
            "xml" => FileType::Xml,
            "html" | "htm" => FileType::Html,
            "css" | "scss" | "sass" | "less" => FileType::Css,
            "txt" | "text" => FileType::Text,
            _ => Self::classify_unrecognized(path),
        }
    }

    /// Fallback for unknown extensions: well-known file names count as
    /// text, as does anything whose extension is not a known binary format.
    fn classify_unrecognized(path: &Path) -> Self {
        let filename = path.file_name().and_then(|name| name.to_str()).unwrap_or("");
        match filename {
            "README" | "LICENSE" | "CHANGELOG" | "AUTHORS" | "CONTRIBUTORS" | "Makefile"
            | "Dockerfile" | "Vagrantfile" | "Jenkinsfile" => FileType::Text,
            _ if !is_binary_extension(path) => FileType::Text,
            _ => FileType::Other,
        }
    }
}
/// Get the markdown code fence language for a file extension
///
/// Returns the label to place after the opening ``` fence. Unknown or
/// missing extensions fall back to "text", so the result is always a
/// usable fence label.
pub fn get_language_from_extension(path: &Path) -> &'static str {
    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("").to_lowercase();

    match ext.as_str() {
        // Programming languages
        "rs" => "rust",
        "py" => "python",
        "js" | "mjs" | "cjs" => "javascript",
        "ts" | "tsx" => "typescript",
        "jsx" => "jsx",
        "go" => "go",
        "c" => "c",
        "cpp" | "cc" | "cxx" | "c++" => "cpp",
        "h" | "hpp" | "hxx" => "cpp",
        "cs" => "csharp",
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "swift" => "swift",
        "rb" => "ruby",
        "php" => "php",
        "lua" => "lua",
        "r" => "r",
        "scala" => "scala",
        "clj" | "cljs" => "clojure",
        "ex" | "exs" => "elixir",
        "elm" => "elm",
        "hs" => "haskell",
        "ml" | "mli" => "ocaml",
        "fs" | "fsx" => "fsharp",
        "pl" => "perl",
        "sh" => "bash",
        "fish" => "fish",
        "zsh" => "zsh",
        "ps1" => "powershell",
        "dart" => "dart",
        "julia" | "jl" => "julia",
        "nim" => "nim",
        "zig" => "zig",
        "v" => "v",
        "d" => "d",
        // Web technologies
        "html" | "htm" => "html",
        "css" => "css",
        "scss" | "sass" => "scss",
        "less" => "less",
        "vue" => "vue",
        "svelte" => "svelte",
        // Data formats
        "json" => "json",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "xml" => "xml",
        "csv" => "csv",
        "sql" => "sql",
        // Markup languages
        "md" | "markdown" => "markdown",
        "tex" => "latex",
        "rst" => "rst",
        "adoc" | "asciidoc" => "asciidoc",
        // Configuration files
        "ini" | "cfg" => "ini",
        "conf" | "config" => "text",
        "env" => "dotenv",
        "dockerfile" => "dockerfile",
        "makefile" | "mk" => "makefile",
        // Shell scripts
        "bash" => "bash",
        "bat" | "cmd" => "batch",
        // Other
        "proto" => "protobuf",
        "graphql" | "gql" => "graphql",
        "tf" => "hcl",
        "vim" => "vim",
        "diff" | "patch" => "diff",
        // Default to text for unknown extensions
        _ => "text",
    }
}
/// Check if a file is likely to be binary based on its extension
///
/// Purely name-based: the file's contents are never read. Files without
/// an extension are assumed not to be binary.
pub fn is_binary_extension(path: &Path) -> bool {
    // Lowercased extensions of well-known binary formats.
    const BINARY_EXTENSIONS: &[&str] = &[
        // Executables and libraries
        "exe", "dll", "so", "dylib", "a", "lib", "bin",
        // Archives
        "zip", "tar", "gz", "bz2", "xz", "7z", "rar",
        // Images
        "jpg", "jpeg", "png", "gif", "bmp", "ico", "svg", "webp",
        // Audio
        "mp3", "wav", "flac", "aac", "ogg", "wma",
        // Video
        "mp4", "avi", "mkv", "mov", "wmv", "flv", "webm",
        // Documents
        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
        // Fonts
        "ttf", "otf", "woff", "woff2", "eot",
        // Database
        "db", "sqlite", "sqlite3",
        // Other binary formats
        "pyc", "pyo", "class", "o", "obj", "pdb",
    ];

    path.extension()
        .and_then(|ext| ext.to_str())
        .map(|ext| BINARY_EXTENSIONS.contains(&ext.to_lowercase().as_str()))
        .unwrap_or(false)
}
/// Detect if content contains binary data (null bytes)
///
/// Only the first 8KB are scanned: this keeps the check cheap while still
/// catching the common case of NUL bytes early in binary formats.
pub fn is_binary_content(content: &[u8]) -> bool {
    content.iter().take(8192).any(|&byte| byte == 0)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    #[test]
    fn test_language_detection() {
        // Known extensions map to their fence label; everything else is "text".
        let cases = [
            ("test.rs", "rust"),
            ("test.py", "python"),
            ("test.js", "javascript"),
            ("test.unknown", "text"),
            ("Makefile", "text"),
        ];
        for (file, language) in cases {
            assert_eq!(get_language_from_extension(Path::new(file)), language);
        }
    }

    #[test]
    fn test_binary_extension_detection() {
        for binary in ["test.exe", "image.png", "archive.zip"] {
            assert!(is_binary_extension(Path::new(binary)));
        }
        for text in ["code.rs", "text.md"] {
            assert!(!is_binary_extension(Path::new(text)));
        }
    }

    #[test]
    fn test_binary_content_detection() {
        assert!(!is_binary_content(b"Hello, world!"));
        assert!(is_binary_content(b"Hello\0world"));
        assert!(is_binary_content(&[0xFF, 0xFE, 0x00, 0x00]));
    }
}
```
## utils/mod.rs
```rust
//! Utility modules
pub mod error;
pub mod file_ext;
```