//! repo-flatten 0.2.1
//!
//! A utility to flatten all files in a repository into a single file for
//! consumption by LLMs. Files matched by `.gitignore` and hidden files are
//! excluded.
//!
//! Repository handling module for git operations.

use anyhow::{Context, Result};
use git2::Repository;
use std::collections::HashSet;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use tracing::{info, info_span};

/// Repository context containing the git repository and its root path.
pub struct RepositoryContext {
    /// Handle to the discovered (non-bare) git repository.
    pub repo: Repository,
    /// Absolute path to the repository's working-directory root.
    pub root: PathBuf,
}

/// Information about the current working directory state
pub struct WorkingDirInfo {
    /// Short name of the checked-out branch, or "detached HEAD" when HEAD
    /// points at a commit, or "unknown" when HEAD cannot be resolved.
    pub branch_name: String,
    /// Human-readable summary of pending changes, e.g. "3 files with changes"
    /// or "clean working directory".
    pub status_summary: String,
}

/// Discover a git repository starting from the given path.
///
/// Searches upwards from `repo_path` (as `git` itself does) until a
/// repository is found.
///
/// # Errors
///
/// Returns an error if no repository can be discovered from `repo_path`, or
/// if the discovered repository is bare (has no working directory to flatten).
#[tracing::instrument(level = "info", skip(repo_path), fields(repo_path=%repo_path.display()), err)]
pub fn discover_repository(repo_path: &Path) -> Result<RepositoryContext> {
    // `#[tracing::instrument]` already opens a span for this function; a
    // manual `info_span!` here would only create a redundant nested span.
    info!("discovering git repository");

    // Discover the repository, searching upwards from the given path.
    let repo = Repository::discover(repo_path).with_context(|| {
        format!(
            "Failed to discover repository from '{}'",
            repo_path.display()
        )
    })?;

    // The workdir is the root of a non-bare repository; bare repositories
    // have no working tree and therefore nothing to flatten.
    let root = repo
        .workdir()
        .context("Repository is bare, cannot flatten.")?
        .to_path_buf();

    info!(repo_root=%root.display(), "repository discovered successfully");

    Ok(RepositoryContext { repo, root })
}

/// Get information about the current working directory state.
///
/// # Errors
///
/// Returns an error if `git status` cannot be computed for the repository.
#[tracing::instrument(level = "info", skip(repo), err)]
pub fn get_working_dir_info(repo: &Repository) -> Result<WorkingDirInfo> {
    // `#[tracing::instrument]` already opens a span; no manual span needed.
    info!("analyzing working directory state");

    // Current branch name, with descriptive placeholders for the edge cases:
    // a HEAD with no shorthand (detached) or an unresolvable HEAD.
    let branch_name = match repo.head() {
        Ok(head) => head.shorthand().unwrap_or("detached HEAD").to_string(),
        Err(_) => "unknown".to_string(),
    };

    // Count status entries that represent real changes (staged or unstaged):
    // anything reported by git status that is neither ignored nor empty.
    let change_count = repo
        .statuses(None)?
        .iter()
        .filter(|entry| {
            let flags = entry.status();
            !flags.is_ignored() && !flags.is_empty()
        })
        .count();

    let status_summary = if change_count > 0 {
        format!("{change_count} files with changes")
    } else {
        "clean working directory".to_string()
    };

    info!(branch=%branch_name, changes=%change_count, "working directory analyzed");

    Ok(WorkingDirInfo {
        branch_name,
        status_summary,
    })
}

/// Walk the working directory and write current files (including unstaged changes) to the output writer.
///
/// The file set is the union of paths reported by `git status` (excluding
/// ignored entries) and paths tracked in the index that still exist on disk.
/// When `filter_paths` is non-empty, only files equal to a filter file or
/// located under a filter directory are written. Files are emitted in sorted
/// path order so the output is deterministic.
///
/// Returns the number of files written.
///
/// # Errors
///
/// Fails if the repository is bare, if status/index access fails, if an index
/// path is not valid UTF-8, or if reading/writing a file fails.
#[tracing::instrument(level = "info", skip(repo, writer), fields(filter_count=filter_paths.len()), err)]
pub fn walk_and_write_working_dir<W: Write>(
    repo: &Repository,
    filter_paths: &[PathBuf],
    writer: &mut W,
) -> Result<usize> {
    // `#[tracing::instrument]` already opens a span; no manual span needed.
    info!("starting working directory walk");

    let repo_root = repo
        .workdir()
        .context("Repository has no working directory")?;

    // Collect candidate files from `git status`. This is much faster than
    // checking each file individually and covers untracked/modified files
    // while skipping everything matched by .gitignore.
    let mut files_to_include = HashSet::new();
    let statuses = repo.statuses(None)?;
    for entry in statuses.iter() {
        if let Some(path) = entry.path() {
            // Exclude ignored entries; everything else is a candidate.
            if !entry.status().is_ignored() {
                files_to_include.insert(repo_root.join(path));
            }
        }
    }

    // Also include all files tracked in the index so unchanged committed
    // files are flattened too. Deleted-but-still-indexed paths are skipped.
    let index = repo.index()?;
    for entry in index.iter() {
        let path_str = std::str::from_utf8(&entry.path)?;
        let full_path = repo_root.join(path_str);
        if full_path.exists() {
            files_to_include.insert(full_path);
        }
    }

    // Apply user-provided filters, if any. Each filter is classified once
    // (exact file vs. directory prefix) instead of stat-ing it again for
    // every candidate file.
    let mut files_to_process: Vec<PathBuf> = if filter_paths.is_empty() {
        files_to_include.into_iter().collect()
    } else {
        let filters: Vec<(PathBuf, bool)> = filter_paths
            .iter()
            .map(|p| {
                let abs = if p.is_absolute() {
                    p.clone()
                } else {
                    repo_root.join(p)
                };
                let is_file = abs.is_file();
                (abs, is_file)
            })
            .collect();

        files_to_include
            .into_iter()
            .filter(|file_path| {
                // Keep the file if it matches any filter: equality for file
                // filters, prefix containment for directory filters.
                filters.iter().any(|(filter_path, is_file)| {
                    if *is_file {
                        file_path == filter_path
                    } else {
                        file_path.starts_with(filter_path)
                    }
                })
            })
            .collect()
    };

    // Sort so output is deterministic; HashSet iteration order would
    // otherwise vary between runs.
    files_to_process.sort();

    // Write every file that still exists as a regular file.
    let mut file_count = 0;
    for file_path in files_to_process {
        if file_path.exists() && file_path.is_file() {
            process_file(&file_path, repo_root, writer)?;
            file_count += 1;
        }
    }

    info!(
        files_processed = file_count,
        "working directory walk completed successfully"
    );
    Ok(file_count)
}

/// Process a single file and write it to the output.
///
/// Emits a `--- File: <relative path> ---` header followed by the file's
/// content, or a placeholder line when the content looks binary.
fn process_file<W: Write>(file_path: &Path, repo_root: &Path, writer: &mut W) -> Result<()> {
    let file_span = info_span!("repository.process_file", file_path=%file_path.display());
    let _file_guard = file_span.enter();

    // Display paths relative to the repository root where possible.
    let relative_path = file_path.strip_prefix(repo_root).unwrap_or(file_path);

    // Read the content *before* writing the header so that a read failure
    // does not leave a dangling header with no body in the output.
    let content = fs::read(file_path)
        .with_context(|| format!("Failed to read file: {}", file_path.display()))?;

    // Write the file header
    writeln!(writer, "--- File: {} ---", relative_path.display())
        .context("Failed to write file header")?;

    if is_binary_content(&content) {
        // Binary blobs are not useful in a flattened text dump; emit a
        // placeholder instead.
        writeln!(writer, "[Binary file: content not included]\n")
            .context("Failed to write binary file placeholder")?;
    } else {
        // Write text content, then a blank line separating it from the next
        // file's header.
        writer
            .write_all(&content)
            .context("Failed to write file content")?;
        writeln!(writer, "\n").context("Failed to write trailing newline")?;
    }

    info!(file_path=%relative_path.display(), "file processed successfully");
    Ok(())
}

/// Simple heuristic to detect binary content: a NUL byte anywhere in the
/// first 8 KiB marks the content as binary.
fn is_binary_content(content: &[u8]) -> bool {
    content.iter().take(8192).any(|&byte| byte == 0)
}