repo-flatten 0.2.0

A utility that flattens all files in a repository into a single file for consumption by LLMs. Files matched by .gitignore and hidden files are skipped.
//! Repository handling module for git operations.

use anyhow::{Context, Result};
use git2::Repository;
use std::collections::HashSet;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use tracing::{info, info_span, warn};

/// Repository context containing the git repository and its root path.
///
/// Built by `discover_repository`, which pairs the opened repository handle
/// with the working-directory path it resolved for that repository.
pub struct RepositoryContext {
    /// Handle to the opened git repository.
    pub repo: Repository,
    /// Working-directory root of `repo` (the value of `repo.workdir()` at discovery time).
    pub root: PathBuf,
}

/// Information about the current working directory state.
///
/// Produced by `get_working_dir_info`; both fields are ready-to-display strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WorkingDirInfo {
    /// Short name of the checked-out branch, or a placeholder such as
    /// `"detached HEAD"` / `"unknown"` when no branch name is available.
    pub branch_name: String,
    /// Human-readable summary: either `"<N> files with changes"` or
    /// `"clean working directory"`.
    pub status_summary: String,
}

/// Locate the git repository that contains `repo_path`.
///
/// The search starts at `repo_path` and moves upwards through parent
/// directories until a repository is found.
///
/// # Errors
///
/// Fails when no repository can be discovered from `repo_path`, or when the
/// repository is bare and therefore has no working directory to flatten.
#[tracing::instrument(level = "info", skip(repo_path), fields(repo_path=%repo_path.display()), err)]
pub fn discover_repository(repo_path: &Path) -> Result<RepositoryContext> {
    let _guard = info_span!("repository.discover").entered();

    info!("discovering git repository");

    // Search upwards from the given path for an enclosing repository.
    let discovered = Repository::discover(repo_path);
    let repo = discovered.with_context(|| {
        format!(
            "Failed to discover repository from '{}'",
            repo_path.display()
        )
    })?;

    // A bare repository has no workdir; the workdir doubles as the repository root.
    let root = repo
        .workdir()
        .map(Path::to_path_buf)
        .context("Repository is bare, cannot flatten.")?;

    info!(repo_root=%root.display(), "repository discovered successfully");

    Ok(RepositoryContext { repo, root })
}

/// Get information about the current working directory state.
///
/// Reports the checked-out branch and how many paths carry changes
/// (staged, unstaged, or untracked). Ignored paths are not counted.
///
/// The branch name is:
/// - `"detached HEAD"` when HEAD does not point at a branch,
/// - `"unknown"` when HEAD cannot be resolved at all,
/// - otherwise the short branch name (non-UTF-8 names are converted lossily).
///
/// # Errors
///
/// Fails when the repository status cannot be read.
#[tracing::instrument(level = "info", skip(repo), err)]
pub fn get_working_dir_info(repo: &Repository) -> Result<WorkingDirInfo> {
    let span = info_span!("repository.get_working_dir_info");
    let _guard = span.enter();

    info!("analyzing working directory state");

    let branch_name = match repo.head() {
        // For a detached HEAD the reference's shorthand is the literal "HEAD"
        // (it is not `None`), so detachment has to be queried explicitly.
        Ok(_) if repo.head_detached().unwrap_or(false) => "detached HEAD".to_string(),
        // `shorthand()` is `None` only for non-UTF-8 names; convert lossily
        // instead of mislabelling such a branch.
        Ok(head) => String::from_utf8_lossy(head.shorthand_bytes()).into_owned(),
        Err(_) => "unknown".to_string(),
    };

    // Count every status entry that represents a real change; ignored
    // entries and entries with no flags set are not changes.
    let statuses = repo
        .statuses(None)
        .context("Failed to read repository status")?;
    let change_count = statuses
        .iter()
        .map(|entry| entry.status())
        .filter(|flags| !flags.is_ignored() && !flags.is_empty())
        .count();

    let status_summary = if change_count > 0 {
        format!("{change_count} files with changes")
    } else {
        "clean working directory".to_string()
    };

    info!(branch=%branch_name, changes=%change_count, "working directory analyzed");

    Ok(WorkingDirInfo {
        branch_name,
        status_summary,
    })
}

/// Flatten the working directory (current on-disk contents, so unstaged
/// changes are included) into `writer`.
///
/// `filter_paths` restricts the walk to the given files or directories;
/// relative entries are resolved against the repository's working directory.
/// An empty slice means "walk the whole working directory". Filter paths that
/// do not exist are logged and skipped.
///
/// Returns the number of files written.
///
/// # Errors
///
/// Fails when the repository has no working directory, when its status
/// cannot be read, or when walking or writing any file fails.
#[tracing::instrument(level = "info", skip(repo, writer), fields(filter_count=filter_paths.len()), err)]
pub fn walk_and_write_working_dir<W: Write>(
    repo: &Repository,
    filter_paths: &[PathBuf],
    writer: &mut W,
) -> Result<usize> {
    let _guard = info_span!("repository.walk_working_dir").entered();

    info!("starting working directory walk");

    let repo_root = repo
        .workdir()
        .context("Repository has no working directory")?;

    // Split the status entries into "ignored" and "everything else",
    // keyed by absolute path.
    let statuses = repo.statuses(None)?;
    let mut git_tracked_files = HashSet::new();
    let mut ignored_files = HashSet::new();
    for entry in statuses.iter() {
        let absolute = match entry.path() {
            Some(relative) => repo_root.join(relative),
            None => continue,
        };
        let bucket = if entry.status().is_ignored() {
            &mut ignored_files
        } else {
            &mut git_tracked_files
        };
        bucket.insert(absolute);
    }

    // `Path::join` returns its argument unchanged when that argument is
    // absolute, so one `join` covers both absolute and relative filters.
    let paths_to_walk: Vec<PathBuf> = if filter_paths.is_empty() {
        vec![repo_root.to_path_buf()]
    } else {
        filter_paths
            .iter()
            .map(|filter| repo_root.join(filter))
            .collect()
    };

    let mut total = 0;
    for start in &paths_to_walk {
        if start.exists() {
            total += walk_directory_recursive(
                start,
                repo_root,
                &git_tracked_files,
                &ignored_files,
                writer,
            )?;
        } else {
            warn!(path=%start.display(), "filter path does not exist, skipping");
        }
    }

    info!(
        files_processed = total,
        "working directory walk completed successfully"
    );
    Ok(total)
}

/// Recursively walk a directory and write files that are not ignored by git.
///
/// `dir_path` may also name a single file, in which case only that file is
/// considered. Directories listed in `ignored_files` are skipped entirely,
/// and any entry named `.git` is never entered.
///
/// Returns the number of files written to `writer`.
///
/// # Errors
///
/// Fails when a directory cannot be read or when processing a file fails.
fn walk_directory_recursive<W: Write>(
    dir_path: &Path,
    repo_root: &Path,
    git_tracked_files: &HashSet<PathBuf>,
    ignored_files: &HashSet<PathBuf>,
    writer: &mut W,
) -> Result<usize> {
    let mut file_count = 0;

    if dir_path.is_file() {
        // If it's a single file, process it directly
        if should_include_file(dir_path, git_tracked_files, ignored_files)? {
            process_file(dir_path, repo_root, writer)?;
            file_count += 1;
        }
        return Ok(file_count);
    }

    // Git status can report a wholly ignored directory (e.g. `target/`) as a
    // single entry for the directory itself, without listing the files inside
    // it. Checking only files would therefore leak the directory's contents
    // into the output, so prune the directory here.
    if ignored_files.contains(dir_path) {
        return Ok(file_count);
    }

    // Read directory contents
    let entries = fs::read_dir(dir_path)
        .with_context(|| format!("Failed to read directory: {}", dir_path.display()))?;

    for entry in entries {
        let entry = entry.with_context(|| {
            format!("Failed to read directory entry in: {}", dir_path.display())
        })?;
        let path = entry.path();

        // Skip .git directory
        if path.file_name().map_or(false, |n| n == ".git") {
            continue;
        }

        if path.is_file() {
            if should_include_file(&path, git_tracked_files, ignored_files)? {
                process_file(&path, repo_root, writer)?;
                file_count += 1;
            }
        } else if path.is_dir() {
            // Recurse; an ignored subdirectory returns 0 immediately via the
            // check at the top of this function.
            file_count += walk_directory_recursive(
                &path,
                repo_root,
                git_tracked_files,
                ignored_files,
                writer,
            )?;
        }
        // Anything else (e.g. a broken symlink) is neither file nor directory
        // and is skipped.
    }

    Ok(file_count)
}

/// Determine if a file should be included based on git status.
///
/// Decision order:
/// 1. A path listed in `ignored_files` is never included.
/// 2. A path listed in `git_tracked_files` (it has a non-ignored status
///    entry) is included.
/// 3. Any other path is included unless one of its parent directories is
///    listed in `ignored_files`. Git status can report a wholly ignored
///    directory as a single entry, so the files inside it never appear in
///    `ignored_files` themselves.
///
/// Always returns `Ok`; the `Result` is kept so callers can continue to use `?`.
fn should_include_file(
    file_path: &Path,
    git_tracked_files: &HashSet<PathBuf>,
    ignored_files: &HashSet<PathBuf>,
) -> Result<bool> {
    // Never include ignored files
    if ignored_files.contains(file_path) {
        return Ok(false);
    }

    // A file with its own non-ignored status entry is known to git directly.
    if git_tracked_files.contains(file_path) {
        return Ok(true);
    }

    // `ancestors()` yields the path itself first; that exact match was
    // already checked above, so start from the parent directory.
    let under_ignored_dir = file_path
        .ancestors()
        .skip(1)
        .any(|ancestor| ignored_files.contains(ancestor));

    Ok(!under_ignored_dir)
}

/// Emit one file into the flattened output.
///
/// Writes a `--- File: <path> ---` header (path shown relative to
/// `repo_root` when possible), followed by either the raw file bytes or a
/// placeholder line when the content looks binary. Each entry is terminated
/// by a blank line.
///
/// # Errors
///
/// Fails when the file cannot be read or when writing to `writer` fails.
fn process_file<W: Write>(file_path: &Path, repo_root: &Path, writer: &mut W) -> Result<()> {
    let _file_guard =
        info_span!("repository.process_file", file_path=%file_path.display()).entered();

    // Fall back to the path as given when it does not live under the repository root.
    let shown_path = match file_path.strip_prefix(repo_root) {
        Ok(relative) => relative,
        Err(_) => file_path,
    };

    writeln!(writer, "--- File: {} ---", shown_path.display())
        .context("Failed to write file header")?;

    let bytes = fs::read(file_path)
        .with_context(|| format!("Failed to read file: {}", file_path.display()))?;

    if !is_binary_content(&bytes) {
        // Text: copy the bytes verbatim, then terminate the entry.
        writer
            .write_all(&bytes)
            .context("Failed to write file content")?;
        writeln!(writer, "\n").context("Failed to write trailing newline")?;
    } else {
        // Binary: emit a placeholder instead of the raw bytes.
        writeln!(writer, "[Binary file: content not included]\n")
            .context("Failed to write binary file placeholder")?;
    }

    info!(file_path=%shown_path.display(), "file processed successfully");
    Ok(())
}

/// Heuristically decide whether `content` is binary.
///
/// Only the first 8 KiB are inspected: a NUL byte anywhere in that prefix
/// marks the content as binary. Empty input is treated as text.
fn is_binary_content(content: &[u8]) -> bool {
    // Number of leading bytes to scan for a NUL.
    const SNIFF_LIMIT: usize = 8192;

    content.iter().take(SNIFF_LIMIT).any(|&byte| byte == 0)
}