//! dumpfiles 0.3.0
//!
//! A CLI and library for generating structured YAML representations of directory contents, optimized for efficiently sharing codebases with LLMs.
//!
//! Documentation
extern crate anyhow;
extern crate ignore;
extern crate tempfile;
extern crate tiktoken_rs;

use anyhow::Context;
use ignore::{overrides::OverrideBuilder, WalkBuilder};
use std::io::{BufWriter, Write};
use std::path::Path;
use tempfile::NamedTempFile;
use tiktoken_rs::o200k_base;

/// Selects which ignore files are honoured while walking the directory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GitignoreMode {
    /// Discover ignore files automatically (`.ignore`, `.gitignore`, global
    /// and repository-local git excludes), even outside a git repository.
    Auto,
    /// Disable automatic discovery and use only the ignore file at this path.
    Path(std::path::PathBuf),
    /// Disable all ignore-file handling.
    Disabled,
}

pub fn write_directory_contents_yaml(
    directory: &Path,
    output: &Path,
    ignore_patterns: &[String],
    gitignore_mode: GitignoreMode,
    dumpignore_path: Option<&Path>,
) -> anyhow::Result<()> {
    // Load BPE tokenizer
    let bpe = o200k_base().context("Failed to load BPE tokenizer")?;

    // Canonicalize the directory for absolute paths.
    let absolute_directory = directory
        .canonicalize()
        .context("Failed to get absolute path")?;

    // Set up the walker with ignore patterns and .gitignore/.dumpignore.
    let mut walker = WalkBuilder::new(&absolute_directory);
    if let Some(dumpignore) = dumpignore_path {
        walker.add_custom_ignore_filename(dumpignore);
    }

    // Create override builder for ignore patterns
    if !ignore_patterns.is_empty() {
        let mut override_builder = OverrideBuilder::new(&absolute_directory);
        for pattern in ignore_patterns {
            // Add patterns with ! prefix to ignore them (gitignore semantics)
            let ignore_pattern = if pattern.starts_with('!') {
                pattern.to_string()
            } else {
                format!("!{}", pattern)
            };
            if let Err(err) = override_builder.add(&ignore_pattern) {
                log::warn!("Failed to add ignore pattern '{}': {}", pattern, err);
            }
        }
        if let Ok(overrides) = override_builder.build() {
            walker.overrides(overrides);
        }
    }

    // Gitignore behavior per the desired semantics
    match gitignore_mode {
        GitignoreMode::Auto => {
            // Default discovery ON (this is WalkBuilder’s default, but be explicit)
            walker.require_git(false); // do not require a .git directory
            walker.ignore(true); // .ignore
            walker.git_ignore(true); // discovered .gitignore
            walker.git_global(true); // global excludes
            walker.git_exclude(true); // core excludes
        }
        GitignoreMode::Path(ref p) => {
            // Use only the provided file; disable discovery
            walker.require_git(false);
            walker.ignore(false);
            walker.git_ignore(false);
            walker.git_global(false);
            walker.git_exclude(false);
            walker.add_ignore(p);
        }
        GitignoreMode::Disabled => {
            // Disable everything
            walker.require_git(false);
            walker.ignore(false);
            walker.git_ignore(false);
            walker.git_global(false);
            walker.git_exclude(false);
        }
    }

    walker.follow_links(false);

    // Always ignore the output file.
    let output_abs = if output.is_absolute() {
        output.to_path_buf()
    } else {
        absolute_directory.join(output)
    };

    // Canonicalize to match walker's canonicalized entries, but fall back if it doesn't exist yet.
    let output_abs = output_abs.canonicalize().unwrap_or(output_abs);

    walker.filter_entry(move |entry| entry.path() != output_abs);

    let walker_iter = walker.build();

    // Temporary file to write output before moving to final destination
    let tmp = NamedTempFile::new().context("Failed to create temporary file")?;
    let mut writer = BufWriter::new(tmp.reopen()?);

    // Write the YAML header. Here, the project name is taken from the directory's final component.
    let project_name = absolute_directory
        .file_name()
        .map_or("project", |s| s.to_str().unwrap());
    writeln!(writer, "project: {}", project_name)?;
    writeln!(writer, "files:")?;

    // Track statistics for logging
    let mut file_count = 0;
    let mut total_characters = 0;
    let mut total_tokens = 0;

    // Iterate over the walker and write each file as a flat YAML entry.
    for entry in walker_iter {
        let entry = entry.context("Failed to read directory entry")?;
        if entry.file_type().is_some_and(|ft| ft.is_file()) {
            // Get the file's path relative to the directory.
            let relative_path = entry
                .path()
                .strip_prefix(&absolute_directory)
                .context("Failed to get relative path")?;
            let relative_path_str = relative_path.to_string_lossy();

            // Retrieve file metadata to compute its size.
            let metadata = entry.metadata().context("Failed to get metadata")?;
            let size_bytes = metadata.len();
            let size_str = if size_bytes < 1024 {
                format!("{} B", size_bytes)
            } else {
                let kb = size_bytes as f64 / 1024.0;
                format!("{:.1} KB", kb)
            };

            let (lines, tokens, content) = match std::fs::read_to_string(entry.path()) {
                Ok(text) => {
                    let line_count = text.lines().count();
                    let tokens = bpe.encode_with_special_tokens(&text);
                    (line_count, tokens.len(), text)
                }
                Err(_) => (
                    0,
                    0,
                    format!("Binary or inaccessible file: {}", entry.path().display()),
                ),
            };

            // Update statistics
            file_count += 1;
            total_characters += content.chars().count();
            total_tokens += tokens;

            // Write the YAML mapping for this file.
            writeln!(writer, "  - path: {:?}", relative_path_str)?;
            writeln!(writer, "    size: \"{}\"", size_str)?;
            writeln!(writer, "    lines: {}", lines)?;
            writeln!(writer, "    tokens: {}", tokens)?;
            writeln!(writer, "    content: |")?;
            // Indent each line of content with six spaces.
            for line in content.lines() {
                writeln!(writer, "      {}", line)?;
            }
        }
    }

    // Rename temporary file to the output location
    if let Some(parent) = output.parent() {
        std::fs::create_dir_all(parent).ok();
    }
    tmp.persist(output).map_err(|e| anyhow::anyhow!(e.error))?;

    writer.flush()?;

    // Log summary statistics
    log::info!(
        "Processed {} files with {} total characters and {} total tokens.",
        file_count,
        total_characters,
        total_tokens
    );

    Ok(())
}