use crate::token_cleaner::{clean_and_redact, count_tokens};
use anyhow::{anyhow, Result};
use regex::Regex;
use std::collections::HashSet;
use std::path::{Component, Path, PathBuf};
use std::process::Command;
use tokio::fs;
/// File extensions (lowercase, with a leading dot) that `read_file_content` hands to the
/// OCR reader instead of reading as text. Only compiled on macOS with the `embeddings`
/// feature enabled.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
const OCR_FILE_TYPES: &[&str] = &[
".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf",
];
/// Default file extensions to skip when building on macOS with the `embeddings` feature.
/// The image and PDF extensions listed in `OCR_FILE_TYPES` are intentionally absent here,
/// so those files are not filtered out by extension in this configuration.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
const DEFAULT_FILE_TYPE_EXCLUSIONS: &[&str] = &[
".svg", ".ico", ".ttf", ".woff", ".woff2", ".eot", ".otf", ".lock", ".lockb", ".exe", ".dll",
".so", ".dylib", ".bin", ".dat", ".pyc", ".pyo", ".class", ".jar", ".zip", ".tar", ".gz",
".rar", ".7z", ".mp3", ".mp4", ".avi", ".mov", ".wav", ".db", ".sqlite", ".sqlite3",
];
/// Default file extensions to skip on every build that is not macOS + `embeddings`.
/// Compared with the macOS/`embeddings` list this one additionally contains the image
/// and PDF extensions (`.jpg` … `.pdf`).
#[cfg(any(not(target_os = "macos"), not(feature = "embeddings")))]
const DEFAULT_FILE_TYPE_EXCLUSIONS: &[&str] = &[
".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf", ".svg", ".ico", ".ttf",
".woff", ".woff2", ".eot", ".otf", ".lock", ".lockb", ".exe", ".dll", ".so", ".dylib", ".bin",
".dat", ".pyc", ".pyo", ".class", ".jar", ".zip", ".tar", ".gz", ".rar", ".7z", ".mp3", ".mp4",
".avi", ".mov", ".wav", ".db", ".sqlite", ".sqlite3",
];
/// Default glob-style path patterns for files that should be left out of the generated
/// document. Used as the default value of `MarkdownGeneratorOptions::file_exclusions`.
const DEFAULT_FILE_EXCLUSIONS: &[&str] = &[
// Tool configuration, lock files and ignore files.
"**/.*rc",
"**/.*rc.{js,json,yaml,yml}",
"**/*.config.{js,ts}",
"**/tsconfig.json",
"**/tsconfig*.json",
"**/jsconfig.json",
"**/jsconfig*.json",
"**/package-lock.json",
"**/.prettierignore",
"**/.dockerignore",
// Environment / variable files and secrets.
"**/.env*",
"**/*.vars",
"**/secrets.*",
// Version-control metadata.
"**/.git*",
"**/.hg*",
"**/.svn*",
"**/CVS",
// CI configuration.
"**/.github/",
"**/.gitlab-ci.yml",
"**/azure-pipelines.yml",
"**/jenkins*",
// Dependency, virtual-environment and build-output directories.
"**/node_modules/",
"**/target/",
"**/__pycache__/",
"**/venv/",
"**/.venv/",
"**/env/",
"**/build/",
"**/dist/",
"**/out/",
"**/bin/",
"**/obj/",
// Project documentation.
"**/README*",
"**/CHANGELOG*",
"**/CONTRIBUTING*",
"**/LICENSE*",
"**/docs/",
"**/documentation/",
// Editor / IDE settings.
"**/.idea/",
"**/.vscode/",
"**/.eclipse/",
"**/.settings/",
"**/.zed/",
"**/.cursor/",
"**/.project",
"**/.classpath",
"**/.factorypath",
// Tests, fixtures and coverage output.
"**/test{s,}/",
"**/spec/",
"**/fixtures/",
"**/testdata/",
"**/__tests__/",
"**/*.{test,spec}.*",
"**/coverage/",
"**/jest.config.*",
// Logs and temporary files.
"**/logs/",
"**/tmp/",
"**/temp/",
"**/*.log",
];
/// User-facing configuration for a `MarkdownGenerator`.
pub struct MarkdownGeneratorOptions {
/// Repository directory to scan; `git ls-files` is executed with this as its working directory.
pub dir: PathBuf,
/// Path the generated markdown document is written to.
pub output_file_path: PathBuf,
/// File extensions (with a leading dot, e.g. `.png`) whose files are skipped.
pub file_type_exclusions: HashSet<String>,
/// Glob-style path patterns; tracked files matching any of them are skipped.
pub file_exclusions: Vec<String>,
/// When `true`, progress information and token counts are printed while running.
pub verbose: bool,
}
impl Default for MarkdownGeneratorOptions {
fn default() -> Self {
Self {
dir: PathBuf::from("."),
output_file_path: PathBuf::from("prompt.md"),
file_type_exclusions: DEFAULT_FILE_TYPE_EXCLUSIONS
.iter()
.map(|s| s.to_string())
.collect(),
file_exclusions: DEFAULT_FILE_EXCLUSIONS
.iter()
.map(|s| s.to_string())
.collect(),
verbose: true,
}
}
}
/// Builds a single markdown document from the git-tracked files of a repository.
pub struct MarkdownGenerator {
// Configuration supplied by the caller.
options: MarkdownGeneratorOptions,
// Working copy of the exclusion patterns: starts as `options.file_exclusions` plus the
// generated-file entries, and is extended with patterns read from `.aiignore` files.
file_exclusions: Vec<String>,
// Set once the `.aiignore` files have been loaded, so loading happens only once.
initialized: bool,
// Switches that differ between programmatic and CLI use.
behavior: MarkdownGeneratorBehavior,
}
/// Switches that distinguish programmatic use of the generator from CLI use.
#[derive(Debug, Clone, Copy)]
struct MarkdownGeneratorBehavior {
    /// Append the contents of the `todo` file to the document and treat `todo` as a
    /// generated file (excluded from the scan, listed for `.gitignore`).
    include_todo: bool,
    /// Create an empty `todo` file when none exists.
    create_todo_file: bool,
    /// Add the generated files to the repository's `.gitignore`.
    update_gitignore: bool,
    /// Treat `embeddings.json` as a generated file as well.
    include_embeddings_artifact: bool,
}

impl MarkdownGeneratorBehavior {
    /// Preset for library callers: the todo content is included when present, but no
    /// files other than the output document are created or modified.
    fn programmatic() -> Self {
        Self {
            include_todo: true,
            create_todo_file: false,
            update_gitignore: false,
            include_embeddings_artifact: false,
        }
    }

    /// Preset for the command-line tool: additionally creates the `todo` file and
    /// maintains `.gitignore`; `generate_embeddings` controls the embeddings artifact.
    fn cli(generate_embeddings: bool) -> Self {
        Self {
            create_todo_file: true,
            update_gitignore: true,
            include_embeddings_artifact: generate_embeddings,
            ..Self::programmatic()
        }
    }
}
impl MarkdownGenerator {
/// Creates a generator with the programmatic behavior preset (no `todo` file creation,
/// no `.gitignore` changes).
pub fn new(options: MarkdownGeneratorOptions) -> Self {
    let behavior = MarkdownGeneratorBehavior::programmatic();
    Self::with_behavior(options, behavior)
}
/// Creates a generator with the CLI behavior preset. `generate_embeddings` states whether
/// `embeddings.json` should be treated as a generated artifact.
pub fn new_for_cli(options: MarkdownGeneratorOptions, generate_embeddings: bool) -> Self {
    let behavior = MarkdownGeneratorBehavior::cli(generate_embeddings);
    Self::with_behavior(options, behavior)
}
/// Shared constructor: copies the configured exclusion patterns into the generator's
/// working list and appends the entries for files the generator itself produces.
fn with_behavior(options: MarkdownGeneratorOptions, behavior: MarkdownGeneratorBehavior) -> Self {
    let mut generator = Self {
        // Cloned before `options` is moved into the struct on the next line.
        file_exclusions: options.file_exclusions.clone(),
        options,
        initialized: false,
        behavior,
    };
    Self::add_generated_file_exclusions(
        &generator.options,
        behavior,
        &mut generator.file_exclusions,
    );
    generator
}
/// Appends, without duplicates, the paths of files this tool generates so they are not
/// fed back into the document: the output file (when it can be expressed relative to the
/// repository), `todo`, and `embeddings.json`, depending on `behavior`.
fn add_generated_file_exclusions(
    options: &MarkdownGeneratorOptions,
    behavior: MarkdownGeneratorBehavior,
    file_exclusions: &mut Vec<String>,
) {
    let output_entry = Self::repo_relative_path(&options.dir, &options.output_file_path);
    let todo_entry = if behavior.include_todo {
        Some(String::from("todo"))
    } else {
        None
    };
    let embeddings_entry = if behavior.include_embeddings_artifact {
        Some(String::from("embeddings.json"))
    } else {
        None
    };

    let generated = output_entry
        .into_iter()
        .chain(todo_entry)
        .chain(embeddings_entry);
    for entry in generated {
        Self::push_unique(file_exclusions, entry);
    }
}
/// Appends `value` to `values` unless it is empty or already present.
fn push_unique(values: &mut Vec<String>, value: String) {
    if value.is_empty() || values.contains(&value) {
        return;
    }
    values.push(value);
}
/// Expresses `path` relative to `repo_dir` as a `/`-separated string.
///
/// A relative `path` is used as given. An absolute `path` must lie under `repo_dir`
/// (tried first verbatim, then against the absolute, canonicalized repository directory).
/// Returns `None` when the path is outside the repository, contains components other
/// than plain names and `.`, or has no name components at all.
fn repo_relative_path(repo_dir: &Path, path: &Path) -> Option<String> {
    let candidate: &Path = if !path.is_absolute() {
        path
    } else {
        match path.strip_prefix(repo_dir) {
            Ok(stripped) => stripped,
            Err(_) => {
                let absolute_repo = if repo_dir.is_absolute() {
                    repo_dir.to_path_buf()
                } else {
                    std::env::current_dir().ok()?.join(repo_dir)
                };
                // Fall back to the non-canonical form when canonicalization fails.
                let resolved_repo = match absolute_repo.canonicalize() {
                    Ok(canonical) => canonical,
                    Err(_) => absolute_repo,
                };
                path.strip_prefix(resolved_repo).ok()?
            }
        }
    };

    // `.` components are dropped; anything that is not a plain name aborts with `None`.
    let segments = candidate
        .components()
        .filter(|piece| !matches!(piece, Component::CurDir))
        .map(|piece| match piece {
            Component::Normal(name) => Some(name.to_string_lossy().into_owned()),
            _ => None,
        })
        .collect::<Option<Vec<String>>>()?;

    if segments.is_empty() {
        return None;
    }
    Some(segments.join("/"))
}
/// Reads every `.aiignore` file under the repository directory and adds its patterns to
/// the working exclusion list, then sorts and de-duplicates that list.
///
/// Blank lines and lines starting with `#` are skipped. A pattern from an ignore file in
/// a sub-directory is prefixed with that directory unless it starts with `/` or `**`.
/// Ignore files that cannot be read are silently skipped.
async fn load_nested_ignore_files(&mut self) -> Result<()> {
    let verbose = self.options.verbose;
    if verbose {
        println!("Loading ignore patterns...");
    }

    let mut discovered = Vec::new();
    self.find_ignore_files(&self.options.dir, &mut discovered)?;
    if verbose {
        println!("Found {} ignore files", discovered.len());
    }

    for ignore_path in discovered {
        let text = match fs::read_to_string(&ignore_path).await {
            Ok(text) => text,
            Err(_) => continue,
        };

        // Directory of the ignore file, relative to the repository root.
        let containing_dir = ignore_path.parent().unwrap_or_else(|| Path::new("."));
        let scope = match containing_dir.strip_prefix(&self.options.dir) {
            Ok(scope) => scope,
            Err(_) => continue,
        };
        let at_root = scope.as_os_str().is_empty();

        let scoped_patterns = text
            .lines()
            .map(str::trim)
            .filter(|line| !(line.is_empty() || line.starts_with('#')))
            .map(|line| {
                if at_root || line.starts_with('/') || line.starts_with("**") {
                    line.to_string()
                } else {
                    format!("{}/{}", scope.display(), line)
                }
            });
        self.file_exclusions.extend(scoped_patterns);
    }

    self.file_exclusions.sort();
    self.file_exclusions.dedup();
    if verbose {
        println!("Total exclusion patterns: {}", self.file_exclusions.len());
    }
    Ok(())
}
/// Walks `dir` and appends the path of every entry named `.aiignore` to `results`.
/// Entries the walker fails to read are skipped.
fn find_ignore_files(&self, dir: &Path, results: &mut Vec<PathBuf>) -> Result<()> {
    use walkdir::WalkDir;

    let ignore_paths = WalkDir::new(dir)
        .into_iter()
        .filter_map(Result::ok)
        .filter(|entry| entry.file_name() == ".aiignore")
        .map(|entry| entry.path().to_path_buf());
    results.extend(ignore_paths);
    Ok(())
}
/// Loads the `.aiignore` patterns the first time it is called; later calls do nothing.
async fn initialize(&mut self) -> Result<()> {
    if self.initialized {
        return Ok(());
    }
    self.load_nested_ignore_files().await?;
    self.initialized = true;
    Ok(())
}
/// Lists the git-tracked files of the repository that survive all exclusions.
///
/// Runs `git ls-files -z` in the repository directory, then drops every path whose
/// extension is in `file_type_exclusions` or that matches one of the exclusion patterns.
/// Returned paths are relative to the repository directory.
///
/// # Errors
/// Fails when `git` cannot be started or exits unsuccessfully, or when loading the
/// ignore files fails.
async fn get_tracked_files(&mut self) -> Result<Vec<String>> {
    self.initialize().await?;

    // `-z` makes git emit NUL-terminated, unquoted paths. Without it, paths containing
    // non-ASCII or special characters come back C-quoted and cannot be opened as-is.
    let output = Command::new("git")
        .arg("ls-files")
        .arg("-z")
        .current_dir(&self.options.dir)
        .output()
        .map_err(|e| anyhow!("Failed to execute git ls-files: {}", e))?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let detail = stderr.trim();
        return Err(if detail.is_empty() {
            anyhow!("git ls-files failed")
        } else {
            anyhow!("git ls-files failed: {}", detail)
        });
    }

    // Lossy decoding keeps one oddly-encoded file name from aborting the whole run; such
    // a file simply fails to read later on.
    let listing = String::from_utf8_lossy(&output.stdout);
    let tracked_files: Vec<String> = listing
        .split('\0')
        .filter(|entry| !entry.trim().is_empty())
        .map(str::to_owned)
        .collect();
    if self.options.verbose {
        println!("Total tracked files: {}", tracked_files.len());
    }

    let total_files = tracked_files.len();
    let filtered_files = tracked_files
        .into_iter()
        .filter(|file| {
            let ext = Path::new(file)
                .extension()
                .and_then(|e| e.to_str())
                .map(|e| format!(".{}", e))
                .unwrap_or_default();
            // Compare case-insensitively (`photo.PNG` is still a PNG) while still
            // honouring entries a caller may have configured verbatim.
            let excluded_by_type = self.options.file_type_exclusions.contains(&ext)
                || self
                    .options
                    .file_type_exclusions
                    .contains(&ext.to_lowercase());
            !excluded_by_type && !self.matches_exclusion_patterns(file)
        })
        .collect::<Vec<_>>();

    if self.options.verbose {
        println!("Excluded files: {}", total_files - filtered_files.len());
        println!(
            "Files to process after exclusions: {}",
            filtered_files.len()
        );
    }
    Ok(filtered_files)
}
/// Returns `true` when `file` matches at least one pattern of the working exclusion list.
fn matches_exclusion_patterns(&self, file: &str) -> bool {
    self.file_exclusions
        .iter()
        .any(|pattern| self.glob_match(pattern, file))
}
fn glob_match(&self, pattern: &str, path: &str) -> bool {
let pattern = pattern
.replace("**", ".*")
.replace("*", "[^/]*")
.replace("?", "[^/]");
let pattern = format!("^{}$", pattern);
if let Ok(re) = Regex::new(&pattern) {
re.is_match(path)
} else {
false
}
}
/// Returns `true` when `ext` (lowercase, with a leading dot) is one of the extensions
/// listed in `OCR_FILE_TYPES`.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
fn is_ocr_file(ext: &str) -> bool {
    OCR_FILE_TYPES.iter().any(|candidate| *candidate == ext)
}
/// Reads one file and returns its cleaned, redacted text with trailing whitespace removed.
///
/// On macOS with the `embeddings` feature, files whose lowercase extension is an OCR type
/// are delegated to `read_file_content_ocr` instead. In verbose mode the token count of
/// non-empty results is printed.
async fn read_file_content(&self, file_path: &Path) -> Result<String> {
    #[cfg(all(target_os = "macos", feature = "embeddings"))]
    {
        let dotted_ext = match file_path.extension().and_then(|e| e.to_str()) {
            Some(raw) => format!(".{}", raw.to_lowercase()),
            None => String::new(),
        };
        if Self::is_ocr_file(&dotted_ext) {
            return self.read_file_content_ocr(file_path).await;
        }
    }

    let raw_text = fs::read_to_string(file_path).await?;
    let redacted = clean_and_redact(&raw_text);
    if self.options.verbose && !redacted.is_empty() {
        println!(
            "{}: Tokens[{}]",
            file_path.display(),
            count_tokens(&redacted)
        );
    }
    Ok(redacted.trim_end().to_string())
}
/// Runs the Apple OCR engine on `file_path` and returns the recognized text with trailing
/// whitespace removed. In verbose mode the token count of non-empty results is printed.
///
/// # Errors
/// Returns an error naming the file when recognition fails.
#[cfg(all(target_os = "macos", feature = "embeddings"))]
async fn read_file_content_ocr(&self, file_path: &Path) -> Result<String> {
    use toak_ocr::{AppleOcrEngine, OcrEngine, OcrInput};

    let engine = AppleOcrEngine::new();
    let source = OcrInput::FilePath(file_path.to_path_buf());
    let recognized = match engine.recognize(&source).await {
        Ok(recognized) => recognized,
        Err(e) => return Err(anyhow!("OCR failed for {}: {}", file_path.display(), e)),
    };

    let text = &recognized.text;
    if self.options.verbose && !text.is_empty() {
        println!(
            "{}: Tokens[{}] (OCR)",
            file_path.display(),
            count_tokens(text)
        );
    }
    Ok(text.trim_end().to_string())
}
/// Builds the "Project Files" markdown: one `## <path>` section with a `~~~` fenced body
/// per tracked file that has non-empty content. Files that cannot be read are skipped
/// (and reported on stderr in verbose mode).
async fn generate_markdown(&mut self) -> Result<String> {
    let files = self.get_tracked_files().await?;
    let verbose = self.options.verbose;
    if verbose {
        println!("Generating markdown for {} files", files.len());
    }

    let mut document = String::from("# Project Files\n\n");
    for relative in &files {
        let absolute = self.options.dir.join(relative);
        let body = match self.read_file_content(&absolute).await {
            Ok(body) => body,
            Err(err) => {
                if verbose {
                    eprintln!("Error reading file {}: {}", relative, err);
                }
                continue;
            }
        };

        let trimmed = body.trim();
        if trimmed.is_empty() {
            if verbose {
                println!("Skipping {} as it has no content after cleaning.", relative);
            }
            continue;
        }
        document.push_str(&format!("## {}\n~~~\n{}\n~~~\n\n", relative, trimmed));
    }
    Ok(document)
}
async fn get_todo(&self) -> Result<Option<String>> {
let todo_path = self.options.dir.join("todo");
if self.options.verbose {
println!("Reading todo file");
}
match fs::read_to_string(&todo_path).await {
Ok(content) => Ok(Some(content)),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
if !self.behavior.create_todo_file {
return Ok(None);
}
if self.options.verbose {
println!("File not found, creating a new 'todo' file.");
}
fs::write(&todo_path, "").await?;
Ok(Some(String::new()))
}
Err(e) => Err(anyhow!("Error reading todo file: {}", e)),
}
}
/// Lists the generated files that belong in `.gitignore`: the output document (when it can
/// be expressed relative to the repository), `todo`, and `embeddings.json`, depending on
/// the active behavior. The list contains no duplicates.
fn gitignore_entries(&self) -> Vec<String> {
    let output_entry =
        Self::repo_relative_path(&self.options.dir, &self.options.output_file_path);
    let todo_entry = if self.behavior.include_todo {
        Some(String::from("todo"))
    } else {
        None
    };
    let embeddings_entry = if self.behavior.include_embeddings_artifact {
        Some(String::from("embeddings.json"))
    } else {
        None
    };

    let mut collected = Vec::new();
    for candidate in output_entry
        .into_iter()
        .chain(todo_entry)
        .chain(embeddings_entry)
    {
        Self::push_unique(&mut collected, candidate);
    }
    collected
}
async fn update_gitignore(&self) -> Result<()> {
let entries = self.gitignore_entries();
if entries.is_empty() {
return Ok(());
}
let gitignore_path = self.options.dir.join(".gitignore");
let content = match fs::read_to_string(&gitignore_path).await {
Ok(c) => c,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
if self.options.verbose {
println!("File not found, creating a '.gitignore' file.");
}
String::new()
}
Err(e) => return Err(anyhow!("Error reading .gitignore: {}", e)),
};
let lines: Vec<&str> = content.lines().map(|l| l.trim()).collect();
let missing_entries = entries
.into_iter()
.filter(|entry| !lines.contains(&entry.as_str()))
.collect::<Vec<_>>();
if !missing_entries.is_empty() {
if self.options.verbose {
println!("Updating .gitignore with generated files");
}
let mut new_content = content;
if !new_content.is_empty() && !new_content.ends_with('\n') {
new_content.push('\n');
}
for entry in missing_entries {
new_content.push_str(&entry);
new_content.push('\n');
}
fs::write(&gitignore_path, new_content).await?;
}
Ok(())
}
/// Generates the markdown document and writes it to `output_file_path`.
///
/// The document consists of the per-file sections, followed by the contents of the `todo`
/// file after a `---` separator when the behavior includes it and a todo is available.
/// When the behavior asks for it, `.gitignore` is updated before the document is written.
///
/// Returns a `MarkdownResult` carrying the token count of the written document.
///
/// # Errors
/// Propagates failures from listing files, reading or creating the todo file, updating
/// `.gitignore`, and writing the output file.
pub async fn create_markdown_document(&mut self) -> Result<MarkdownResult> {
    let code_markdown = self.generate_markdown().await?;
    let markdown = if self.behavior.include_todo {
        match self.get_todo().await? {
            Some(todos) => format!("{}\n---\n\n{}\n", code_markdown, todos),
            None => code_markdown,
        }
    } else {
        code_markdown
    };

    if self.behavior.update_gitignore {
        self.update_gitignore().await?;
    }

    let token_count = count_tokens(&markdown);
    // Write first, so the success message below is only printed once the file exists.
    fs::write(&self.options.output_file_path, &markdown).await?;
    if self.options.verbose {
        println!(
            "Markdown document created at {}",
            self.options.output_file_path.display()
        );
        println!("{{ \"total_tokens\": {} }}", token_count);
    }

    Ok(MarkdownResult {
        success: true,
        token_count: Some(token_count),
        error: None,
    })
}
}
/// Outcome reported by `MarkdownGenerator::create_markdown_document`.
#[derive(Debug, Clone)]
// Not every field is read inside this file, hence the lint suppression.
#[allow(dead_code)]
pub struct MarkdownResult {
/// Whether the document was produced; `create_markdown_document` sets this to `true`.
pub success: bool,
/// Token count of the generated document, when available.
pub token_count: Option<usize>,
/// Error description; `create_markdown_document` sets this to `None`, since it reports
/// failures through its `Result` instead.
pub error: Option<String>,
}