use std::io::Write;
use std::fs::File;
use git2::{Commit, DiffFormat, DiffOptions, Repository};
use anyhow::{Context, Result};
/// Builds `fine-tune.md`: a Markdown file pairing commit diffs with their commit messages.
///
/// Walks the history reachable from `HEAD` of the repository in the current directory and
/// appends one example per accepted commit until either `max_commits` examples have been
/// written or the running whitespace-separated word count would exceed `max_tokens`.
///
/// A commit is skipped when it has more than one parent, when its message is not valid
/// UTF-8, starts with `Merge` or `Revert`, is longer than 72 bytes, spans several lines,
/// contains both `[` and `]`, or when `generate_commit_diff` yields no usable diff.
///
/// # Errors
///
/// Returns an error if the repository cannot be opened or walked, if a commit cannot be
/// looked up, or if the output file cannot be created or written.
fn main() -> Result<()> {
    env_logger::init();

    let max_tokens = 49999;
    let file_name = "fine-tune.md";
    let max_commits = 100;

    log::info!("Creating fine-tune file with {} commits and {} tokens", max_commits, max_tokens);

    let repo = Repository::open(".").context("Failed to open git repository")?;
    let mut revwalk = repo.revwalk().context("Failed to create Revwalk")?;
    let mut file = File::create(file_name).context("Failed to create file")?;

    file
        .write_all(b"# Examples\n\nExamples of best practices for writing git commit messages:\n\n")
        .context("Failed to write to file")?;

    // An unborn or unreadable HEAD is an ordinary runtime failure, not a bug: report it.
    revwalk.push_head().context("Failed to push head")?;

    let mut curr_size = 0;
    let mut commit_count = 0;

    for oid in revwalk {
        let oid = oid.context("Failed to get oid")?;
        let commit = repo.find_commit(oid).context("Couldn't find commit")?;

        if commit.parent_count() > 1 {
            continue;
        }

        // Run the cheap message filters first so rejected commits never pay for a diff.
        let Some(commit_message) = commit.message() else {
            continue;
        };

        if commit_message.starts_with("Merge") || commit_message.starts_with("Revert") {
            continue;
        }

        if commit_message.len() > 72 {
            continue;
        }

        // Only single-line messages are kept; a trailing newline is tolerated.
        if commit_message.trim().contains('\n') {
            continue;
        }

        if commit_message.contains('[') && commit_message.contains(']') {
            continue;
        }

        let Ok(Some(content)) = generate_commit_diff(&repo, &commit) else {
            continue;
        };

        let message = format!(
            "## Example {}\n\n### GIT DIFF:\n\n{}\n### COMMIT MESSAGE:\n\n{}\n",
            commit_count, content, commit_message
        );

        curr_size += message.split_whitespace().count();
        if curr_size > max_tokens {
            log::warn!("Max tokens reached: {}", max_tokens);
            break;
        }

        // Write before counting so `commit_count` always equals the number of examples
        // actually present in the file, including the one that reaches the limit.
        file.write_all(message.as_bytes()).context("Failed to write to file")?;

        commit_count += 1;
        if commit_count >= max_commits {
            break;
        }
    }

    log::info!("Wrote {} commits to {}", commit_count, file_name);

    Ok(())
}
/// Reports whether `file_path` matches one of the excluded path patterns.
///
/// A pattern matches when it occurs anywhere in `file_path`. Patterns written with a
/// leading `/` additionally match at the very start of a path that has no leading slash,
/// so repository-relative paths such as `docs/intro.md` or `README.md` are recognised
/// just like `sub/docs/intro.md`.
fn should_exclude_path(file_path: &str) -> bool {
    // A constant slice avoids rebuilding a heap-allocated list on every call.
    const EXCLUDE_PATTERNS: &[&str] = &[
        "/docs/", "/documentation/", "/guides/", "/assets/", "/images/", "/graphics/", "/designs/", "Gemfile", "Gemfile.lock", "/config/", "/settings/", "/initializers/", "/vendor/", "/third-party/", "/external/", "/submodules/", "/.github/", "/.gitignore", "/.gitmodules",
        "/.gitattributes", "/.gitlab-ci.yml", "/.travis.yml", "/appveyor.yml", "/Dockerfile", "/docker-compose.yml", "/.dockerignore", "/.editorconfig", "/.rubocop.yml", "/.eslintignore",
        "/.eslintrc", "/test/", "/spec/", "/tests/", "/specs/", "/locales/", "/i18n/", "/logs/", "/tmp/", "/public/", "/node_modules/", "/package.json", "/yarn.lock", "/.env", "/.env.example", "/db/schema.rb", "/db/migrate/", "/scripts/", "/tools/", "/CHANGELOG", "/LICENSE", "/README.md",
    ];

    EXCLUDE_PATTERNS.iter().any(|pattern| {
        if file_path.contains(pattern) {
            return true;
        }
        // The pattern's leading slash stands for "start of a path component"; the first
        // component of a relative path has no slash in front of it.
        pattern
            .strip_prefix('/')
            .map_or(false, |rest| file_path.starts_with(rest))
    })
}
/// Renders the patch between `commit` and its first parent as text.
///
/// Whitespace-only changes are ignored and no context lines are emitted. A commit without
/// parents is diffed against the empty tree.
///
/// Returns `Ok(None)` when the commit is not usable as an example: it contains a binary
/// line, touches a path rejected by `should_exclude_path`, or its patch has more than
/// 600 whitespace-separated words. Otherwise returns `Ok(Some(patch))`.
///
/// # Errors
///
/// Returns an error if a tree cannot be loaded, the diff cannot be computed or printed,
/// or the resulting patch is not valid UTF-8.
fn generate_commit_diff(repo: &Repository, commit: &Commit) -> Result<Option<String>> {
    let tree = commit.tree().context("Couldn't get commit tree")?;

    // A root commit has no parent. Passing no old tree compares against the empty tree,
    // whereas diffing the commit against itself would always produce an empty patch.
    let parent_tree = match commit.parents().next() {
        Some(parent) => Some(parent.tree().context("Couldn't get parent tree")?),
        None => None,
    };

    let mut opts = DiffOptions::new();
    opts
        .ignore_whitespace_change(true)
        .recurse_untracked_dirs(false)
        .recurse_ignored_dirs(false)
        .ignore_whitespace_eol(true)
        .ignore_blank_lines(true)
        .include_untracked(false)
        .ignore_whitespace(true)
        .indent_heuristic(false)
        .ignore_submodules(true)
        .include_ignored(false)
        .interhunk_lines(0)
        .context_lines(0)
        .patience(true)
        .minimal(true);

    let diff = repo
        .diff_tree_to_tree(parent_tree.as_ref(), Some(&tree), Some(&mut opts))
        .context("Failed to get diff")?;

    let mut patch: Vec<u8> = Vec::new();
    // Returning `false` from the callback makes git2 abort printing and report an error.
    // Remember when the abort was our own decision so it becomes `Ok(None)`, not `Err`.
    let mut rejected = false;

    let printed = diff.print(DiffFormat::Patch, |delta, _, line| {
        let origin = line.origin();

        if origin == 'B' {
            rejected = true;
            return false;
        }

        // Prefer the new path and fall back to the old one. Lossy conversion keeps
        // non-UTF-8 paths from panicking.
        let excluded = delta
            .new_file()
            .path()
            .or_else(|| delta.old_file().path())
            .map_or(false, |path| should_exclude_path(&path.to_string_lossy()));
        if excluded {
            rejected = true;
            return false;
        }

        // The content of added, removed and context lines does not carry its marker;
        // without it additions and deletions are indistinguishable in the output.
        if matches!(origin, '+' | '-' | ' ') {
            patch.push(origin as u8);
        }
        patch.extend_from_slice(line.content());
        true
    });

    if rejected {
        return Ok(None);
    }
    printed.context("Failed to print diff")?;

    let content = String::from_utf8(patch).context("Failed to convert patch to string")?;

    // Oversized patches are dropped to keep each example small.
    if content.split_whitespace().count() > 600 {
        Ok(None)
    } else {
        Ok(Some(content))
    }
}