1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
// use std::io::Write;
// use std::fs::File;
// use git2::{Commit, DiffFormat, DiffOptions, Repository};
// use anyhow::{Context, Result};
// use serde_json::json;
// /// Builds the instruction prompt used for commit-message generation.
// ///
// /// Reads `config::APP.language` and `config::APP.max_length` and interpolates both
// /// into a fixed multi-line instruction text, returned as an owned `String`.
// ///
// /// NOTE(review): nothing in the visible part of this file calls `prompt()`; `main`
// /// embeds its own hard-coded system message instead. Confirm whether this is still needed.
// /// NOTE(review): no `use` line for `config` is visible here; presumably it is a
// /// crate-level module. Verify before re-enabling.
// fn prompt() -> String {
// let language = config::APP.language;
// let max_length = config::APP.max_length;
// // The literal below spans several lines; its line breaks are part of the returned text,
// // so no comment may be inserted between its opening and closing quotes.
// format!("Create concise and meaningful git commit messages based on diffs, incorporating these practices:
// - Language: {language}.
// - Maximum Length: {max_length} characters for the summary.
// - Structure: Begin with a clear summary. Use present tense.
// - Clarity and Relevance: Focus on detailing the changes and their reasons. Exclude irrelevant details.
// - Consistency: Maintain a consistent style of tense, punctuation, and capitalization.
// - Review: Ensure the commit message accurately reflects the changes made and their purpose without leaving the description blank.
// Refer to examples.jsonl for examples of how commit messages can be mapped to git diffs")
// }
// /// Walks the history of the repository in the current directory starting at HEAD and
// /// writes chat-style JSON records (one per selected commit) to `train.jsonl` and
// /// `validate.jsonl`.
// ///
// /// A commit is selected only if it has at most one parent, `generate_commit_diff`
// /// returns `Ok(Some(_))` for it, and its message passes the filters below. Collection
// /// stops after `max_commits` records or once the running word count exceeds `max_tokens`.
// /// The first half of the collected records goes to the train file, the rest to the
// /// validate file.
// ///
// /// # Errors
// /// Returns an error if the repository cannot be opened, the revwalk or either output
// /// file cannot be created, an oid or commit lookup fails, or serialization/writing fails.
// ///
// /// # Panics
// /// Panics if `revwalk.push_head()` fails (it uses `expect`, unlike the other calls).
// fn main() -> Result<()> {
// env_logger::init();
// // Budget compared against the accumulated word count of the serialized records.
// let max_tokens = 16385;
// let validate_file_name = "validate.jsonl";
// let train_file_name = "train.jsonl";
// // Upper bound on the number of records collected across both files.
// let max_commits = 20;
// log::info!("Creating fine-tune file with {} commits and {} tokens", max_commits, max_tokens);
// let repo = Repository::open(".").context("Failed to open git repository")?;
// // let config = repo.config().context("Couldn't access repository config")?;
// // let user_email = config.get_string("user.email").context("Couldn't get user email")?;
// let mut revwalk = repo.revwalk().context("Failed to create Revwalk")?;
// let mut validate_file = File::create(validate_file_name).context("Failed to create file")?;
// let mut train_file = File::create(train_file_name).context("Failed to create file")?;
// // These two writes pass an empty byte slice, so they add nothing to the files
// // that `File::create` has just created or truncated.
// validate_file.write_all(b"").context("Failed to write to file")?;
// train_file.write_all(b"").context("Failed to write to file")?;
// // NOTE(review): this is the only fallible call in `main` that panics instead of using
// // `context(..)?`; consider aligning it with the rest.
// revwalk.push_head().expect("Failed to push head");
// // Running total of whitespace-separated words over all serialized records so far.
// let mut curr_size = 0;
// // Number of records pushed into `result`; the two are always incremented together.
// let mut commit_count = 0;
// let mut result = vec![];
// for oid in revwalk {
// let oid = oid.context("Failed to get oid")?;
// let commit = repo.find_commit(oid).context("Couldn't find commit")?;
// // Skip commits with more than one parent (merge commits).
// if commit.parent_count() > 1 {
// continue;
// }
// // let weight = if commit.author().email() == Some(&user_email) {
// // 1
// // } else if commit.committer().email() == Some(&user_email) {
// // 1
// // } else {
// // 0
// // };
// // Both `Err(_)` and `Ok(None)` fall into the `else` branch, so diff failures are
// // skipped silently rather than reported.
// let Ok(Some(content)) = generate_commit_diff(&repo, &commit) else {
// continue;
// };
// // From here on `commit` is shadowed by the commit's message text.
// let Some(commit) = commit.message() else {
// continue;
// };
// if commit.starts_with("Merge") {
// continue;
// }
// if commit.starts_with("Revert") {
// continue;
// }
// // `len()` is the byte length of the untrimmed message, so any trailing newline
// // counts towards the 72 limit.
// if commit.len() > 72 {
// continue;
// }
// // Check if it contains a new line
// if commit.trim().contains("\n") {
// continue;
// }
// // Skip messages containing both '[' and ']' anywhere (not necessarily as a pair).
// if commit.contains("[") && commit.contains("]") {
// continue;
// }
// // NOTE(review): the roles are emitted in the order assistant, user, system. Chat
// // fine-tuning examples are usually ordered system, user, assistant. Confirm against
// // the consumer of these files before re-enabling.
// let message = json!({
// "messages": [
// { "role": "assistant", "content": commit.trim() },
// { "role": "user", "content": content.trim() },
// { "role": "system", "content": "Convert from git patch into git commit message" }
// ]
// });
// let content = serde_json::to_string(&message)?;
// // Whitespace-separated words are used as the size measure compared to `max_tokens`.
// curr_size += content.split_whitespace().count();
// if curr_size > max_tokens {
// log::warn!("Max tokens reached: {}", max_tokens);
// break;
// }
// commit_count += 1;
// result.push(message);
// if commit_count >= max_commits {
// break;
// }
// }
// // First half (integer division) of the collected records, in collection order.
// let train_result = result[..(result.len() / 2)].to_vec();
// for (i, message) in train_result.iter().enumerate() {
// let content = serde_json::to_string(&message)?;
// // A newline is written between records only, so the file has no trailing newline.
// if i > 0 {
// train_file.write_all(b"\n").context("Failed to write to file")?;
// }
// train_file.write_all(content.as_bytes()).context("Failed to write to file")?;
// }
// // Remaining records; with an odd total this slice is one longer than the train slice.
// let validate_result = result[(result.len() / 2)..].to_vec();
// for (i, message) in validate_result.iter().enumerate() {
// let content = serde_json::to_string(&message)?;
// if i > 0 {
// validate_file.write_all(b"\n").context("Failed to write to file")?;
// }
// validate_file.write_all(content.as_bytes()).context("Failed to write to file")?;
// }
// // NOTE(review): both counts are logged as `commit_count / 2`; for an odd number of
// // records the validate file actually holds one more than reported.
// log::info!(
// "Wrote {} commits to train file and {} commits to validate file",
// commit_count / 2,
// commit_count / 2
// );
// Ok(())
// }
// /// Returns `true` when `file_path` contains any of the hard-coded exclusion patterns
// /// as a substring.
// ///
// /// Matching is plain `str::contains`, so a pattern matches wherever it occurs in the path.
// ///
// /// NOTE(review): most patterns begin with `/`. If callers pass repository-relative paths
// /// without a leading slash (as git delta paths typically are; confirm), top-level
// /// entries such as `docs/...` or `README.md` will not match.
// /// NOTE(review): with substring matching, `"Gemfile.lock"` is already covered by
// /// `"Gemfile"`, and `"/.env.example"` by `"/.env"`.
// fn should_exclude_path(file_path: &str) -> bool {
// let exclude_patterns = vec![
// "/docs/", "/documentation/", "/guides/", // Documentation
// "/assets/", "/images/", "/graphics/", "/designs/", // Assets and design-related files
// "Gemfile", "Gemfile.lock", // Dependency files
// "/config/", "/settings/", "/initializers/", // Configuration files
// "/vendor/", "/third-party/", "/external/", // Third-party and vendor code
// "/submodules/", // Git submodules
// "/.github/", "/.gitignore", "/.gitmodules",
// "/.gitattributes", // Git and GitHub specific files
// "/.gitlab-ci.yml", "/.travis.yml", "/appveyor.yml", // CI/CD configuration files
// "/Dockerfile", "/docker-compose.yml", "/.dockerignore", // Docker files
// "/.editorconfig", "/.rubocop.yml", "/.eslintignore",
// "/.eslintrc", // Linter and editor configuration
// "/test/", "/spec/", "/tests/", "/specs/", // Test files and directories
// "/locales/", "/i18n/", // Localization files
// "/logs/", "/tmp/", // Logs and temporary files
// "/public/", // Public assets
// "/node_modules/", "/package.json", "/yarn.lock", // Node.js specific files
// "/.env", "/.env.example", // Environment files
// "/db/schema.rb", "/db/migrate/", // Database schema and migrations
// "/scripts/", "/tools/", // Utility scripts and tools
// "/CHANGELOG", "/LICENSE", "/README.md", // Project meta-files
// ];
// // True as soon as any single pattern occurs in the path.
// exclude_patterns.iter().any(|pattern| file_path.contains(pattern))
// }
// /// Produces the textual patch between `commit` and its first parent.
// ///
// /// Returns `Ok(Some(patch))` when the collected patch text has at most 600
// /// whitespace-separated words, and `Ok(None)` when it has more.
// ///
// /// # Errors
// /// Returns an error if the tree-to-tree diff cannot be computed, if `diff.print` fails,
// /// or if the collected bytes are not valid UTF-8.
// ///
// /// # Panics
// /// Panics if either tree cannot be loaded (`expect`), if a delta has neither a new nor
// /// an old file path, or if the path is not valid UTF-8 (`unwrap`).
// fn generate_commit_diff(repo: &Repository, commit: &Commit) -> Result<Option<String>> {
// // NOTE(review): a commit without parents falls back to a clone of itself, so its tree
// // is diffed against itself. Presumably that yields an empty patch, which is returned
// // as `Ok(Some(""))`. Confirm this is intended for root commits.
// let parent = commit.parents().next().unwrap_or_else(|| commit.clone());
// let tree = commit.tree().expect("Couldn't get commit tree");
// let parent_tree = parent.tree().expect("Couldn't get parent tree");
// // Diff options: whitespace and blank-line changes ignored, no context or inter-hunk
// // lines, submodules/untracked/ignored entries left out, patience + minimal enabled.
// let mut opts = DiffOptions::new();
// opts
// .ignore_whitespace_change(true)
// .recurse_untracked_dirs(false)
// .recurse_ignored_dirs(false)
// .ignore_whitespace_eol(true)
// .ignore_blank_lines(true)
// .include_untracked(false)
// .ignore_whitespace(true)
// .indent_heuristic(false)
// .ignore_submodules(true)
// .include_ignored(false)
// .interhunk_lines(0)
// .context_lines(0)
// .patience(true)
// .minimal(true);
// let diff = repo
// .diff_tree_to_tree(Some(&parent_tree), Some(&tree), Some(&mut opts))
// .context("Failed to get diff")?;
// // Raw bytes of every accepted diff line, in callback order.
// let mut patch: Vec<u8> = Vec::new();
// #[rustfmt::skip]
// // NOTE(review): per the git2 documentation, returning `false` from this callback is
// // believed to stop the iteration and make `print` return an error, not skip one line.
// // If so, one binary line or excluded path fails the whole call via the `?` below
// // (verify against the git2 `Diff::print` docs).
// diff.print(DiffFormat::Patch, |delta, _, line| {
// // Ignore if line is a binary file
// if line.origin() == 'B' {
// return false;
// }
// // Prefer the new-side path; fall back to the old-side path when there is none.
// let file_path = delta.new_file().path().unwrap_or_else(|| delta.old_file().path().unwrap());
// if should_exclude_path(file_path.to_str().unwrap()) {
// return false;
// }
// // Only the line content is stored; `line.origin()` is never written to `patch`.
// let content = line.content();
// patch.extend_from_slice(content);
// true
// }).context("Failed to print diff")?;
// let content = String::from_utf8(patch).context("Failed to convert patch to string")?;
// // Patches with more than 600 whitespace-separated words are rejected as `None`.
// if content.split_whitespace().count() > 600 { Ok(None) } else { Ok(Some(content)) }
// }