use std::path::Path;
use git2::{ObjectType, Repository};
/// Largest on-disk size, in bytes, that `tracked_files` will read; larger files are skipped.
pub const MAX_FILE_BYTES: u64 = 1_048_576;
/// Maximum number of lines `chunk_file` places in a single chunk.
pub const CHUNK_LINES: usize = 40;
/// Number of lines shared by consecutive chunks: `chunk_file` advances by
/// `CHUNK_LINES - CHUNK_OVERLAP` lines (at least one) between windows.
pub const CHUNK_OVERLAP: usize = 10;
/// A window of consecutive lines taken from one file, together with the
/// location header that is indexed alongside it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Path of the file the lines came from.
    pub path: String,
    /// Number of the first line in the window (1-based as produced by `chunk_file`).
    pub start_line: u32,
    /// Number of the last line in the window (inclusive as produced by `chunk_file`).
    pub end_line: u32,
    /// The window's text.
    pub content: String,
    /// Location header placed in front of `content` by [`Chunk::index_text`].
    pub prefix: String,
}

impl Chunk {
    /// Returns the text to index: `prefix`, a single newline, then `content`.
    #[must_use]
    pub fn index_text(&self) -> String {
        [self.prefix.as_str(), self.content.as_str()].join("\n")
    }
}
/// A file listed in the repository index whose working-tree contents were
/// successfully read as UTF-8 text by `tracked_files`.
#[derive(Debug, Clone)]
pub struct TrackedFile {
/// Path as stored in the index entry, with any `\` replaced by `/`.
pub path: String,
/// Git blob object id of the file contents as a string (see `blob_hash`),
/// or `len:<byte count>` if hashing failed.
pub content_hash: String,
/// Full file contents decoded as UTF-8.
pub text: String,
}
pub fn tracked_files(repo: &Repository, repo_path: &Path) -> Result<Vec<TrackedFile>, git2::Error> {
let index = repo.index()?;
let mut out = Vec::new();
for i in 0..index.len() {
let Some(entry) = index.get(i) else {
continue;
};
let Ok(rel) = std::str::from_utf8(&entry.path) else {
continue; };
let rel = rel.replace('\\', "/");
let abs = repo_path.join(&rel);
let Ok(meta) = std::fs::metadata(&abs) else {
continue; };
if !meta.is_file() || meta.len() > MAX_FILE_BYTES {
continue;
}
let Ok(bytes) = std::fs::read(&abs) else {
continue;
};
let Ok(text) = String::from_utf8(bytes) else {
continue; };
let content_hash = blob_hash(text.as_bytes());
out.push(TrackedFile {
path: rel,
content_hash,
text,
});
}
Ok(out)
}
/// Computes the git blob object id of `bytes` and renders it as a string.
///
/// If the hash cannot be computed, falls back to `len:<byte count>` so the
/// caller still receives a non-empty value.
fn blob_hash(bytes: &[u8]) -> String {
    match git2::Oid::hash_object(ObjectType::Blob, bytes) {
        Ok(oid) => oid.to_string(),
        Err(_) => format!("len:{}", bytes.len()),
    }
}
/// Splits `text` into overlapping line windows, each labelled with a location header.
///
/// Windows hold at most `CHUNK_LINES` lines and begin every
/// `CHUNK_LINES - CHUNK_OVERLAP` lines (never fewer than one). A window whose
/// text is only whitespace is dropped, and a file with no non-blank line
/// yields no chunks at all. `start_line` and `end_line` are 1-based and
/// inclusive, saturating at `u32::MAX`. `repo` and `path` are copied verbatim
/// into each chunk's `prefix`.
#[must_use]
pub fn chunk_file(repo: &str, path: &str, text: &str) -> Vec<Chunk> {
    let lines: Vec<&str> = text.lines().collect();
    let total = lines.len();
    if lines.iter().map(|line| line.trim()).all(str::is_empty) {
        return Vec::new();
    }
    let step = CHUNK_LINES.saturating_sub(CHUNK_OVERLAP).max(1);
    let mut chunks = Vec::new();
    for first in (0..total).step_by(step) {
        let last = total.min(first + CHUNK_LINES);
        let body = lines[first..last].join("\n");
        if !body.trim().is_empty() {
            let start_line = u32::try_from(first + 1).unwrap_or(u32::MAX);
            let end_line = u32::try_from(last).unwrap_or(u32::MAX);
            chunks.push(Chunk {
                path: path.to_owned(),
                start_line,
                end_line,
                content: body,
                prefix: format!("{repo} › {path} › L{start_line}-{end_line}"),
            });
        }
        // Once a window reaches the final line, later windows would only
        // repeat lines already covered, so stop here.
        if last == total {
            break;
        }
    }
    chunks
}
// Unit tests for index listing, blob hashing and line chunking.
#[cfg(test)]
mod tests {
// Test-only lint relaxations: fixtures unwrap freely, size/line constants are
// cast between integer widths, and one test builds its input by collecting
// `format!` results into a `String`.
#![allow(
clippy::unwrap_used,
clippy::cast_possible_truncation,
clippy::format_collect
)]
use super::*;
use std::path::PathBuf;
use tempfile::TempDir;
// Creates a repository in subdirectory `r` of a fresh temp dir, writes each
// `(relative path, body)` pair into it, then stages everything matching `*`
// and writes the index. The `TempDir` is returned alongside the repository
// path so the caller controls the temp directory's lifetime.
fn init_repo_with(files: &[(&str, &str)]) -> (TempDir, PathBuf) {
let tmp = TempDir::new().unwrap();
let dir = tmp.path().join("r");
std::fs::create_dir_all(&dir).unwrap();
let repo = git2::Repository::init(&dir).unwrap();
for (rel, body) in files {
let abs = dir.join(rel);
if let Some(parent) = abs.parent() {
std::fs::create_dir_all(parent).unwrap();
}
std::fs::write(&abs, body).unwrap();
}
let mut index = repo.index().unwrap();
index
.add_all(["*"], git2::IndexAddOption::DEFAULT, None)
.unwrap();
index.write().unwrap();
(tmp, dir)
}
// Staged text files are returned with their index paths and a non-empty hash.
#[test]
fn tracked_files_returns_added_text_files() {
let (_tmp, dir) = init_repo_with(&[("src/a.rs", "fn a() {}\n"), ("README.md", "# hi\n")]);
let repo = git2::Repository::open(&dir).unwrap();
let mut files = tracked_files(&repo, &dir).unwrap();
files.sort_by(|a, b| a.path.cmp(&b.path));
let paths: Vec<&str> = files.iter().map(|f| f.path.as_str()).collect();
assert_eq!(paths, vec!["README.md", "src/a.rs"]);
assert!(files.iter().all(|f| !f.content_hash.is_empty()));
}
// A file written after staging is not in the index and must not be listed.
#[test]
fn tracked_files_excludes_untracked() {
let (_tmp, dir) = init_repo_with(&[("tracked.rs", "fn t() {}\n")]);
std::fs::write(dir.join("untracked.rs"), "fn u() {}\n").unwrap();
let repo = git2::Repository::open(&dir).unwrap();
let files = tracked_files(&repo, &dir).unwrap();
let paths: Vec<&str> = files.iter().map(|f| f.path.as_str()).collect();
assert_eq!(paths, vec!["tracked.rs"]);
}
// Files over `MAX_FILE_BYTES` and files that are not valid UTF-8 are skipped.
#[test]
fn tracked_files_skips_oversize_and_binary() {
let big = "x".repeat((MAX_FILE_BYTES + 1) as usize);
let (_tmp, dir) = init_repo_with(&[
("ok.txt", "small\n"),
("big.txt", big.as_str()),
("bin.dat", "\u{0}"),
]);
// `bin.dat` is staged with a placeholder body, then overwritten on disk with
// invalid UTF-8; `tracked_files` reads the working-tree copy.
std::fs::write(dir.join("bin.dat"), [0xff, 0xfe, 0x00]).unwrap();
let repo = git2::Repository::open(&dir).unwrap();
let files = tracked_files(&repo, &dir).unwrap();
let paths: Vec<&str> = files.iter().map(|f| f.path.as_str()).collect();
assert_eq!(paths, vec!["ok.txt"], "oversize + binary skipped");
}
// The expected value is the object id git assigns to an empty blob.
#[test]
fn blob_hash_matches_git_blob_identity() {
assert_eq!(blob_hash(b""), "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391");
}
// A 100-line file yields several chunks; the second starts one stride after the first.
#[test]
fn chunk_file_windows_with_overlap_and_prefix() {
let body: String = (1..=100).map(|n| format!("line{n}\n")).collect();
let chunks = chunk_file("repo", "src/big.rs", &body);
assert!(chunks.len() > 1, "long file splits into multiple chunks");
assert_eq!(chunks[0].start_line, 1);
assert_eq!(chunks[0].end_line, CHUNK_LINES as u32);
let stride = (CHUNK_LINES - CHUNK_OVERLAP) as u32;
assert_eq!(chunks[1].start_line, stride + 1);
assert!(chunks[0].prefix.contains("repo › src/big.rs › L1-"));
assert!(chunks[0].index_text().starts_with("repo › src/big.rs"));
}
// Whitespace-only input produces no chunks.
#[test]
fn chunk_file_empty_yields_nothing() {
assert!(chunk_file("r", "empty.txt", " \n\n").is_empty());
}
// A file shorter than one window becomes a single chunk spanning all of its lines.
#[test]
fn chunk_file_short_file_is_single_chunk() {
let chunks = chunk_file("r", "a.rs", "fn a() {}\nfn b() {}\n");
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].start_line, 1);
assert_eq!(chunks[0].end_line, 2);
}
}