repograph_core/search/
chunk.rs1use std::path::Path;
10
11use git2::{ObjectType, Repository};
12
/// Largest on-disk size, in bytes (1 MiB), that `tracked_files` will read; bigger files are
/// skipped.
pub const MAX_FILE_BYTES: u64 = 1024 * 1024;

/// Maximum number of source lines placed in a single chunk by `chunk_file`.
pub const CHUNK_LINES: usize = 40;

/// Number of trailing lines of one chunk that are repeated at the start of the next one.
/// `chunk_file` advances its window by `CHUNK_LINES - CHUNK_OVERLAP` lines (at least 1).
pub const CHUNK_OVERLAP: usize = 10;
26
/// A contiguous window of lines taken from one file, plus the metadata needed to locate it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Path of the file the window was cut from, exactly as handed to `chunk_file`.
    pub path: String,
    /// 1-based number of the first line in the window.
    pub start_line: u32,
    /// 1-based number of the last line in the window (inclusive).
    pub end_line: u32,
    /// The window's lines joined with `\n` (no trailing newline).
    pub content: String,
    /// Human-readable location header; `chunk_file` fills it with
    /// `"{repo} › {path} › L{start_line}-{end_line}"`.
    pub prefix: String,
}

impl Chunk {
    /// Returns the text to index for this chunk: the prefix, a newline, then the content.
    #[must_use]
    pub fn index_text(&self) -> String {
        // One allocation sized for both parts plus the separating newline.
        let mut text = String::with_capacity(self.prefix.len() + 1 + self.content.len());
        text.push_str(&self.prefix);
        text.push('\n');
        text.push_str(&self.content);
        text
    }
}
51
/// One UTF-8 text file listed in the repository index, as returned by [`tracked_files`].
#[derive(Debug, Clone)]
pub struct TrackedFile {
    /// Path taken from the index entry, with any `\` replaced by `/`.
    pub path: String,
    /// Identifier computed by `blob_hash` over the bytes read from the working tree.
    pub content_hash: String,
    /// Full contents of the working-tree file, decoded as UTF-8.
    pub text: String,
}
61
62pub fn tracked_files(repo: &Repository, repo_path: &Path) -> Result<Vec<TrackedFile>, git2::Error> {
76 let index = repo.index()?;
77 let mut out = Vec::new();
78 for i in 0..index.len() {
79 let Some(entry) = index.get(i) else {
80 continue;
81 };
82 let Ok(rel) = std::str::from_utf8(&entry.path) else {
83 continue; };
85 let rel = rel.replace('\\', "/");
86 let abs = repo_path.join(&rel);
87 let Ok(meta) = std::fs::metadata(&abs) else {
88 continue; };
90 if !meta.is_file() || meta.len() > MAX_FILE_BYTES {
91 continue;
92 }
93 let Ok(bytes) = std::fs::read(&abs) else {
94 continue;
95 };
96 let Ok(text) = String::from_utf8(bytes) else {
97 continue; };
99 let content_hash = blob_hash(text.as_bytes());
100 out.push(TrackedFile {
101 path: rel,
102 content_hash,
103 text,
104 });
105 }
106 Ok(out)
107}
108
109fn blob_hash(bytes: &[u8]) -> String {
114 git2::Oid::hash_object(ObjectType::Blob, bytes)
115 .map_or_else(|_| format!("len:{}", bytes.len()), |oid| oid.to_string())
116}
117
118#[must_use]
121pub fn chunk_file(repo: &str, path: &str, text: &str) -> Vec<Chunk> {
122 let lines: Vec<&str> = text.lines().collect();
123 if lines.iter().all(|l| l.trim().is_empty()) {
124 return Vec::new();
125 }
126 let stride = CHUNK_LINES.saturating_sub(CHUNK_OVERLAP).max(1);
127 let mut chunks = Vec::new();
128 let mut start = 0usize;
129 while start < lines.len() {
130 let end = (start + CHUNK_LINES).min(lines.len());
131 let content = lines[start..end].join("\n");
132 if !content.trim().is_empty() {
133 let start_line = u32::try_from(start + 1).unwrap_or(u32::MAX);
134 let end_line = u32::try_from(end).unwrap_or(u32::MAX);
135 let prefix = format!("{repo} › {path} › L{start_line}-{end_line}");
136 chunks.push(Chunk {
137 path: path.to_string(),
138 start_line,
139 end_line,
140 content,
141 prefix,
142 });
143 }
144 if end == lines.len() {
145 break;
146 }
147 start += stride;
148 }
149 chunks
150}
151
#[cfg(test)]
mod tests {
    // Test-only relaxations: unwraps and narrowing casts are acceptable in fixtures.
    #![allow(
        clippy::unwrap_used,
        clippy::cast_possible_truncation,
        clippy::format_collect
    )]
    use super::*;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// Builds a throwaway repository containing `files` (relative path, contents) and stages
    /// all of them. The returned `TempDir` must be kept alive for as long as the repository
    /// directory (second element) is used.
    fn init_repo_with(files: &[(&str, &str)]) -> (TempDir, PathBuf) {
        let guard = TempDir::new().unwrap();
        let root = guard.path().join("r");
        std::fs::create_dir_all(&root).unwrap();
        let repo = git2::Repository::init(&root).unwrap();
        for &(rel, body) in files {
            let target = root.join(rel);
            if let Some(parent) = target.parent() {
                std::fs::create_dir_all(parent).unwrap();
            }
            std::fs::write(&target, body).unwrap();
        }
        let mut staged = repo.index().unwrap();
        staged
            .add_all(["*"], git2::IndexAddOption::DEFAULT, None)
            .unwrap();
        staged.write().unwrap();
        (guard, root)
    }

    /// Opens the repository at `root` and runs `tracked_files` over it.
    fn scan(root: &std::path::Path) -> Vec<TrackedFile> {
        let repo = git2::Repository::open(root).unwrap();
        tracked_files(&repo, root).unwrap()
    }

    /// Projects the `path` of every file, preserving order.
    fn paths_of(files: &[TrackedFile]) -> Vec<&str> {
        files.iter().map(|file| file.path.as_str()).collect()
    }

    /// Projects the `(start_line, end_line)` pair of every chunk.
    fn spans_of(chunks: &[Chunk]) -> Vec<(u32, u32)> {
        chunks
            .iter()
            .map(|chunk| (chunk.start_line, chunk.end_line))
            .collect()
    }

    #[test]
    fn tracked_files_returns_added_text_files() {
        let fixture = [("src/a.rs", "fn a() {}\n"), ("README.md", "# hi\n")];
        let (_guard, root) = init_repo_with(&fixture);
        let files = scan(&root);
        let mut paths = paths_of(&files);
        paths.sort_unstable();
        assert_eq!(paths, vec!["README.md", "src/a.rs"]);
        for file in &files {
            assert!(!file.content_hash.is_empty());
        }
    }

    #[test]
    fn tracked_files_excludes_untracked() {
        let (_guard, root) = init_repo_with(&[("tracked.rs", "fn t() {}\n")]);
        // Written after staging, so it never reaches the index.
        std::fs::write(root.join("untracked.rs"), "fn u() {}\n").unwrap();
        let files = scan(&root);
        assert_eq!(paths_of(&files), vec!["tracked.rs"]);
    }

    #[test]
    fn tracked_files_skips_oversize_and_binary() {
        let oversize = "x".repeat((MAX_FILE_BYTES + 1) as usize);
        let fixture = [
            ("ok.txt", "small\n"),
            ("big.txt", oversize.as_str()),
            ("bin.dat", "\u{0}"),
        ];
        let (_guard, root) = init_repo_with(&fixture);
        // The fixture helper only takes `&str`, so swap in invalid UTF-8 after staging.
        std::fs::write(root.join("bin.dat"), [0xff, 0xfe, 0x00]).unwrap();
        let files = scan(&root);
        assert_eq!(paths_of(&files), vec!["ok.txt"], "oversize + binary skipped");
    }

    #[test]
    fn blob_hash_matches_git_blob_identity() {
        // Well-known object id of the empty blob.
        let empty_blob = "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391";
        assert_eq!(blob_hash(b""), empty_blob);
    }

    #[test]
    fn chunk_file_windows_with_overlap_and_prefix() {
        let body: String = (1..=100).map(|n| format!("line{n}\n")).collect();
        let chunks = chunk_file("repo", "src/big.rs", &body);
        assert!(chunks.len() > 1, "long file splits into multiple chunks");

        let first = &chunks[0];
        let second = &chunks[1];
        assert_eq!(first.start_line, 1);
        assert_eq!(first.end_line, CHUNK_LINES as u32);
        let stride = (CHUNK_LINES - CHUNK_OVERLAP) as u32;
        assert_eq!(second.start_line, stride + 1);
        assert!(first.prefix.contains("repo › src/big.rs › L1-"));
        assert!(first.index_text().starts_with("repo › src/big.rs"));
    }

    #[test]
    fn chunk_file_empty_yields_nothing() {
        let chunks = chunk_file("r", "empty.txt", " \n\n");
        assert!(chunks.is_empty());
    }

    #[test]
    fn chunk_file_short_file_is_single_chunk() {
        let chunks = chunk_file("r", "a.rs", "fn a() {}\nfn b() {}\n");
        assert_eq!(chunks.len(), 1);
        assert_eq!(spans_of(&chunks), vec![(1, 2)]);
    }
}